diff --git a/CMakeLists.txt b/CMakeLists.txt index 75fcf9659b..2f4fd3323d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,11 +216,14 @@ OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_POPCNT "Enable POPCNT instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_AVX2 "Enable AVX2 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_FMA3 "Enable FMA3 instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) 
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 2f9068c60d..13559b5c8a 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -128,10 +128,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE2) add_extra_compiler_option(-msse2) endif() - if (ENABLE_NEON) + if(ENABLE_NEON) add_extra_compiler_option("-mfpu=neon") endif() - if (ENABLE_VFPV3 AND NOT ENABLE_NEON) + if(ENABLE_VFPV3 AND NOT ENABLE_NEON) add_extra_compiler_option("-mfpu=vfpv3") endif() @@ -140,6 +140,13 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_AVX) add_extra_compiler_option(-mavx) endif() + if(ENABLE_AVX2) + add_extra_compiler_option(-mavx2) + + if(ENABLE_FMA3) + add_extra_compiler_option(-mfma) + endif() + endif() # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed. 
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx") @@ -158,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE42) add_extra_compiler_option(-msse4.2) endif() + + if(ENABLE_POPCNT) + add_extra_compiler_option(-mpopcnt) + endif() endif() endif(NOT MINGW) @@ -214,7 +225,10 @@ if(MSVC) set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi") endif() - if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600) + if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800) + set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2") + endif() + if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:") set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX") endif() @@ -236,7 +250,7 @@ if(MSVC) endif() endif() - if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX) + if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE41 OR ENABLE_AVX OR ENABLE_AVX2) set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi") endif() @@ -308,6 +322,7 @@ if(MSVC) endforeach() if(NOT ENABLE_NOISY_WARNINGS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align()) endif() endif() diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index e43fbbc951..f2acaa3fb4 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -813,4 +814,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val) } // cv +#include "sse_utils.hpp" + #endif //__OPENCV_CORE_BASE_HPP__ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 06894d7a5d..3fdaa6954d 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -104,17 +105,32 @@ #endif /* CPU features and intrinsics support */ -#define CV_CPU_NONE 0 -#define CV_CPU_MMX 1 -#define CV_CPU_SSE 2 -#define CV_CPU_SSE2 3 -#define CV_CPU_SSE3 4 -#define CV_CPU_SSSE3 5 -#define CV_CPU_SSE4_1 6 -#define CV_CPU_SSE4_2 7 -#define CV_CPU_POPCNT 8 -#define CV_CPU_AVX 10 -#define CV_CPU_NEON 11 +#define CV_CPU_NONE 0 +#define CV_CPU_MMX 1 +#define CV_CPU_SSE 2 +#define CV_CPU_SSE2 3 +#define CV_CPU_SSE3 4 +#define CV_CPU_SSSE3 5 +#define CV_CPU_SSE4_1 6 +#define CV_CPU_SSE4_2 7 +#define CV_CPU_POPCNT 8 + +#define CV_CPU_AVX 10 +#define CV_CPU_AVX2 11 +#define CV_CPU_FMA3 12 + +#define CV_CPU_AVX_512F 13 +#define CV_CPU_AVX_512BW 14 +#define CV_CPU_AVX_512CD 15 +#define CV_CPU_AVX_512DQ 16 +#define CV_CPU_AVX_512ER 17 +#define CV_CPU_AVX_512IFMA512 18 +#define CV_CPU_AVX_512PF 19 +#define CV_CPU_AVX_512VBMI 20 +#define CV_CPU_AVX_512VL 21 + +#define CV_CPU_NEON 100 + // when adding to this list remember to update the enum in core/utility.cpp #define CV_HARDWARE_MAX_FEATURE 255 @@ -123,6 +139,7 @@ #if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && 
_M_IX86_FP >= 2) # include <emmintrin.h> +# define CV_MMX 1 # define CV_SSE 1 # define CV_SSE2 1 # if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500) @@ -141,7 +158,15 @@ # include <nmmintrin.h> # define CV_SSE4_2 1 # endif -# if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219) +# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) +# ifdef _MSC_VER +# include <nmmintrin.h> +# else +# include <popcntintrin.h> +# endif +# define CV_POPCNT 1 +# endif +# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0) // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # include <immintrin.h> @@ -152,6 +177,13 @@ # define __xgetbv() 0 # endif # endif +# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0) +# include <immintrin.h> +# define CV_AVX2 1 +# if defined __FMA__ +# define CV_FMA3 1 +# endif +# endif #endif #if (defined WIN32 || defined _WIN32) && defined(_M_ARM) @@ -166,6 +198,12 @@ #endif // __CUDACC__ +#ifndef CV_POPCNT +#define CV_POPCNT 0 +#endif +#ifndef CV_MMX +# define CV_MMX 0 +#endif #ifndef CV_SSE # define CV_SSE 0 #endif @@ -187,6 +225,40 @@ #ifndef CV_AVX # define CV_AVX 0 #endif +#ifndef CV_AVX2 +# define CV_AVX2 0 +#endif +#ifndef CV_FMA3 +# define CV_FMA3 0 +#endif +#ifndef CV_AVX_512F +# define CV_AVX_512F 0 +#endif +#ifndef CV_AVX_512BW +# define CV_AVX_512BW 0 +#endif +#ifndef CV_AVX_512CD +# define CV_AVX_512CD 0 +#endif +#ifndef CV_AVX_512DQ +# define CV_AVX_512DQ 0 +#endif +#ifndef CV_AVX_512ER +# define CV_AVX_512ER 0 +#endif +#ifndef CV_AVX_512IFMA512 +# define CV_AVX_512IFMA512 0 +#endif +#ifndef CV_AVX_512PF +# define CV_AVX_512PF 0 +#endif +#ifndef CV_AVX_512VBMI +# define CV_AVX_512VBMI 0 +#endif +#ifndef CV_AVX_512VL +# define CV_AVX_512VL 0 +#endif + #ifndef CV_NEON # define CV_NEON 0 #endif diff --git a/modules/core/include/opencv2/core/sse_utils.hpp 
b/modules/core/include/opencv2/core/sse_utils.hpp new file mode 100644 index 0000000000..e0283eb3f3 --- /dev/null +++ b/modules/core/include/opencv2/core/sse_utils.hpp @@ -0,0 +1,645 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_CORE_SSE_UTILS_HPP__ +#define __OPENCV_CORE_SSE_UTILS_HPP__ + +#ifndef __cplusplus +# error sse_utils.hpp header must be compiled as C++ +#endif + +#if CV_SSE2 + +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3); + + v_r0 = 
_mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3); +} + +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0); + __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1); + __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); + __m128i 
layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); + + v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); + v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); + v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); +} + +inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0); + __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0); + __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1); + __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1); + __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0); + __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0); + __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1); + __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7); + __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7); + + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk4 = 
_mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7); + __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7); + + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4); + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4); + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5); + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5); + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6); + __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6); + __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7); + __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7); + + v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4); + v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4); + v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5); + v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5); + v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6); + v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6); + v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7); + v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7); +} + +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, 
v_mask)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); +} + +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk3 = 
_mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i 
layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); + + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); +} + +inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i v_mask = _mm_set1_epi16(0x00ff); + + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); + __m128i layer4_chunk3 = 
_mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); + __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8)); + + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); + __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask)); + __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8)); + + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); + __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), 
_mm_srli_epi16(layer3_chunk7, 8)); + + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); + __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8)); + + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); + v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); + v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); + v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8)); +} + +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0); + __m128i 
layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3); + + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3); +} + +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1); + __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0); + __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1); + __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4); + __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5); + __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5); + + __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3); + __m128i 
layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4); + __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5); + __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4); + v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5); + v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5); +} + +inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0); + __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0); + __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1); + __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1); + __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0); + __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0); + __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1); + __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1); + + __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4); + __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5); + __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6); + __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7); + __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7); + + __m128i layer3_chunk0 
= _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4); + __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5); + __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6); + __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7); + __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7); + + v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4); + v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4); + v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5); + v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5); + v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6); + v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6); + v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7); + v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7); +} + +#if CV_SSE4_1 + +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i 
layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); +} + +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, + __m128i & v_g1, __m128i & v_b0, __m128i & v_b1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + 
__m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); + v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); +} + +inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1, + __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1) +{ + __m128i v_mask = _mm_set1_epi32(0x0000ffff); + + 
__m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); + __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16)); + __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); + __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16)); + __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); + __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16)); + __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask)); + __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16)); + + __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); + __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16)); + __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); + __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16)); + __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); + __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16)); + __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask)); + __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16)); + + __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); + __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16)); + __m128i layer1_chunk1 = 
_mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); + __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16)); + __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); + __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16)); + __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask)); + __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16)); + + v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); + v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16)); + v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); + v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16)); + v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); + v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16)); + v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask)); + v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16)); +} + +#endif // CV_SSE4_1 + +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2); + __m128 layer2_chunk2 = 
_mm_unpacklo_ps(layer1_chunk1, layer1_chunk3); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2); + v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3); +} + +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0); + __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1); + __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3); + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4); + __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5); + __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3); + v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4); + v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5); + v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5); +} + +inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +{ + __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0); + __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0); + __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1); + __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1); + __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, 
v_a0); + __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0); + __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1); + __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1); + + __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4); + __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4); + __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5); + __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5); + __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6); + __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6); + __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7); + __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7); + + v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4); + v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4); + v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5); + v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5); + v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6); + v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6); + v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7); + v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7); +} + +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, 
layer1_chunk1, mask_lo); + v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); +} + +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, + __m128 & v_g1, __m128 & v_b0, __m128 & v_b1) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); + __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); + __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); + v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); + v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); + v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); +} + +inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1, + __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo); + __m128 
layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi); + __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo); + __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi); + __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo); + __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi); + __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo); + __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi); + + __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo); + __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi); + __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo); + __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi); + __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo); + __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi); + __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo); + __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi); + + v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo); + v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi); + v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo); + v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi); + v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo); + v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi); + v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo); + v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi); +} + +#endif // CV_SSE2 + +#endif //__OPENCV_CORE_SSE_UTILS_HPP__ diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 88989ef5cb..f89560a809 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -281,16 +282,30 @@ CV_EXPORTS_W int64 getCPUTickCount(); remember to keep this list identical to the one in cvdef.h */ enum CpuFeatures { - CPU_MMX = 1, - CPU_SSE = 2, - CPU_SSE2 = 3, - CPU_SSE3 = 4, - CPU_SSSE3 = 5, - CPU_SSE4_1 = 6, - CPU_SSE4_2 = 7, - CPU_POPCNT = 8, - CPU_AVX = 10, - CPU_NEON = 11 + CPU_MMX = 1, + CPU_SSE = 2, + CPU_SSE2 = 3, + CPU_SSE3 = 4, + CPU_SSSE3 = 5, + CPU_SSE4_1 = 6, + CPU_SSE4_2 = 7, + CPU_POPCNT = 8, + + CPU_AVX = 10, + CPU_AVX2 = 11, + CPU_FMA3 = 12, + + CPU_AVX_512F = 13, + CPU_AVX_512BW = 14, + CPU_AVX_512CD = 15, + CPU_AVX_512DQ = 16, + CPU_AVX_512ER = 17, + CPU_AVX_512IFMA512 = 18, + CPU_AVX_512PF = 19, + CPU_AVX_512VBMI = 20, + CPU_AVX_512VL = 21, + + CPU_NEON = 100 }; /** @brief Returns true if the specified feature is supported by the host hardware. 
diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp index 3598c8639f..c6c2a1b29f 100644 --- a/modules/core/perf/perf_arithm.cpp +++ b/modules/core/perf/perf_arithm.cpp @@ -242,3 +242,31 @@ PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM) SANITY_CHECK(c, 1e-8); } + +PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM) +{ + Size sz = get<0>(GetParam()); + int type = get<1>(GetParam()); + cv::Mat a(sz, type), b(sz, type), c(sz, type); + double scale = 0.5; + + declare.in(a, b, WARMUP_RNG).out(c); + + TEST_CYCLE() divide(a, b, c, scale); + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(Size_MatType, reciprocal, TYPICAL_MATS_CORE_ARITHM) +{ + Size sz = get<0>(GetParam()); + int type = get<1>(GetParam()); + cv::Mat b(sz, type), c(sz, type); + double scale = 0.5; + + declare.in(b, WARMUP_RNG).out(c); + + TEST_CYCLE() divide(scale, b, c); + + SANITY_CHECK_NOTHING(); +} diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index f881c785b3..c4de2c4bed 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -63,6 +64,10 @@ FUNCTOR_TEMPLATE(VLoadStore128); #if CV_SSE2 FUNCTOR_TEMPLATE(VLoadStore64); FUNCTOR_TEMPLATE(VLoadStore128Aligned); +#if CV_AVX2 +FUNCTOR_TEMPLATE(VLoadStore256); +FUNCTOR_TEMPLATE(VLoadStore256Aligned); +#endif #endif #endif @@ -75,17 +80,28 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; #if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = vop(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else #if CV_SSE2 if( USE_SSE2 ) { -#endif +#endif // CV_SSE2 for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) { typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); @@ -97,9 +113,13 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si } #if CV_SSE2 } -#endif -#endif -#if CV_SSE2 +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_AVX2 + // nothing +#elif CV_SSE2 if( USE_SSE2 ) { for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) @@ -110,6 +130,7 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si } } #endif + #if CV_ENABLE_UNROLLED for( ; x <= sz.width - 4; x += 4 ) { @@ -136,13 +157,26 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T 
*)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; -#if CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= sz.width - 8; x += 8 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 if( USE_SSE2 ) { if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) @@ -158,12 +192,24 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, } } } -#endif +#endif // CV_AVX2 + #if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= sz.width - 8; x += 8 ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = op32(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else #if CV_SSE2 if( USE_SSE2 ) { -#endif +#endif // CV_SSE2 for( ; x <= sz.width - 8; x += 8 ) { typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); @@ -175,8 +221,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, } #if CV_SSE2 } -#endif -#endif +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + #if CV_ENABLE_UNROLLED for( ; x <= sz.width - 4; x += 4 ) { @@ -204,13 +252,26 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, #endif Op op; - for( ; sz.height--; src1 += step1/sizeof(src1[0]), - src2 += step2/sizeof(src2[0]), - dst += step/sizeof(dst[0]) ) + for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) { int x = 0; -#if CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= sz.width - 4; x += 4 ) + { + typename VLoadStore256Aligned::reg_type r0 = 
VLoadStore256Aligned::load(src1 + x); + r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 if( USE_SSE2 ) { if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) @@ -243,7 +304,141 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, } } -#if CV_SSE2 +#if CV_AVX2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); 
+FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); // float/double lanes: unaligned (loadu/storeu) 256-bit access
+FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); // aligned forms: vBinOp32/vBinOp64 test (src1|src2|dst)&31 == 0 before using them
+FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
+FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );
+
+FUNCTOR_TEMPLATE(VAdd); // 8/16-bit lanes use the saturating adds_* forms; int, float and double use plain add_*
+FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b));
+
+FUNCTOR_TEMPLATE(VSub); // same split as VAdd: saturating subs_* for 8/16-bit lanes, plain sub_* otherwise
+FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b));
+
+FUNCTOR_TEMPLATE(VMin); // the intrinsic's signedness must match the element type (epu* for uchar/ushort, epi* for signed)
+FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); // unsigned compare: the signed epi16 form orders values >= 0x8000 as negative and returns the wrong minimum
+FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b));
+FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b));
+
+FUNCTOR_TEMPLATE(VMax); // mirrors VMin: unsigned intrinsics for uchar/ushort, signed for schar/short/int
+FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b));
+FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b));
+
+
+static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, // one word per float lane: AND-ing with it clears the sign bit
+    0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, // two words per double lane: low word kept, sign bit cleared in the high word
+    0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
+
+FUNCTOR_TEMPLATE(VAbsDiff); // per-lane |a - b|; each element type needs its own formulation
+FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
+        return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); // one of the two saturating differences is always 0
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
+        __m256i d = _mm256_subs_epi8(a, b);
+        __m256i m = _mm256_cmpgt_epi8(b, a); // all-ones in lanes where b > a, i.e. where d is negative
+        return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); // (d ^ m) - m negates d in the masked lanes, with signed saturation
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
+        return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); // same trick as uchar on 16-bit lanes
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
+        __m256i M = _mm256_max_epi16(a, b);
+        __m256i m = _mm256_min_epi16(a, b);
+        return _mm256_subs_epi16(M, m); // max - min is non-negative; subs saturates it to the short range
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
+        __m256i d = _mm256_sub_epi32(a, b);
+        __m256i m = _mm256_cmpgt_epi32(b, a); // all-ones in lanes where b > a
+        return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); // conditional negate as for schar, but wrapping instead of saturating
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
+        return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); // clear the sign bit of a - b
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
+        return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); // clear the sign bit of a - b
+    );
+
+FUNCTOR_TEMPLATE(VAnd); // bitwise ops are specialized for uchar only
+FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
+FUNCTOR_TEMPLATE(VOr);
+FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
+FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); + +#elif CV_SSE2 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ template <> \ @@ -2574,6 +2769,263 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, } } +template +struct Div_SIMD +{ + int operator() (const T *, const T *, T *, int, double) const + { + return 0; + } +}; + +#if CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero); + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + + __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = 
_mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8); + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + + __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); + v_src2i = 
_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero); + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_unpackhi_epi16(v_src1, v_zero); + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src1d = _mm_cvtepi32_pd(v_src1i); + 
v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16); + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src1d = _mm_cvtepi32_pd(v_src1i); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16); + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src1d = _mm_cvtepi32_pd(v_src1i); + v_src2d = _mm_cvtepi32_pd(v_src2i); + 
v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128d v_src1d = _mm_cvtepi32_pd(v_src1); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + + v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d)); + + __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + } + + return x; + } +}; + +#endif + template static void div_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, double scale ) @@ -2582,9 +3034,11 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + 
Div_SIMD vop; + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i = 0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for( ; i <= size.width - 4; i += 4 ) { @@ -2621,6 +3075,232 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, } } +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; + +#if CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + + int operator() (const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero); + + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + 
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8); + + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero); + _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero))); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + + int operator() (const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero); + __m128d 
v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_unpackhi_epi16(v_src2, v_zero); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16); + __m128d v_src2d = _mm_cvtepi32_pd(v_src2i); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16); + v_src2d = _mm_cvtepi32_pd(v_src2i); + v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8)); + v_dst1 = 
_mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + + __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1))); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); } + + int operator() (const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + __m128d v_scale = _mm_set1_pd(scale); + __m128i v_zero = _mm_setzero_si128(); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128d v_src2d = _mm_cvtepi32_pd(v_src2); + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + + v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d)); + + __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1))); + __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero); + _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst)); + } + + return x; + } +}; + +#endif + template static void recip_( const T*, size_t, const T* src2, size_t step2, T* dst, size_t step, Size size, double scale ) @@ -2628,9 +3308,11 @@ recip_( const T*, size_t, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + Recip_SIMD vop; + for( ; size.height--; src2 += step2, dst += step ) { - int i = 0; + int i = vop(src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for( ; i <= size.width - 4; i += 4 ) { @@ -3564,6 +4246,130 @@ struct Cmp_SIMD uint8x8_t v_mask; }; +#elif CV_SSE2 + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + haveSSE = 
checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi8(-1); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi32(0xffffffff); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + 
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + #endif template static void @@ -3676,7 +4482,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste { int x =0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_GT ? 
_mm_setzero_si128() : _mm_set1_epi8 (-1); __m128i c128 = _mm_set1_epi8 (-128); for( ; x <= size.width - 16; x += 16 ) @@ -3692,7 +4499,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -3714,7 +4521,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -3724,7 +4532,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste _mm_storeu_si128((__m128i*)(dst + x), r00); } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -3804,7 +4612,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x =0; #if CV_SSE2 - if( USE_SSE2){// + if( USE_SSE2) + { __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -3828,7 +4637,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -3843,8 +4652,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - - #endif + #endif for( ; x < size.width; x++ ){ dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); @@ -3858,7 +4666,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st { int x = 0; #if CV_SSE2 - if( USE_SSE2 ){ + if( USE_SSE2 ) + { __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); for( ; x <= size.width - 16; x += 16 ) { @@ -3882,7 +4691,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st x += 8; } } - #elif CV_NEON + #elif CV_NEON uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); for( ; x <= size.width - 16; x += 16 ) @@ -3897,8 +4706,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); } - #endif - for( ; x < size.width; x++ ) + #endif + for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 829b984c9f..090acf5508 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -62,8 +63,11 @@ template struct VSplit4; #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, \ + data_type* dst1) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -72,9 +76,11 @@ template struct VSplit4; #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ + struct name \ + { \ void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2){ \ + data_type* dst2) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -84,9 +90,11 @@ template struct VSplit4; #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ template<> \ - struct name{ \ + struct name \ + { \ void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3){ \ + data_type* dst2, data_type* dst3) const \ + { \ reg_type r = load_func(src); \ store_func(dst0, r.val[0]); \ store_func(dst1, r.val[1]); \ @@ -96,28 +104,174 @@ template struct VSplit4; } SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 ); SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16); SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32); SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, 
vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 ); SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16); SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32); SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 ); SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16); SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32); SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); + +#elif CV_SSE2 + +template +struct VSplit2 +{ + VSplit2() : support(false) { } + void operator()(const T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit3 +{ + VSplit3() : support(false) { } + void operator()(const T *, T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit4 +{ + VSplit4() : support(false) { } + void operator()(const T *, T *, T *, T *, T *) const { } + + bool support; +}; + +#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit2() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 
= _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + } \ + \ + bool support; \ +} + +#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit3() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1, data_type * dst2) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + } \ + \ + bool support; \ +} + +#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit4 \ +{ \ 
+ enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit4() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ + data_type * dst2, data_type * dst3) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ + reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ + } \ + \ + bool support; \ +} + +SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT3_KERNEL_TEMPLATE( 
int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + #endif template static void @@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i); } +#elif CV_SSE2 + if (cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); } +#elif CV_SSE2 + if (cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn ) for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); } +#elif CV_SSE2 + if (cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } + } #endif for( ; i < len; i++, j += cn ) { @@ -265,28 +459,177 @@ template struct VMerge4; } MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 ); MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, 
vld1q_s16, vst2q_s16); MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32); MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 ); MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16); MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32); MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 ); MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16); MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32); MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); + +#elif CV_SSE2 + +template +struct VMerge2 +{ + VMerge2() : support(false) { } + void operator()(const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge3 +{ + VMerge3() : support(false) { } + void operator()(const T *, const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge4 +{ + VMerge4() : support(false) { } + void operator()(const T *, const T *, const T *, const T *, T *) const { } + + bool support; +}; + +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + 
\ + VMerge2() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + } \ + \ + bool support; \ +} + +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge3() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type 
*)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + } \ + \ + bool support; \ +} + +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge4() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + const data_type * src2, const data_type * src3, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ + reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ + } \ + \ + bool support; \ +} + +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, 
_mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); + +#if CV_SSE4_1 +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +#endif + +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); + #endif template static void @@ -314,6 +657,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, dst + j); } +#elif CV_SSE2 + if(cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { @@ -335,6 +689,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, src2 + i, dst + j); } +#elif CV_SSE2 + if(cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { @@ -357,6 +722,17 @@ merge_( const T** src, T* dst, int len, int cn ) for( ; i < len - inc_i; i += inc_i, j += inc_j) vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); } +#elif CV_SSE2 + if(cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + if (vmerge.support) + for( ; i < len - 
inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } #endif for( ; i < len; i++, j += cn ) { @@ -1123,6 +1499,48 @@ struct cvtScaleAbs_SIMD } }; +template <> +struct cvtScaleAbs_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, + float scale, float shift) const + { + int x = 0; + + if (USE_SSE2) + { + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), + v_zero_f = _mm_setzero_ps(); + __m128i v_zero_i = _mm_setzero_si128(); + + for ( ; x <= width - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x)); + __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8), + v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8); + __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); + v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift); + v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); + v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3); + __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps( + _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift); + v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4); + + __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), + _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4))); + _mm_storeu_si128((__m128i *)(dst + x), v_dst_i); + } + } + + return x; + } +}; + template <> struct cvtScaleAbs_SIMD { @@ -1242,6 +1660,44 @@ struct cvtScaleAbs_SIMD } }; +template <> +struct cvtScaleAbs_SIMD +{ + int operator () (const double * src, uchar * dst, 
int width, + float scale, float shift) const + { + int x = 0; + + if (USE_SSE2) + { + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift), + v_zero_f = _mm_setzero_ps(); + __m128i v_zero_i = _mm_setzero_si128(); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + + __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift); + v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1); + + __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift); + v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2); + + __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), + _mm_cvtps_epi32(v_dst2)); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i)); + } + } + + return x; + } +}; + #elif CV_NEON template <> @@ -1489,7 +1945,1582 @@ struct cvtScale_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +// from uchar + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct 
cvtScale_SIMD +{ + int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + { + 
int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src 
+ x)), v_zero); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero); + + __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + + v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); + } + + return x; + } +}; + +// from schar + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, 
v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))); + v_src = _mm_srai_epi16(v_src, 8); + + __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + 
_mm_storeu_pd(dst + x + 2, v_dst_1); + + v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); + } + + return x; + } +}; + +// from ushort + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = 
_mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct 
cvtScale_SIMD +{ + int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + + 
__m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + + v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); + } + + return x; + } +}; + +// from short + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 
v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const short * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 
v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift); + + _mm_storeu_ps(dst + 
x, v_dst_0); + _mm_storeu_ps(dst + x + 4, v_dst_1); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + + __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + + v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16); + v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift); + v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift); + _mm_storeu_pd(dst + x + 4, v_dst_0); + _mm_storeu_pd(dst + x + 6, v_dst_1); + } + + return x; + } +}; + +// from int + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + 
+ return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const int * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + 
{ + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + v_src = _mm_loadu_si128((__m128i const *)(src + x + 4)); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, int * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1))); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, float * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0), + _mm_cvtpd_ps(v_dst_1))); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int 
* src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x)); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + v_src = _mm_srli_si128(v_src, 8); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift); + + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + } + + return x; + } +}; + +// from float + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + 
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_loadu_ps(src + 
x + 4); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0)); + _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + _mm_storeu_ps(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src = _mm_loadu_ps(src + x); + __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift); + + _mm_storeu_pd(dst + x, v_dst_0); + _mm_storeu_pd(dst + x + 2, v_dst_1); + } + + return x; + } +}; + +// from double + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + 
_mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct cvtScale_SIMD +{ + cvtScale_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator () (const double * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0), + 
_mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift); + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2))); + __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)), + _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6))); + __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0), + _mm_cvtps_epi32(v_dst_1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, int * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128d v_src = _mm_loadu_pd(src + x); + __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + v_src = _mm_loadu_pd(src + x + 2); + __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1))); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, float * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 4; x += 4) + { + __m128d v_src = 
_mm_loadu_pd(src + x); + __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + v_src = _mm_loadu_pd(src + x + 2); + __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + + __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0), + _mm_cvtpd_ps(v_dst1)); + + _mm_storeu_ps(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const double * src, double * dst, int width, double scale, double shift) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift); + + for ( ; x <= width - 2; x += 2) + { + __m128d v_src = _mm_loadu_pd(src + x); + __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift); + _mm_storeu_pd(dst + x, v_dst); + } + + return x; + } +}; + +#elif CV_NEON // from uchar @@ -2294,26 +4325,44 @@ cvtScale_( const short* src, size_t sstep, { int x = 0; - #if CV_SSE2 - if(USE_SSE2)//~5X - { - __m128 scale128 = _mm_set1_ps (scale); - __m128 shift128 = _mm_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); - __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); - __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); - __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); - rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); - rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); - r0 = _mm_cvtps_epi32(rf0); - r1 = _mm_cvtps_epi32(rf1); + #if CV_AVX2 + if (USE_AVX2) + { + __m256 scale256 = _mm256_set1_ps(scale); + __m256 shift256 = _mm256_set1_ps(shift); + const int shuffle = 0xD8; - _mm_storeu_si128((__m128i*)(dst + x), r0); - _mm_storeu_si128((__m128i*)(dst + x + 4), r1); - } + for ( ; x <= size.width - 16; x += 16) + { + __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x)); + v_src = _mm256_permute4x64_epi64(v_src, shuffle); + __m256i v_src_lo = 
_mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16); + __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16); + __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256); + __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256); + _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0)); + _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1)); } + } + #endif + #if CV_SSE2 + if (USE_SSE2)//~5X + { + __m128 scale128 = _mm_set1_ps (scale); + __m128 shift128 = _mm_set1_ps (shift); + for(; x <= size.width - 8; x += 8 ) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x)); + + __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); + __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16)); + rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); + rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); + + _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0)); + _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1)); + } + } #elif CV_NEON float32x4_t v_shift = vdupq_n_f32(shift); for(; x <= size.width - 8; x += 8 ) @@ -2330,24 +4379,6 @@ cvtScale_( const short* src, size_t sstep, } #endif - //We will wait Haswell - /* - #if CV_AVX - if(USE_AVX)//2X - bad variant - { - ////TODO:AVX implementation (optimization?) 
required - __m256 scale256 = _mm256_set1_ps (scale); - __m256 shift256 = _mm256_set1_ps (shift); - for(; x <= size.width - 8; x += 8 ) - { - __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x))); - __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256); - __m256i res = _mm256_cvtps_epi32(r0); - _mm256_storeu_si256 ((__m256i*)(dst+x), res); - } - } - #endif*/ - for(; x < size.width; x++ ) dst[x] = saturate_cast(src[x]*scale + shift); } @@ -2362,7 +4393,180 @@ struct Cvt_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +// from double + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, uchar * dst, int width) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst)); + } + + return x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, schar * dst, int width) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + 
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +template <> +struct Cvt_SIMD +{ + bool haveSIMD; + Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } + + int operator() (const double * src, ushort * dst, int width) const + { + int x = 0; + + if (!haveSIMD) + return x; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +#endif // CV_SSE4_1 + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, short * dst, int width) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + for ( ; x <= width - 8; x += 8) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)); + __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)); + + v_src0 = _mm_movelh_ps(v_src0, v_src1); + v_src1 = _mm_movelh_ps(v_src2, v_src3); + + __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + _mm_storeu_si128((__m128i *)(dst + x), v_dst); + } + + return x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, int * dst, int width) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + for ( ; x <= width - 4; x += 4) + { + // round in double precision: a detour through float loses low bits for |values| > 2^24 + __m128i v_dst0 = _mm_cvtpd_epi32(_mm_loadu_pd(src + x)); + __m128i v_dst1 = _mm_cvtpd_epi32(_mm_loadu_pd(src + x + 2)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_unpacklo_epi64(v_dst0, v_dst1)); + } + + return 
x; + } +}; + +template <> +struct Cvt_SIMD +{ + int operator() (const double * src, float * dst, int width) const + { + int x = 0; + + if (!USE_SSE2) + return x; + + for ( ; x <= width - 4; x += 4) + { + __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x)); + __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)); + + _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1)); + } + + return x; + } +}; + + +#elif CV_NEON // from uchar @@ -2931,8 +5135,9 @@ cvt_( const float* src, size_t sstep, { int x = 0; #if CV_SSE2 - if(USE_SSE2){ - for( ; x <= size.width - 8; x += 8 ) + if(USE_SSE2) + { + for( ; x <= size.width - 8; x += 8 ) { __m128 src128 = _mm_loadu_ps (src + x); __m128i src_int128 = _mm_cvtps_epi32 (src128); diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 301ea80a1f..fe8ffd7718 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -11,6 +11,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 1c045f3faa..13ada1d1d6 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -593,14 +594,46 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[2]; - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + if (USE_SSE2) + { + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } + } +#endif + + for( ; k < len; k++ ) { buf[0][k] = (float)x[k]; buf[1][k] = (float)y[k]; } FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + if (USE_SSE2) + { + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } + } +#endif + + for( ; k < len; k++ ) angle[k] = buf[0][k]; } ptrs[0] += len*esz1; @@ -698,14 +731,46 @@ void cartToPolar( InputArray src1, InputArray src2, double *angle = (double*)ptrs[3]; Magnitude_64f(x, y, (double*)ptrs[2], len); - for( k = 0; k < len; k++ ) + k = 0; + +#if CV_SSE2 + if (USE_SSE2) + { + for ( ; k <= len - 4; k += 4) + { + __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), + _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); + __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), + _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); + + _mm_storeu_ps(buf[0] + k, v_dst0); + _mm_storeu_ps(buf[1] + k, v_dst1); + } + } +#endif + + for( ; k < len; k++ ) { buf[0][k] = (float)x[k]; buf[1][k] = (float)y[k]; } FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); - for( k = 0; k < len; k++ ) + k 
= 0; + +#if CV_SSE2 + if (USE_SSE2) + { + for ( ; k <= len - 4; k += 4) + { + __m128 v_src = _mm_loadu_ps(buf[0] + k); + _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); + _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); + } + } +#endif + + for( ; k < len; k++ ) angle[k] = buf[0][k]; } ptrs[0] += len*esz1; @@ -771,14 +836,77 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval, /*static const double cos_a2 = 1;*/ double k1; - int i; + int i = 0; if( !angle_in_degrees ) k1 = N/(2*CV_PI); else k1 = N/360.; - for( i = 0; i < len; i++ ) +#if CV_AVX2 + if (USE_AVX2) + { + __m128d v_k1 = _mm_set1_pd(k1); + __m128d v_1 = _mm_set1_pd(1); + __m128i v_N1 = _mm_set1_epi32(N - 1); + __m128i v_N4 = _mm_set1_epi32(N >> 2); + __m128d v_sin_a0 = _mm_set1_pd(sin_a0); + __m128d v_sin_a2 = _mm_set1_pd(sin_a2); + __m128d v_cos_a0 = _mm_set1_pd(cos_a0); + + for ( ; i <= len - 4; i += 4) + { + __m128 v_angle = _mm_loadu_ps(angle + i); + + // 0-1 + __m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1); + __m128i v_it = _mm_cvtpd_epi32(v_t); + v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); + + __m128i v_sin_idx = _mm_and_si128(v_it, v_N1); + __m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); + + __m128d v_t2 = _mm_mul_pd(v_t, v_t); + __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); + __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); + + __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); + + __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), + _mm_mul_pd(v_cos_a, v_sin_b)); + __m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), + _mm_mul_pd(v_sin_a, v_sin_b)); + + // 2-3 + v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1); + v_it = _mm_cvtpd_epi32(v_t); + v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); 
+ + v_sin_idx = _mm_and_si128(v_it, v_N1); + v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); + + v_t2 = _mm_mul_pd(v_t, v_t); + v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); + v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); + + v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); + v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); + + __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), + _mm_mul_pd(v_cos_a, v_sin_b)); + __m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), + _mm_mul_pd(v_sin_a, v_sin_b)); + + _mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0), + _mm_cvtpd_ps(v_sin_val_1))); + _mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0), + _mm_cvtpd_ps(v_cos_val_1))); + } + } +#endif + + for( ; i < len; i++ ) { double t = angle[i]*k1; int it = cvRound(t); @@ -914,6 +1042,16 @@ void polarToCart( InputArray src1, InputArray src2, vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m)); vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m)); } + #elif CV_SSE2 + if (USE_SSE2) + { + for( ; k <= len - 4; k += 4 ) + { + __m128 v_m = _mm_loadu_ps(mag + k); + _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m)); + _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m)); + } + } #endif for( ; k < len; k++ ) @@ -939,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2, x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; } else - for( k = 0; k < len; k++ ) - { - x[k] = buf[0][k]; y[k] = buf[1][k]; - } + for( k = 0; k < len; k++ ) // per-element copy converts buf's floats to x/y's element type; a raw memcpy cannot + { + x[k] = buf[0][k]; y[k] = buf[1][k]; + } } if( ptrs[0] ) diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index b2f36b3292..6c8bad2444 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. 
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index ef154400e2..0f85cc5568 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -192,6 +192,7 @@ struct NoVec extern volatile bool USE_SSE2; extern volatile bool USE_SSE4_2; extern volatile bool USE_AVX; +extern volatile bool USE_AVX2; enum { BLOCK_SIZE = 1024 }; diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ca56a7c966..87c423dc3b 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -72,7 +73,114 @@ struct Sum_SIMD } }; -#if CV_NEON +#if CV_SSE2 + +template <> +struct Sum_SIMD +{ + int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); + + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + + v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); + + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + } + + int CV_DECL_ALIGNED(16) ar[4]; + _mm_store_si128((__m128i*)ar, v_sum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + +template <> +struct Sum_SIMD +{ + int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; + + for ( ; x <= len - 4; x += 4) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x)); + v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src)); + 
v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8))); + } + + double CV_DECL_ALIGNED(16) ar[4]; + _mm_store_pd(ar, v_sum0); + _mm_store_pd(ar + 2, v_sum1); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + +template <> +struct Sum_SIMD +{ + int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2) + return 0; + + int x = 0; + __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero; + + for ( ; x <= len - 4; x += 4) + { + __m128 v_src = _mm_loadu_ps(src0 + x); + v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src)); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src)); + } + + double CV_DECL_ALIGNED(16) ar[4]; + _mm_store_pd(ar, v_sum0); + _mm_store_pd(ar + 2, v_sum1); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + dst[j] += ar[j + i]; + + return x / cn; + } +}; + + +#elif CV_NEON template <> struct Sum_SIMD @@ -396,6 +504,38 @@ static int countNonZero_(const T* src, int len ) return nz; } +#if CV_SSE2 + +static const uchar * initPopcountTable() +{ + static uchar tab[256]; + static volatile bool initialized = false; + if( !initialized ) + { + // we compute inverse popcount table, + // since we pass (img[x] == 0) mask as index in the table. 
+ unsigned int j = 0u; +#if CV_POPCNT + if (checkHardwareSupport(CV_CPU_POPCNT)) + for( ; j < 256u; j++ ) + tab[j] = (uchar)(8 - _mm_popcnt_u32(j)); +#endif + // portable fallback: j is still 0 here unless the POPCNT loop above already filled the table + for( ; j < 256u; j++ ) + { + int val = 0; + for( int mask = 1; mask < 256; mask += mask ) + val += (j & mask) == 0; + tab[j] = (uchar)val; + } + initialized = true; + } + + return tab; +} + +#endif + static int countNonZero8u( const uchar* src, int len ) { int i=0, nz = 0; @@ -403,21 +543,7 @@ static int countNonZero8u( const uchar* src, int len ) if(USE_SSE2)//5x-6x { __m128i pattern = _mm_setzero_si128 (); - static uchar tab[256]; - static volatile bool initialized = false; - if( !initialized ) - { - // we compute inverse popcount table, - // since we pass (img[x] == 0) mask as index in the table. - for( int j = 0; j < 256; j++ ) - { - int val = 0; - for( int mask = 1; mask < 256; mask += mask ) - val += (j & mask) == 0; - tab[j] = (uchar)val; - } - initialized = true; - } + static const uchar * tab = initPopcountTable(); for (; i<=len-16; i+=16) { @@ -467,7 +593,22 @@ static int countNonZero8u( const uchar* src, int len ) static int countNonZero16u( const ushort* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero = _mm_setzero_si128 (); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i)); + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_cmpeq_epi16(v_src, v_zero), v_zero)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); @@ -503,7 +644,27 @@ static int countNonZero16u( const ushort* src, int len ) static int countNonZero32s( const int* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero = _mm_setzero_si128 (); + static const uchar * tab = 
initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i)); + __m128i v_dst0 = _mm_cmpeq_epi32(v_src, v_zero); + + v_src = _mm_loadu_si128((const __m128i*)(src + i + 4)); + __m128i v_dst1 = _mm_cmpeq_epi32(v_src, v_zero); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); int32x4_t v_zero = vdupq_n_s32(0.0f); @@ -541,7 +702,25 @@ static int countNonZero32s( const int* src, int len ) static int countNonZero32f( const float* src, int len ) { int i = 0, nz = 0; -#if CV_NEON +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero_i = _mm_setzero_si128(); + __m128 v_zero_f = _mm_setzero_ps(); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_dst0 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i), v_zero_f)); + __m128i v_dst1 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i + 4), v_zero_f)); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i)); + nz += tab[val]; + } + + src += i; + } +#elif CV_NEON int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; uint32x4_t v_nz = vdupq_n_u32(0u); float32x4_t v_zero = vdupq_n_f32(0.0f); @@ -577,7 +756,34 @@ static int countNonZero32f( const float* src, int len ) } static int countNonZero64f( const double* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_SSE2 + if (USE_SSE2) + { + __m128i v_zero_i = _mm_setzero_si128(); + __m128d v_zero_d = _mm_setzero_pd(); + static const uchar * tab = initPopcountTable(); + + for ( ; i <= len - 8; i += 8) + { + __m128i v_dst0 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i), v_zero_d)); + __m128i v_dst1 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 2), v_zero_d)); + __m128i 
v_dst2 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 4), v_zero_d)); + __m128i v_dst3 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 6), v_zero_d)); + + v_dst0 = _mm_packs_epi32(v_dst0, v_dst1); + v_dst1 = _mm_packs_epi32(v_dst2, v_dst3); + + int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i)); + nz += tab[val]; + } + + src += i; + } +#endif + return nz + countNonZero_(src, len - i); +} typedef int (*CountNonZeroFunc)(const uchar*, int); @@ -594,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth) return countNonZeroTab[depth]; } +template +struct SumSqr_SIMD +{ + int operator () (const T *, const uchar *, ST *, SQT *, int, int) const + { + return 0; + } +}; + +#if CV_SSE2 + +template <> +struct SumSqr_SIMD +{ + int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero); + + __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); + __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + + v_half = _mm_unpackhi_epi8(v_src, v_zero); + v_mullo = _mm_mullo_epi16(v_half, v_half); + v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, 
_mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero); + + __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); + __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); + v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero)); + v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + int CV_DECL_ALIGNED(16) ar[8]; + _mm_store_si128((__m128i*)ar, v_sum); + _mm_store_si128((__m128i*)(ar + 4), v_sqsum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + { + sum[j] += ar[j + i]; + sqsum[j] += ar[4 + j + i]; + } + + return x / cn; + } +}; + +template <> +struct SumSqr_SIMD +{ + int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const + { + if (mask || (cn != 1 && cn != 2) || !USE_SSE2) + return 0; + + int x = 0; + __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero; + + for ( ; x <= len - 16; x += 16) + { + __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x)); + __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8); + + __m128i v_mullo = _mm_mullo_epi16(v_half, v_half); + __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + + v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8); + v_mullo = _mm_mullo_epi16(v_half, v_half); + v_mulhi = _mm_mulhi_epi16(v_half, v_half); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16)); + v_sum = 
_mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + for ( ; x <= len - 8; x += 8) + { + __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8); + + __m128i v_mullo = _mm_mullo_epi16(v_src, v_src); + __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)); + v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi)); + v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi)); + } + + int CV_DECL_ALIGNED(16) ar[8]; + _mm_store_si128((__m128i*)ar, v_sum); + _mm_store_si128((__m128i*)(ar + 4), v_sqsum); + + for (int i = 0; i < 4; i += cn) + for (int j = 0; j < cn; ++j) + { + sum[j] += ar[j + i]; + sqsum[j] += ar[4 + j + i]; + } + + return x / cn; + } +}; + +#endif + template static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn ) { @@ -601,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le if( !mask ) { - int i; - int k = cn % 4; + SumSqr_SIMD vop; + int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4; + src += i * cn; if( k == 1 ) { ST s0 = sum[0]; SQT sq0 = sqsum[0]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v = src[0]; s0 += v; sq0 += (SQT)v*v; @@ -620,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le { ST s0 = sum[0], s1 = sum[1]; SQT sq0 = sqsum[0], sq1 = sqsum[1]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v0 = src[0], v1 = src[1]; s0 += v0; sq0 += (SQT)v0*v0; @@ -633,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* 
sqsum, int le { ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; - for( i = 0; i < len; i++, src += cn ) + for( ; i < len; i++, src += cn ) { T v0 = src[0], v1 = src[1], v2 = src[2]; s0 += v0; sq0 += (SQT)v0*v0; @@ -649,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le src = src0 + k; ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3]; SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3]; - for( i = 0; i < len; i++, src += cn ) + for( i = 0; i < len; i++, src += cn ) { T v0, v1; v0 = src[0], v1 = src[1]; @@ -924,7 +1262,6 @@ cv::Scalar cv::sum( InputArray _src ) } } #endif - SumFunc func = getSumFunc(depth); CV_Assert( cn <= 4 && func != 0 ); diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index daf13a2dda..b6f3466f71 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -89,6 +90,22 @@ pop ebx } } + static void __cpuidex(int* cpuid_data, int, int) + { + __asm + { + push edi + mov edi, cpuid_data + mov eax, 7 + mov ecx, 0 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + } + } #endif #endif @@ -208,7 +225,7 @@ struct HWFeatures enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; HWFeatures(void) - { + { memset( have, 0, sizeof(have) ); x86_family = 0; } @@ -252,10 +269,54 @@ struct HWFeatures f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; + f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + + // make the second call to the cpuid command in order to get + // information about extended features like AVX2 + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuidex(cpuid_data, 7, 0); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $7, %%eax\n\t" + "movl $0, %%ecx\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%eax\n\t" + "pushl %%edx\n\t" + "movl $7,%%eax\n\t" + "movl $0,%%ecx\n\t" + "cpuid\n\t" + "popl %%edx\n\t" + "popl %%eax\n\t" + : "=b"(cpuid_data[1]), "=c"(cpuid_data[2]) + : + : "cc" + ); + #endif + #endif + f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + + f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; + f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & 
(1<<17)) != 0; + f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; + f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; + f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; + f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; + f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; + f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; + f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; } #if defined ANDROID || defined __linux__ @@ -318,6 +379,7 @@ IPPInitializer ippInitializer; volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; +volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2]; void setUseOptimized( bool flag ) { diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 443111f48d..ffc20777b5 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -10,8 +10,7 @@ // License Agreement // For Open Source Computer Vision Library // -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp index d0d3847bed..0541819f89 100644 --- a/modules/core/test/ocl/test_arithm.cpp +++ b/modules/core/test/ocl/test_arithm.cpp @@ -1577,7 +1577,7 @@ PARAM_TEST_CASE(ConvertScaleAbs, MatDepth, Channels, bool) Size roiSize = randomSize(1, MAX_VALUE); Border srcBorder = randomBorder(0, use_roi ? 
MAX_VALUE : 0); - randomSubMat(src, src_roi, roiSize, srcBorder, stype, 2, 11); // FIXIT: Test with minV, maxV + randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16); diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index d2e8b39aa3..23dc4576ba 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. / // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 1311d5abb9..233218b3e2 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -11,6 +11,7 @@ // For Open Source Computer Vision Library // // Copyright (C) 2000, Intel Corporation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index 18a91d9544..06fc73153f 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -11,6 +11,7 @@ // For Open Source Computer Vision Library // // Copyright (C) 2013, NVIDIA Corporation, all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f0a8fd8584..b900cf1845 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -965,6 +966,11 @@ struct Gray2RGB5x5 #if CV_NEON v_n7 = vdup_n_u8(~7); v_n3 = vdup_n_u8(~3); + #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + v_n7 = _mm_set1_epi16(~7); + v_n3 = _mm_set1_epi16(~3); + v_zero = _mm_setzero_si128(); #endif } @@ -982,6 +988,26 @@ struct Gray2RGB5x5 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8)); vst1q_u16((ushort *)dst + i, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; i <= n - 16; i += 16 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3), + _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3), + _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } + } #endif for ( ; i < n; i++ ) { @@ -998,6 +1024,26 @@ struct Gray2RGB5x5 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10)); vst1q_u16((ushort *)dst + i, v_dst); } + #elif CV_SSE2 + if 
(haveSIMD) + { + for ( ; i <= n - 16; i += 16 ) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i)); + + __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3); + __m128i v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi16(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst); + + v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3); + v_dst = _mm_or_si128(v_src_p, + _mm_or_si128(_mm_slli_epi16(v_src_p, 5), + _mm_slli_epi16(v_src_p, 10))); + _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst); + } + } #endif for( ; i < n; i++ ) { @@ -1010,6 +1056,9 @@ struct Gray2RGB5x5 #if CV_NEON uint8x8_t v_n7, v_n3; + #elif CV_SSE2 + __m128i v_n7, v_n3, v_zero; + bool haveSIMD; #endif }; @@ -1042,6 +1091,14 @@ struct RGB5x52Gray v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); v_f8 = vdupq_n_u16(0xf8); v_fc = vdupq_n_u16(0xfc); + #elif CV_SSE2 + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + v_b2y = _mm_set1_epi16(B2Y); + v_g2y = _mm_set1_epi16(G2Y); + v_r2y = _mm_set1_epi16(R2Y); + v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_f8 = _mm_set1_epi16(0xf8); + v_fc = _mm_set1_epi16(0xfc); #endif } @@ -1067,6 +1124,42 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } + #elif CV_SSE2 + if (haveSIMD) + { + __m128i v_zero = _mm_setzero_si128(); + + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, 
v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } + } #endif for ( ; i < n; i++) { @@ -1095,6 +1188,42 @@ struct RGB5x52Gray vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); } + #elif CV_SSE2 + if (haveSIMD) + { + __m128i v_zero = _mm_setzero_si128(); + + for ( ; i <= n - 8; i += 8) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i)); + __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8), + v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8), + v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8); + + __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y); + __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y); + __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y); + __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y); + __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y); + __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y); + + __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta), + _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r)); + + __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta), + _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r)); + + v_dst0 = _mm_srli_epi32(v_dst0, 
yuv_shift); + v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift); + + __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1); + _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero)); + } + } #endif for ( ; i < n; i++) { @@ -1111,6 +1240,11 @@ struct RGB5x52Gray uint16x4_t v_b2y, v_g2y, v_r2y; uint32x4_t v_delta; uint16x8_t v_f8, v_fc; + #elif CV_SSE2 + bool haveSIMD; + __m128i v_b2y, v_g2y, v_r2y; + __m128i v_delta; + __m128i v_f8, v_fc; #endif }; @@ -1327,6 +1461,219 @@ struct RGB2Gray float32x4_t v_cb, v_cg, v_cr; }; +#elif CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct RGB2Gray +{ + typedef ushort channel_type; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : + srccn(_srccn) + { + static const int coeffs0[] = { R2Y, G2Y, B2Y }; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); + if( blueIdx == 0 ) + std::swap(coeffs[0], coeffs[2]); + + v_cb = _mm_set1_epi16((short)coeffs[0]); + v_cg = _mm_set1_epi16((short)coeffs[1]); + v_cr = _mm_set1_epi16((short)coeffs[2]); + v_delta = _mm_set1_epi32(1 << (yuv_shift - 1)); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); + } + + // 16s x 8 + void process(__m128i v_b, __m128i v_g, __m128i v_r, + __m128i & v_gray) const + { + __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr); + __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg); + __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb); + __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr); + __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg); + __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb); + + __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r), + _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g)); + v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0); + v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift); + + __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r), + _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g)); + v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1); + v_gray1 
= _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift); + + v_gray = _mm_packus_epi32(v_gray0, v_gray1); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + + if (scn == 3 && haveSIMD) + { + for ( ; i <= n - 16; i += 16, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128i v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128i v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_si128((__m128i *)(dst + i), v_gray0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + } + } + else if (scn == 4 && haveSIMD) + { + for ( ; i <= n - 16; i += 16, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); + + __m128i v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128i v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_si128((__m128i *)(dst + i), v_gray0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1); + } + } + + for( ; i < n; i++, src += scn) + dst[i] = 
(ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); + } + + int srccn, coeffs[3]; + __m128i v_cb, v_cg, v_cr; + __m128i v_delta; + bool haveSIMD; +}; + +#endif // CV_SSE4_1 + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; + memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + v_cb = _mm_set1_ps(coeffs[0]); + v_cg = _mm_set1_ps(coeffs[1]); + v_cr = _mm_set1_ps(coeffs[2]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + void process(__m128 v_b, __m128 v_g, __m128 v_r, + __m128 & v_gray) const + { + v_gray = _mm_mul_ps(v_r, v_cr); + v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg)); + v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb)); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + + if (scn == 3 && haveSIMD) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128 v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128 v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_ps(dst + i, v_gray0); + _mm_storeu_ps(dst + i + 4, v_gray1); + } + } + else if (scn == 4 && haveSIMD) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + __m128 v_a0 = _mm_loadu_ps(src + 
24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1); + + __m128 v_gray0; + process(v_r0, v_g0, v_b0, + v_gray0); + + __m128 v_gray1; + process(v_r1, v_g1, v_b1, + v_gray1); + + _mm_storeu_ps(dst + i, v_gray0); + _mm_storeu_ps(dst + i + 4, v_gray1); + } + } + + for ( ; i < n; i++, src += scn) + dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; + __m128 v_cb, v_cg, v_cr; + bool haveSIMD; +}; + #else template<> struct RGB2Gray @@ -1449,6 +1796,103 @@ struct RGB2YCrCb_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; }; +#elif CV_SSE2 + +template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : + srccn(_srccn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_delta = _mm_set1_ps(ColorChannel::half()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + void process(__m128 v_r, __m128 v_g, __m128 v_b, + __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const + { + v_y = _mm_mul_ps(v_r, v_c0); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1)); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2)); + + v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta); + v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? 
v_b : v_r, v_y), v_c4), v_delta); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + + if (haveSIMD) + { + for ( ; i <= n - 24; i += 24, src += 8 * scn) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + if (scn == 4) + { + __m128 v_a0 = _mm_loadu_ps(src + 24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128 v_y0, v_cr0, v_cb0; + process(v_r0, v_g0, v_b0, + v_y0, v_cr0, v_cb0); + + __m128 v_y1, v_cr1, v_cb1; + process(v_r1, v_g1, v_b1, + v_y1, v_cr1, v_cb1); + + _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + + _mm_storeu_ps(dst + i, v_y0); + _mm_storeu_ps(dst + i + 4, v_y1); + _mm_storeu_ps(dst + i + 8, v_cr0); + _mm_storeu_ps(dst + i + 12, v_cr1); + _mm_storeu_ps(dst + i + 16, v_cb0); + _mm_storeu_ps(dst + i + 20, v_cb1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb; + } + } + int srccn, blueIdx; + float coeffs[5]; + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; + bool haveSIMD; +}; + #endif template struct RGB2YCrCb_i @@ -1699,7 +2143,288 @@ struct RGB2YCrCb_i int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; }; -#endif +#elif CV_SSE4_1 + +template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int 
coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_epi32(coeffs[0]); + v_c1 = _mm_set1_epi32(coeffs[1]); + v_c2 = _mm_set1_epi32(coeffs[2]); + v_c3 = _mm_set1_epi32(coeffs[3]); + v_c4 = _mm_set1_epi32(coeffs[4]); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); + v_delta = _mm_add_epi32(v_delta, v_delta2); + v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); + } + + // 16u x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const + { + __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); + __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); + __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); + + __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); + + __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); + __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); + v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); + v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); + + v_r_p = _mm_unpackhi_epi16(v_r, v_zero); + v_g_p = _mm_unpackhi_epi16(v_g, v_zero); + v_b_p = _mm_unpackhi_epi16(v_b, v_zero); + + __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); + + __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3); + __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? 
v_r_p : v_b_p, v_y1), v_c4); + v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); + v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); + + v_y = _mm_packs_epi32(v_y0, v_y1); + v_cr = _mm_packs_epi32(v_cr0, v_cr1); + v_cb = _mm_packs_epi32(v_cb0, v_cb1); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + if (haveSIMD) + { + for ( ; i <= n - 96; i += 96, src += scn * 32) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); + + if (scn == 4) + { + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112)); + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + v_y0, v_cr0, v_cb0); + + __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + v_y0, v_cr0, v_cb0); + + 
process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); + + _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1); + + _mm_storeu_si128((__m128i *)(dst + i), v_y_0); + _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); + _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0); + _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1); + _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0); + _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + __m128i v_c0, v_c1, v_c2; + __m128i v_c3, v_c4, v_delta, v_delta2; + __m128i v_zero; + bool haveSIMD; +}; + +template <> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_epi32(coeffs[0]); + v_c1 = _mm_set1_epi32(coeffs[1]); + v_c2 = _mm_set1_epi32(coeffs[2]); + v_c3 = _mm_set1_epi32(coeffs[3]); + v_c4 = _mm_set1_epi32(coeffs[4]); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); + v_delta = _mm_add_epi32(v_delta, v_delta2); + v_zero = _mm_setzero_si128(); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); + } + + // 16u x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const + { + __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); + __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); + __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); + + __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); + + __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); + __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); + v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); + v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); + + v_r_p = _mm_unpackhi_epi16(v_r, v_zero); + v_g_p = _mm_unpackhi_epi16(v_g, v_zero); + v_b_p = _mm_unpackhi_epi16(v_b, v_zero); + + __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); + + __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3); + __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? 
v_r_p : v_b_p, v_y1), v_c4); + v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); + v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); + + v_y = _mm_packus_epi32(v_y0, v_y1); + v_cr = _mm_packus_epi32(v_cr0, v_cr1); + v_cb = _mm_packus_epi32(v_cb0, v_cb1); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + if (haveSIMD) + { + for ( ; i <= n - 48; i += 48, src += scn * 16) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40)); + + if (scn == 4) + { + __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56)); + + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; + process(v_r0, v_g0, v_b0, + v_y0, v_cr0, v_cb0); + + __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; + process(v_r1, v_g1, v_b1, + v_y1, v_cr1, v_cb1); + + _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + + _mm_storeu_si128((__m128i *)(dst + i), v_y0); + _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1); + _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0); + _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1); + _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0); + _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + 
src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + __m128i v_c0, v_c1, v_c2; + __m128i v_c3, v_c4, v_delta, v_delta2; + __m128i v_zero; + bool haveSIMD; +}; + +#endif // CV_SSE4_1 template struct YCrCb2RGB_f { @@ -1809,6 +2534,118 @@ struct YCrCb2RGB_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; }; +#elif CV_SSE2 + +template <> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_delta = _mm_set1_ps(ColorChannel::half()); + v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + void process(__m128 v_y, __m128 v_cr, __m128 v_cb, + __m128 & v_r, __m128 & v_g, __m128 & v_b) const + { + v_cb = _mm_sub_ps(v_cb, v_delta); + v_cr = _mm_sub_ps(v_cr, v_delta); + + v_b = _mm_mul_ps(v_cb, v_c3); + v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1)); + v_r = _mm_mul_ps(v_cr, v_c0); + + v_b = _mm_add_ps(v_b, v_y); + v_g = _mm_add_ps(v_g, v_y); + v_r = _mm_add_ps(v_r, v_y); + + if (blueIdx == 0) + std::swap(v_b, v_r); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (haveSIMD) + { + for ( ; i <= n - 24; i += 24, dst += 8 * dcn) + { + __m128 v_y0 = _mm_loadu_ps(src + i); + __m128 v_y1 
= _mm_loadu_ps(src + i + 4); + __m128 v_cr0 = _mm_loadu_ps(src + i + 8); + __m128 v_cr1 = _mm_loadu_ps(src + i + 12); + __m128 v_cb0 = _mm_loadu_ps(src + i + 16); + __m128 v_cb1 = _mm_loadu_ps(src + i + 20); + + _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + + __m128 v_r0, v_g0, v_b0; + process(v_y0, v_cr0, v_cb0, + v_r0, v_g0, v_b0); + + __m128 v_r1, v_g1, v_b1; + process(v_y1, v_cr1, v_cb1, + v_r1, v_g1, v_b1); + + __m128 v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 3) + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + else + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + + _mm_storeu_ps(dst, v_r0); + _mm_storeu_ps(dst + 4, v_r1); + _mm_storeu_ps(dst + 8, v_g0); + _mm_storeu_ps(dst + 12, v_g1); + _mm_storeu_ps(dst + 16, v_b0); + _mm_storeu_ps(dst + 20, v_b1); + + if (dcn == 4) + { + _mm_storeu_ps(dst + 24, v_a0); + _mm_storeu_ps(dst + 28, v_a1); + } + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + float Y = src[i], Cr = src[i+1], Cb = src[i+2]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[4]; + + __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; + bool haveSIMD; +}; + #endif template struct YCrCb2RGB_i @@ -2096,7 +2933,185 @@ struct YCrCb2RGB_i uint16x4_t v_alpha2; }; -#endif +#elif CV_SSE2 + +template <> +struct YCrCb2RGB_i +{ + typedef uchar channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = _mm_set1_epi16((short)coeffs[0]); + v_c1 = _mm_set1_epi16((short)coeffs[1]); + v_c2 = _mm_set1_epi16((short)coeffs[2]); + v_c3 = _mm_set1_epi16((short)coeffs[3]); + v_delta = _mm_set1_epi16(ColorChannel::half()); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_zero = _mm_setzero_si128(); + + uchar alpha = ColorChannel::max(); + v_alpha = _mm_set1_epi8(*(char *)&alpha); + + useSSE = coeffs[0] <= std::numeric_limits::max(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + // 16s x 8 + void process(__m128i v_y, __m128i v_cr, __m128i v_cb, + __m128i & v_r, __m128i & v_g, __m128i & v_b) const + { + v_cr = _mm_sub_epi16(v_cr, v_delta); + v_cb = _mm_sub_epi16(v_cb, v_delta); + + __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero); + + __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3); + __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2); + __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1); + __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0); + + __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3); + __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2); + __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1); + __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0); + + __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); + __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2), + _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2), + yuv_shift); + __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); + + v_r0 = _mm_add_epi32(v_r0, v_y_p); + v_g0 = _mm_add_epi32(v_g0, v_y_p); + v_b0 = _mm_add_epi32(v_b0, v_y_p); + + v_y_p = _mm_unpackhi_epi16(v_y, v_zero); + + __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift); + __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2), + 
_mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2), + yuv_shift); + __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift); + + v_r1 = _mm_add_epi32(v_r1, v_y_p); + v_g1 = _mm_add_epi32(v_g1, v_y_p); + v_b1 = _mm_add_epi32(v_b1, v_y_p); + + v_r = _mm_packs_epi32(v_r0, v_r1); + v_g = _mm_packs_epi32(v_g0, v_g1); + v_b = _mm_packs_epi32(v_b0, v_b1); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (haveSIMD && useSSE) + { + for ( ; i <= n - 96; i += 96, dst += dcn * 32) + { + __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i)); + __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16)); + __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32)); + __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48)); + __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64)); + __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80)); + + _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1); + + __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero; + process(_mm_unpacklo_epi8(v_y0, v_zero), + _mm_unpacklo_epi8(v_cr0, v_zero), + _mm_unpacklo_epi8(v_cb0, v_zero), + v_r_0, v_g_0, v_b_0); + + __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero; + process(_mm_unpackhi_epi8(v_y0, v_zero), + _mm_unpackhi_epi8(v_cr0, v_zero), + _mm_unpackhi_epi8(v_cb0, v_zero), + v_r_1, v_g_1, v_b_1); + + __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1); + __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1); + __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1); + + process(_mm_unpacklo_epi8(v_y1, v_zero), + _mm_unpacklo_epi8(v_cr1, v_zero), + _mm_unpacklo_epi8(v_cb1, v_zero), + v_r_0, v_g_0, v_b_0); + + process(_mm_unpackhi_epi8(v_y1, v_zero), + _mm_unpackhi_epi8(v_cr1, v_zero), + 
_mm_unpackhi_epi8(v_cb1, v_zero), + v_r_1, v_g_1, v_b_1); + + __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1); + __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1); + __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1); + + if (bidx == 0) + { + std::swap(v_r0, v_b0); + std::swap(v_r1, v_b1); + } + + __m128i v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 3) + _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + else + _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + + _mm_storeu_si128((__m128i *)(dst), v_r0); + _mm_storeu_si128((__m128i *)(dst + 16), v_r1); + _mm_storeu_si128((__m128i *)(dst + 32), v_g0); + _mm_storeu_si128((__m128i *)(dst + 48), v_g1); + _mm_storeu_si128((__m128i *)(dst + 64), v_b0); + _mm_storeu_si128((__m128i *)(dst + 80), v_b1); + + if (dcn == 4) + { + _mm_storeu_si128((__m128i *)(dst + 96), v_a0); + _mm_storeu_si128((__m128i *)(dst + 112), v_a1); + } + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + uchar Y = src[i]; + uchar Cr = src[i+1]; + uchar Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + bool useSSE, haveSIMD; + + __m128i v_c0, v_c1, v_c2, v_c3, v_delta2; + __m128i v_delta, v_alpha, v_zero; +}; + +#endif // CV_SSE2 ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// @@ -2219,6 +3234,118 @@ struct RGB2XYZ_f float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; }; +#elif CV_SSE2 + +template <> +struct RGB2XYZ_f +{ + typedef float channel_type; + + RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + memcpy(coeffs, _coeffs ? 
_coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_c5 = _mm_set1_ps(coeffs[5]); + v_c6 = _mm_set1_ps(coeffs[6]); + v_c7 = _mm_set1_ps(coeffs[7]); + v_c8 = _mm_set1_ps(coeffs[8]); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + void process(__m128 v_r, __m128 v_g, __m128 v_b, + __m128 & v_x, __m128 & v_y, __m128 & v_z) const + { + v_x = _mm_mul_ps(v_r, v_c0); + v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1)); + v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2)); + + v_y = _mm_mul_ps(v_r, v_c3); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4)); + v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5)); + + v_z = _mm_mul_ps(v_r, v_c6); + v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7)); + v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8)); + } + + void operator()(const float* src, float* dst, int n) const + { + int scn = srccn, i = 0; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + n *= 3; + + if (haveSIMD) + { + for ( ; i <= n - 24; i += 24, src += 8 * scn) + { + __m128 v_r0 = _mm_loadu_ps(src); + __m128 v_r1 = _mm_loadu_ps(src + 4); + __m128 v_g0 = _mm_loadu_ps(src + 8); + __m128 v_g1 = _mm_loadu_ps(src + 12); + __m128 v_b0 = _mm_loadu_ps(src + 16); + __m128 v_b1 = _mm_loadu_ps(src + 20); + + if (scn == 4) + { + __m128 v_a0 = _mm_loadu_ps(src + 24); + __m128 v_a1 = _mm_loadu_ps(src + 28); + + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, + v_b0, v_b1, v_a0, v_a1); + } + else + _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + __m128 v_x0, v_y0, v_z0; + process(v_r0, v_g0, v_b0, + v_x0, v_y0, v_z0); + + __m128 v_x1, v_y1, v_z1; + process(v_r1, v_g1, v_b1, + v_x1, v_y1, v_z1); + 
+ _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); + + _mm_storeu_ps(dst + i, v_x0); + _mm_storeu_ps(dst + i + 4, v_x1); + _mm_storeu_ps(dst + i + 8, v_y0); + _mm_storeu_ps(dst + i + 12, v_y1); + _mm_storeu_ps(dst + i + 16, v_z0); + _mm_storeu_ps(dst + i + 20, v_z1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + float X = saturate_cast(src[0]*C0 + src[1]*C1 + src[2]*C2); + float Y = saturate_cast(src[0]*C3 + src[1]*C4 + src[2]*C5); + float Z = saturate_cast(src[0]*C6 + src[1]*C7 + src[2]*C8); + dst[i] = X; dst[i+1] = Y; dst[i+2] = Z; + } + } + + int srccn; + float coeffs[9]; + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + bool haveSIMD; +}; + + #endif template struct RGB2XYZ_i @@ -2249,6 +3376,7 @@ template struct RGB2XYZ_i C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; n *= 3; + for(int i = 0; i < n; i += 3, src += scn) { int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); @@ -2542,6 +3670,130 @@ template struct XYZ2RGB_f float coeffs[9]; }; +#if CV_SSE2 + +template <> +struct XYZ2RGB_f +{ + typedef float channel_type; + + XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + memcpy(coeffs, _coeffs ? 
_coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = _mm_set1_ps(coeffs[0]); + v_c1 = _mm_set1_ps(coeffs[1]); + v_c2 = _mm_set1_ps(coeffs[2]); + v_c3 = _mm_set1_ps(coeffs[3]); + v_c4 = _mm_set1_ps(coeffs[4]); + v_c5 = _mm_set1_ps(coeffs[5]); + v_c6 = _mm_set1_ps(coeffs[6]); + v_c7 = _mm_set1_ps(coeffs[7]); + v_c8 = _mm_set1_ps(coeffs[8]); + + v_alpha = _mm_set1_ps(ColorChannel::max()); + + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); + } + + void process(__m128 v_x, __m128 v_y, __m128 v_z, + __m128 & v_r, __m128 & v_g, __m128 & v_b) const + { + v_b = _mm_mul_ps(v_x, v_c0); + v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1)); + v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2)); + + v_g = _mm_mul_ps(v_x, v_c3); + v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4)); + v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5)); + + v_r = _mm_mul_ps(v_x, v_c6); + v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7)); + v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8)); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn; + float alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + int i = 0; + + if (haveSIMD) + { + for ( ; i <= n - 24; i += 24, dst += 8 * dcn) + { + __m128 v_x0 = _mm_loadu_ps(src + i); + __m128 v_x1 = _mm_loadu_ps(src + i + 4); + __m128 v_y0 = _mm_loadu_ps(src + i + 8); + __m128 v_y1 = _mm_loadu_ps(src + i + 12); + __m128 v_z0 = _mm_loadu_ps(src + i + 16); + __m128 v_z1 = _mm_loadu_ps(src + i + 20); + + _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1); + + __m128 v_r0, v_g0, v_b0; + process(v_x0, v_y0, v_z0, + v_r0, v_g0, v_b0); + + __m128 v_r1, v_g1, v_b1; + process(v_x1, v_y1, v_z1, + v_r1, v_g1, v_b1); + + __m128 v_a0 = v_alpha, v_a1 = v_alpha; + + if (dcn == 4) + 
_mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, + v_r0, v_r1, v_a0, v_a1); + else + _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1); + + _mm_storeu_ps(dst, v_b0); + _mm_storeu_ps(dst + 4, v_b1); + _mm_storeu_ps(dst + 8, v_g0); + _mm_storeu_ps(dst + 12, v_g1); + _mm_storeu_ps(dst + 16, v_r0); + _mm_storeu_ps(dst + 20, v_r1); + + if (dcn == 4) + { + _mm_storeu_ps(dst + 24, v_a0); + _mm_storeu_ps(dst + 28, v_a1); + } + } + + } + + for( ; i < n; i += 3, dst += dcn) + { + float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2; + float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5; + float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8; + dst[0] = B; dst[1] = G; dst[2] = R; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[9]; + + __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + __m128 v_alpha; + bool haveSIMD; +}; + +#endif // CV_SSE2 + template struct XYZ2RGB_i { @@ -3056,14 +4308,49 @@ struct HSV2RGB_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.0f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_g0 = _mm_mul_ps(v_g0, v_scale_inv); + v_b0 = _mm_mul_ps(v_b0, v_scale_inv); + + v_g1 = _mm_mul_ps(v_g1, v_scale_inv); + v_b1 = _mm_mul_ps(v_b1, v_scale_inv); + + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + _mm_store_ps(buf, v_r0); + 
_mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -3089,6 +4376,41 @@ struct HSV2RGB_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } + } #endif for( ; j < dn*3; j += 3 ) @@ -3129,6 +4451,28 @@ struct HSV2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = 
_mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, dst += dcn ) @@ -3147,6 +4491,10 @@ struct HSV2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale_inv, v_scale; + __m128i v_zero; + bool haveSIMD; #endif }; @@ -3218,13 +4566,42 @@ struct RGB2HLS_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + void process(const float * buf, + __m128i & v_h, __m128i & v_l, __m128i & v_s) const + { + __m128 v_h0f = _mm_load_ps(buf); + __m128 v_h1f = _mm_load_ps(buf + 4); + __m128 v_l0f = _mm_load_ps(buf + 8); + __m128 v_l1f = _mm_load_ps(buf + 12); + __m128 v_s0f = _mm_load_ps(buf + 16); + __m128 v_s1f = _mm_load_ps(buf + 20); + + _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f); + + v_l0f = _mm_mul_ps(v_l0f, v_scale); + v_l1f = _mm_mul_ps(v_l1f, v_scale); + v_s0f = _mm_mul_ps(v_s0f, v_scale); + v_s1f = _mm_mul_ps(v_s1f, v_scale); + + v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f)); + v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); + v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f)); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, dst += 
BLOCK_SIZE*3 ) { @@ -3262,6 +4639,26 @@ struct RGB2HLS_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (scn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, src += 16) + { + __m128i v_src = _mm_loadu_si128((__m128i const *)src); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + } + + int jr = j % 3; + if (jr) + src -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, src += scn ) { @@ -3286,6 +4683,43 @@ struct RGB2HLS_b vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); vst3_u8(dst + j, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_h_0, v_l_0, v_s_0; + process(buf + j, + v_h_0, v_l_0, v_s_0); + + __m128i v_h_1, v_l_1, v_s_1; + process(buf + j + 24, + v_h_1, v_l_1, v_s_1); + + __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1); + + process(buf + j + 48, + v_h_0, v_l_0, v_s_0); + + process(buf + j + 72, + v_h_1, v_l_1, v_s_1); + + __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1); + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1); + + _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1); + + _mm_storeu_si128((__m128i *)(dst + j), v_h0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 
48), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1); + } + } #endif for( ; j < dn*3; j += 3 ) { @@ -3301,6 +4735,10 @@ struct RGB2HLS_b #if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv; + __m128i v_zero; + bool haveSIMD; #endif }; @@ -3380,14 +4818,49 @@ struct HLS2RGB_b v_scale_inv = vdupq_n_f32(1.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_g0 = _mm_mul_ps(v_g0, v_scale_inv); + v_b0 = _mm_mul_ps(v_b0, v_scale_inv); + + v_g1 = _mm_mul_ps(v_g1, v_scale_inv); + v_b1 = _mm_mul_ps(v_b1, v_scale_inv); + + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -3413,6 +4886,41 @@ struct HLS2RGB_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); 
vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } + } #endif for( ; j < dn*3; j += 3 ) { @@ -3452,7 +4960,30 @@ struct HLS2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); @@ -3469,6 +5000,10 @@ struct HLS2RGB_b 
#if CV_NEON float32x4_t v_scale, v_scale_inv; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv; + __m128i v_zero; + bool haveSIMD; #endif }; @@ -3784,14 +5319,52 @@ struct Lab2RGB_b v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); v_128 = vdupq_n_f32(128.0f); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(100.f/255.f); + v_scale = _mm_set1_ps(255.f); + v_128 = _mm_set1_ps(128.0f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + float * buf) const + { + __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); + __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); + __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); + + __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); + __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); + __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); + + v_r0 = _mm_mul_ps(v_r0, v_scale_inv); + v_r1 = _mm_mul_ps(v_r1, v_scale_inv); + + v_g0 = _mm_sub_ps(v_g0, v_128); + v_g1 = _mm_sub_ps(v_g1, v_128); + v_b0 = _mm_sub_ps(v_b0, v_128); + v_b1 = _mm_sub_ps(v_b1, v_128); + + _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + _mm_store_ps(buf, v_r0); + _mm_store_ps(buf + 4, v_r1); + _mm_store_ps(buf + 8, v_g0); + _mm_store_ps(buf + 12, v_g1); + _mm_store_ps(buf + 16, v_b0); + _mm_store_ps(buf + 20, v_b1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -3817,6 +5390,41 @@ struct Lab2RGB_b v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = 
_mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } + } #endif for( ; j < dn*3; j += 3 ) @@ -3857,6 +5465,28 @@ struct Lab2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, dst += dcn ) @@ -3876,6 +5506,10 @@ struct Lab2RGB_b #if CV_NEON float32x4_t v_scale, v_scale_inv, v_128; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_128; + __m128i v_zero; + bool haveSIMD; #endif }; @@ 
-4050,13 +5684,48 @@ struct RGB2Luv_b v_coeff3 = vdupq_n_f32(0.9732824427480916f); v_coeff4 = vdupq_n_f32(136.259541984732824f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_zero = _mm_setzero_si128(); + v_scale_inv = _mm_set1_ps(1.f/255.f); + v_scale = _mm_set1_ps(2.55f); + v_coeff1 = _mm_set1_ps(0.72033898305084743f); + v_coeff2 = _mm_set1_ps(96.525423728813564f); + v_coeff3 = _mm_set1_ps(0.9732824427480916f); + v_coeff4 = _mm_set1_ps(136.259541984732824f); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + void process(const float * buf, + __m128i & v_l, __m128i & v_u, __m128i & v_v) const + { + __m128 v_l0f = _mm_load_ps(buf); + __m128 v_l1f = _mm_load_ps(buf + 4); + __m128 v_u0f = _mm_load_ps(buf + 8); + __m128 v_u1f = _mm_load_ps(buf + 12); + __m128 v_v0f = _mm_load_ps(buf + 16); + __m128 v_v1f = _mm_load_ps(buf + 20); + + _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f); + + v_l0f = _mm_mul_ps(v_l0f, v_scale); + v_l1f = _mm_mul_ps(v_l1f, v_scale); + v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2); + v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2); + v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4); + v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4); + + v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); + v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f)); + v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f)); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, scn = srccn; - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { @@ -4094,6 +5763,26 @@ struct RGB2Luv_b v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (scn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, src += 16) + { + __m128i v_src = 
_mm_loadu_si128((__m128i const *)src); + + __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); + _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + + v_src_p = _mm_unpackhi_epi8(v_src, v_zero); + _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); + _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); + } + + int jr = j % 3; + if (jr) + src -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, src += scn ) { @@ -4119,6 +5808,43 @@ struct RGB2Luv_b vst3_u8(dst + j, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_l_0, v_u_0, v_v_0; + process(buf + j, + v_l_0, v_u_0, v_v_0); + + __m128i v_l_1, v_u_1, v_v_1; + process(buf + j + 24, + v_l_1, v_u_1, v_v_1); + + __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1); + + process(buf + j + 48, + v_l_0, v_u_0, v_v_0); + + process(buf + j + 72, + v_l_1, v_u_1, v_v_1); + + __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1); + __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1); + __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1); + + _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + + _mm_storeu_si128((__m128i *)(dst + j), v_l0); + _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1); + _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0); + _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1); + _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0); + _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1); + } + } #endif for( ; j < dn*3; j += 3 ) @@ -4136,6 +5862,10 @@ struct RGB2Luv_b #if CV_NEON float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_coeff1, 
v_coeff2, v_coeff3, v_coeff4; + __m128i v_zero; + bool haveSIMD; #endif }; @@ -4156,14 +5886,55 @@ struct Luv2RGB_b v_140 = vdupq_n_f32(140.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + #elif CV_SSE2 + v_scale_inv = _mm_set1_ps(100.f/255.f); + v_coeff1 = _mm_set1_ps(1.388235294117647f); + v_coeff2 = _mm_set1_ps(1.027450980392157f); + v_134 = _mm_set1_ps(134.f); + v_140 = _mm_set1_ps(140.f); + v_scale = _mm_set1_ps(255.f); + v_zero = _mm_setzero_si128(); + haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif } + #if CV_SSE2 + // 16s x 8 + void process(__m128i v_l, __m128i v_u, __m128i v_v, + float * buf) const + { + __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero)); + __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero)); + __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero)); + + __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero)); + __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero)); + __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero)); + + v_l0 = _mm_mul_ps(v_l0, v_scale_inv); + v_l1 = _mm_mul_ps(v_l1, v_scale_inv); + + v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134); + v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134); + v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140); + v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140); + + _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1); + + _mm_store_ps(buf, v_l0); + _mm_store_ps(buf + 4, v_l1); + _mm_store_ps(buf + 8, v_u0); + _mm_store_ps(buf + 12, v_u1); + _mm_store_ps(buf + 16, v_v0); + _mm_store_ps(buf + 20, v_v1); + } + #endif + void operator()(const uchar* src, uchar* dst, int n) const { int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); - float buf[3*BLOCK_SIZE]; + float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { @@ -4189,6 +5960,41 @@ struct Luv2RGB_b v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), 
v_coeff2), v_140); vst3q_f32(buf + j + 12, v_dst); } + #elif CV_SSE2 + if (haveSIMD) + { + for ( ; j <= (dn - 32) * 3; j += 96) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80)); + + _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1); + + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + buf + j); + + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + buf + j + 24); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + buf + j + 48); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + buf + j + 72); + } + } #endif for( ; j < dn*3; j += 3 ) { @@ -4228,6 +6034,28 @@ struct Luv2RGB_b vst3_u8(dst, v_dst); } } + #elif CV_SSE2 + if (dcn == 3 && haveSIMD) + { + for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) + { + __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); + __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); + __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); + __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); + + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), + _mm_cvtps_epi32(v_src1)); + __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), + _mm_cvtps_epi32(v_src3)); + + _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); + } + + int jr = j % 3; + if (jr) + dst -= jr, j -= jr; + } #endif for( ; j < dn*3; j += 3, dst += dcn ) @@ -4247,6 +6075,10 @@ struct Luv2RGB_b #if CV_NEON 
float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; uint8x8_t v_alpha; + #elif CV_SSE2 + __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; + __m128i v_zero; + bool haveSIMD; #endif }; diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 85f2063b28..358cd5802b 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -270,6 +271,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType)) return; +#elif CV_SSE2 + bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); #endif int depth = src.depth(); @@ -318,6 +321,33 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, vst3q_f32(cov_data + j * 3, v_dst); } + #elif CV_SSE2 + if (haveSSE2) + { + for( ; j <= size.width - 8; j += 8 ) + { + __m128 v_dx_0 = _mm_loadu_ps(dxdata + j); + __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4); + __m128 v_dy_0 = _mm_loadu_ps(dydata + j); + __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4); + + __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0); + __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0); + __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0); + __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1); + __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1); + __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1); + + _mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1); + + _mm_storeu_ps(cov_data + j * 3, v_dx2_0); + _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1); + _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0); + 
_mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1); + _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0); + _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1); + } + } #endif for( ; j < size.width; j++ ) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 0b7afb8ea6..cec450dc71 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 9acdc11415..ec8de4d815 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -2284,15 +2284,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() ); +#if CV_SSE2 + bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2); +#endif + for( size_t i = 0; i < it.nplanes; i++, ++it ) { const float* h1 = it.planes[0].ptr(); const float* h2 = it.planes[1].ptr(); len = it.planes[0].rows*it.planes[0].cols*H1.channels(); + j = 0; if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT)) { - for( j = 0; j < len; j++ ) + for( ; j < len; j++ ) { double a = h1[j] - h2[j]; double b = (method == CV_COMP_CHISQR) ? 
h1[j] : h1[j] + h2[j]; @@ -2302,7 +2307,51 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_CORREL ) { - for( j = 0; j < len; j++ ) + #if CV_SSE2 + if (haveSIMD) + { + __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1; + __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1; + + for ( ; j <= len - 4; j += 4) + { + __m128 v_a = _mm_loadu_ps(h1 + j); + __m128 v_b = _mm_loadu_ps(h2 + j); + + // 0-1 + __m128d v_ad = _mm_cvtps_pd(v_a); + __m128d v_bd = _mm_cvtps_pd(v_b); + v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); + v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); + v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + + // 2-3 + v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); + v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); + v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); + v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); + v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + } + + double CV_DECL_ALIGNED(16) ar[10]; + _mm_store_pd(ar, v_s12); + _mm_store_pd(ar + 2, v_s11); + _mm_store_pd(ar + 4, v_s22); + _mm_store_pd(ar + 6, v_s1); + _mm_store_pd(ar + 8, v_s2); + + s12 += ar[0] + ar[1]; + s11 += ar[2] + ar[3]; + s22 += ar[4] + ar[5]; + s1 += ar[6] + ar[7]; + s2 += ar[8] + ar[9]; + } + #endif + for( ; j < len; j++ ) { double a = h1[j]; double b = h2[j]; @@ -2316,7 +2365,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_INTERSECT ) { - j = 0; #if CV_NEON float32x4_t v_result = vdupq_n_f32(0.0f); for( ; j <= len - 4; j += 4 ) @@ -2324,13 +2372,61 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) float CV_DECL_ALIGNED(16) ar[4]; vst1q_f32(ar, v_result); result += ar[0] + ar[1] + ar[2] + ar[3]; + #elif CV_SSE2 + if (haveSIMD) + { + __m128d 
v_result = _mm_setzero_pd(); + for ( ; j <= len - 4; j += 4) + { + __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j), + _mm_loadu_ps(h2 + j)); + v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); + v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); + v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); + } + + double CV_DECL_ALIGNED(16) ar[2]; + _mm_store_pd(ar, v_result); + result += ar[0] + ar[1]; + } #endif for( ; j < len; j++ ) result += std::min(h1[j], h2[j]); } else if( method == CV_COMP_BHATTACHARYYA ) { - for( j = 0; j < len; j++ ) + #if CV_SSE2 + if (haveSIMD) + { + __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1; + for ( ; j <= len - 4; j += 4) + { + __m128 v_a = _mm_loadu_ps(h1 + j); + __m128 v_b = _mm_loadu_ps(h2 + j); + + __m128d v_ad = _mm_cvtps_pd(v_a); + __m128d v_bd = _mm_cvtps_pd(v_b); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); + + v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); + v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); + v_s1 = _mm_add_pd(v_s1, v_ad); + v_s2 = _mm_add_pd(v_s2, v_bd); + v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); + } + + double CV_DECL_ALIGNED(16) ar[6]; + _mm_store_pd(ar, v_s1); + _mm_store_pd(ar + 2, v_s2); + _mm_store_pd(ar + 4, v_result); + s1 += ar[0] + ar[1]; + s2 += ar[2] + ar[3]; + result += ar[4] + ar[5]; + } + #endif + for( ; j < len; j++ ) { double a = h1[j]; double b = h2[j]; @@ -2341,7 +2437,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_KL_DIV ) { - for( j = 0; j < len; j++ ) + for( ; j < len; j++ ) { double p = h1[j]; double q = h2[j]; diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index c4bb3baa9f..fe126fbbd1 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -12,6 
+12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -1962,9 +1963,9 @@ private: struct ResizeAreaFastVec_SIMD_32f { ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step) + cn(_cn), step(_step) { - fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); } int operator() (const float * S, float * D, int w) const @@ -2004,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f } private: - int scale_x, scale_y; int cn; bool fast_mode; int step; @@ -2199,8 +2199,146 @@ private: bool use_simd; }; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f; +class ResizeAreaFastVec_SIMD_16s +{ +public: + ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short* S, short* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const short* S0 = (const short*)S; + const short* S1 = (const short*)((const uchar*)(S) + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + __m128i delta2 = _mm_set1_epi32(2); + + if (cn == 1) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), + _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); + __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), + 
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); + s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); + s0 = _mm_srai_epi32(s0, 2); + s0 = _mm_packs_epi32(s0, zero); + + _mm_storel_epi64((__m128i*)D, s0); + } + } + else if (cn == 3) + for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); + __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); + + __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); + __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); + s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + else + { + CV_Assert(cn == 4); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i r0 = _mm_loadu_si128((const __m128i*)S0); + __m128i r1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); + __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); + __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); + __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); + + __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); + __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); + s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); + s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); + _mm_storel_epi64((__m128i*)D, s0); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +struct ResizeAreaFastVec_SIMD_32f +{ + ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : + cn(_cn), step(_step) + { + fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); + fast_mode 
= fast_mode && checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const float * S, float * D, int w) const + { + if (!fast_mode) + return 0; + + const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); + int dx = 0; + + __m128 v_025 = _mm_set1_ps(0.25f); + + if (cn == 1) + { + const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), + v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); + + __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), + _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); + __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), + _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); + __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); + + _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); + } + } + + return dx; + } + +private: + int cn; + bool fast_mode; + int step; +}; #else @@ -4678,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, size.height = 1; } +#if CV_SSE2 + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#endif +#if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); +#endif + const float scale = 1.f/INTER_TAB_SIZE; int x, y; for( y = 0; y < size.height; y++ ) @@ -4708,6 +4853,29 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst2q_s16(dst1 + (x << 1), v_dst); } + #elif CV_SSE4_1 + if (useSSE4_1) + { + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); + __m128i v_dst1 = 
_mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); + + __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); + __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), + _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); + + _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); + } + } #endif for( ; x < size.width; x++ ) { @@ -4742,6 +4910,52 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } + #elif CV_SSE4_1 + if (useSSE4_1) + { + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + + for( ; x <= size.width - 16; x += 16 ) + { + __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); + __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); + __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); + __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); + + __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); + + v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), 
v_its)); + v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); + v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); + v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); + + __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), + _mm_srai_epi32(v_ix1, INTER_BITS)); + __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), + _mm_srai_epi32(v_iy1, INTER_BITS)); + v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), + _mm_and_si128(v_ix0, v_its1)); + v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), + _mm_and_si128(v_ix1, v_its1)); + _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); + + _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); + + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); + _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); + } + } #endif for( ; x < size.width; x++ ) { @@ -4761,6 +4975,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, for( ; x <= (size.width << 1) - 8; x += 8 ) vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); + #elif CV_SSE2 + for( ; x <= (size.width << 1) - 8; x += 8 ) + { + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), + _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); + } #endif for( ; x < size.width; x++ ) { @@ -4796,6 +5016,30 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vandq_s32(v_ix1, v_mask))); vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); } + #elif CV_SSE4_1 + if (useSSE4_1) + { + __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); + __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128i v_y_mask = 
_mm_set1_epi32((INTER_TAB_SIZE-1) << 16); + + for( ; x <= size.width - 4; x += 4 ) + { + __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); + __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); + + __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), + _mm_srai_epi32(v_src1, INTER_BITS)); + _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); + + // x0 y0 x1 y1 . . . + v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), + _mm_and_si128(v_src1, v_its1)); + __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . + _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . + _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); + } + } #endif for( ; x < size.width; x++ ) { @@ -4841,6 +5085,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, vst1q_f32(dst1f + x + 4, v_dst1); vst1q_f32(dst2f + x + 4, v_dst2); } + #elif CV_SSE2 + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); + + for( ; x <= size.width - 16; x += 16) + { + __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); + __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); + __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); + + _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); + + __m128i v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; + __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); /* zero-extend the low four fxy lanes: pixels x..x+3 */ + _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); /* high four fxy lanes: pixels x+4..x+7 */ + _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + + v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; + v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); /* low half of the second fxy load: pixels x+8..x+11 (was unpackhi, which reused the x+12..x+15 weights) */ + _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); /* high half of the second fxy load: pixels x+12..x+15 */ + _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); + _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), + _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); + } #endif for( ; x < size.width; x++ ) { @@ -4882,6 +5164,27 @@ void cv::convertMaps(
InputArray _map1, InputArray _map2, v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); vst2q_f32(dst1f + (x << 1) + 8, v_dst); } + #elif CV_SSE2 + if (useSSE2) + { + __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); + __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); + __m128 v_scale = _mm_set1_ps(scale); + + for ( ; x <= size.width - 4; x += 4) /* one 128-bit load of src1 holds 8 shorts = 4 interleaved (x, y) pairs, so step 4 pixels */ + { + __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); + __m128i v_fxy = src2 ? _mm_unpacklo_epi16(_mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2), v_zero) : v_zero; /* 4 fxy values, masked then zero-extended to 32 bits */ + __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); /* fx0 fx1 fx2 fx3 */ + __m128i v_fxy2 = _mm_srli_epi32(v_fxy, INTER_BITS); /* fy0 fy1 fy2 fy3 */ + + __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(v_fxy1, v_fxy2)), v_scale); /* fx0 fy0 fx1 fy1, scaled */ + _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add)); /* sign-extend x0 y0 x1 y1 before converting */ + + v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(v_fxy1, v_fxy2)), v_scale); /* fx2 fy2 fx3 fy3, scaled */ + _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add)); /* pixels x+2, x+3 land 4 floats after the first store */ + } + } #endif for( ; x < size.width; x++ ) { @@ -4919,7 +5222,10 @@ public: const int AB_SCALE = 1 << AB_BITS; int round_delta = interpolation == INTER_NEAREST ?
AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; #if CV_SSE2 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + #if CV_SSE4_1 + bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); #endif int bh0 = std::min(BLOCK_SZ/2, dst.rows); @@ -4957,6 +5263,31 @@ public: vst2q_s16(xy + (x1 << 1), v_dst); } + #elif CV_SSE4_1 + if (useSSE4_1) + { + __m128i v_X0 = _mm_set1_epi32(X0); + __m128i v_Y0 = _mm_set1_epi32(Y0); + for ( ; x1 <= bw - 16; x1 += 16) + { + __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); + __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); + + __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); + __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), + _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); + + _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); + } + } #endif for( ; x1 < bw; x1++ ) { @@ -4971,7 +5302,7 @@ public: short* alpha = A + y1*bw; x1 = 0; #if CV_SSE2 - if( useSIMD ) + if( useSSE2 ) { __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); __m128i XX = _mm_set1_epi32(X0), YY = 
_mm_set1_epi32(Y0); @@ -5364,6 +5695,20 @@ public: int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); + #if CV_SSE4_1 + bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); + __m128d v_M0 = _mm_set1_pd(M[0]); + __m128d v_M3 = _mm_set1_pd(M[3]); + __m128d v_M6 = _mm_set1_pd(M[6]); + __m128d v_intmax = _mm_set1_pd((double)INT_MAX); + __m128d v_intmin = _mm_set1_pd((double)INT_MIN); + __m128d v_2 = _mm_set1_pd(2), + v_zero = _mm_setzero_pd(), + v_1 = _mm_set1_pd(1), + v_its = _mm_set1_pd(INTER_TAB_SIZE); + __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); + #endif + for( y = range.start; y < range.end; y += bh0 ) { for( x = 0; x < width; x += bw0 ) @@ -5382,7 +5727,120 @@ public: double W0 = M[6]*x + M[7]*(y + y1) + M[8]; if( interpolation == INTER_NEAREST ) - for( x1 = 0; x1 < bw; x1++ ) + { + x1 = 0; + + #if CV_SSE4_1 + if (haveSSE4_1) + { + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // convert to 16s + v_X0 = _mm_packs_epi32(v_X0, v_X1); + v_X1 = _mm_packs_epi32(v_X2, v_X3); + v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); + v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + 
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + } + #endif + + for( ; x1 < bw; x1++ ) { double W = W0 + M[6]*x1; W = W ? 1./W : 0; @@ -5394,10 +5852,136 @@ public: xy[x1*2] = saturate_cast(X); xy[x1*2+1] = saturate_cast(Y); } + } else { short* alpha = A + y1*bw; - for( x1 = 0; x1 < bw; x1++ ) + x1 = 0; + + #if CV_SSE4_1 + if (haveSSE4_1) + { + __m128d v_X0d = _mm_set1_pd(X0); + __m128d v_Y0d = _mm_set1_pd(Y0); + __m128d v_W0 = _mm_set1_pd(W0); + __m128d v_x1 = _mm_set_pd(1, 0); + + for( ; x1 <= bw - 16; x1 += 16 ) + { + // 0-3 + __m128i v_X0, v_Y0; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 4-8 + __m128i v_X1, v_Y1; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, 
_mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 8-11 + __m128i v_X2, v_Y2; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // 12-15 + __m128i v_X3, v_Y3; + { + __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = 
_mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); + v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); + __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); + __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); + v_x1 = _mm_add_pd(v_x1, v_2); + + v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); + v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), + _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); + } + + // store alpha + __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), + _mm_and_si128(v_X0, v_itsi1)); + __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), + _mm_and_si128(v_X1, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); + + v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), + _mm_and_si128(v_X2, v_itsi1)); + v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), + _mm_and_si128(v_X3, v_itsi1)); + _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); + + // convert to 16s + v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); + v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); + v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); + v_Y1 = 
_mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); + + _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); + + _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); + _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); + } + } + #endif + + for( ; x1 < bw; x1++ ) { double W = W0 + M[6]*x1; W = W ? INTER_TAB_SIZE/W : 0; diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index e510530afd..4271b942ae 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -183,13 +184,336 @@ struct PyrDownVec_32f } }; -typedef PyrDownNoVec PyrDownVec_32s16u; -typedef PyrDownNoVec PyrDownVec_32s16s; +#if CV_SSE4_1 + +struct PyrDownVec_32s16u +{ + PyrDownVec_32s16u() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator()(int** src, ushort* dst, int, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + __m128i v_delta = _mm_set1_epi32(128); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), + v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + 
x)), + v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); + __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), + v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); + + v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20)); + v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); + + v_r10 = _mm_slli_epi32(v_r10, 2); + __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); + + v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); + v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); + v_r11 = _mm_slli_epi32(v_r11, 2); + __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1)); + } + + return x; + } + + bool haveSSE; +}; + +#else + +typedef PyrDownNoVec PyrDownVec_32s16u; + +#endif // CV_SSE4_1 + +struct PyrDownVec_32s16s +{ + PyrDownVec_32s16s() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator()(int** src, short* dst, int, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + __m128i v_delta = _mm_set1_epi32(128); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)), + v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)), + v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4)); + __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)), + v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4)); + + v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, 
v_r40), _mm_add_epi32(v_r20, v_r20)); + v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30); + + v_r10 = _mm_slli_epi32(v_r10, 2); + __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8); + + v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21)); + v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31); + v_r11 = _mm_slli_epi32(v_r11, 2); + __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1)); + } + + return x; + } + + bool haveSSE; +}; + +struct PyrUpVec_32s8u +{ + int operator()(int** src, uchar** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + uchar *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 16; x += 16 ) + { + __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), + _mm_loadu_si128((__m128i const *)(row0 + x + 4))); + __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), + _mm_loadu_si128((__m128i const *)(row1 + x + 4))); + __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), + _mm_loadu_si128((__m128i const *)(row2 + x + 4))); + + __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)), + _mm_loadu_si128((__m128i const *)(row0 + x + 12))); + v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)), + _mm_loadu_si128((__m128i const *)(row1 + x + 12))); + v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 
8)), + _mm_loadu_si128((__m128i const *)(row2 + x + 12))); + + v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6), + _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6), + _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)), + _mm_loadu_si128((__m128i const *)(row0 + x + 4))); + __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)), + _mm_loadu_si128((__m128i const *)(row1 + x + 4))); + __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)), + _mm_loadu_si128((__m128i const *)(row2 + x + 4))); + + __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1); + __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero)); + } + + return x; + } +}; + +struct PyrUpVec_32s16s +{ + int operator()(int** src, short** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + short *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + 
__m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + v_2r1 = _mm_slli_epi32(v_r1, 1); + v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6), + _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6), + _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 4; x += 4 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + + __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), + _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + } + + return x; + } +}; + +#if CV_SSE4_1 + +struct PyrUpVec_32s16u +{ + int operator()(int** src, ushort** dst, int, int width) const + { + int x = 0; + + 
if (!checkHardwareSupport(CV_CPU_SSE4_1)) + return x; + + ushort *dst0 = dst[0], *dst1 = dst[1]; + const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2]; + __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4)); + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4)); + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4)); + v_2r1 = _mm_slli_epi32(v_r1, 1); + v_4r1 = _mm_slli_epi32(v_r1, 2); + __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storeu_si128((__m128i *)(dst0 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6), + _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6))); + _mm_storeu_si128((__m128i *)(dst1 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6), + _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6))); + } + + for( ; x <= width - 4; x += 4 ) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)), + v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)), + v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x)); + __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2); + + __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1)); + __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2); + + _mm_storel_epi64((__m128i *)(dst0 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, 
v_delta), 6), v_zero)); + _mm_storel_epi64((__m128i *)(dst1 + x), + _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero)); + } + + return x; + } +}; + +#else -typedef PyrUpNoVec PyrUpVec_32s8u; -typedef PyrUpNoVec PyrUpVec_32s16s; typedef PyrUpNoVec PyrUpVec_32s16u; -typedef PyrUpNoVec PyrUpVec_32f; + +#endif // CV_SSE4_1 + +struct PyrUpVec_32f +{ + int operator()(float** src, float** dst, int, int width) const + { + int x = 0; + + if (!checkHardwareSupport(CV_CPU_SSE2)) + return x; + + const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; + float *dst0 = dst[0], *dst1 = dst[1]; + __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f), + v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f)); + + for( ; x <= width - 8; x += 8 ) + { + __m128 v_r0 = _mm_loadu_ps(row0 + x); + __m128 v_r1 = _mm_loadu_ps(row1 + x); + __m128 v_r2 = _mm_loadu_ps(row2 + x); + + _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); + _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); + + v_r0 = _mm_loadu_ps(row0 + x + 4); + v_r1 = _mm_loadu_ps(row1 + x + 4); + v_r2 = _mm_loadu_ps(row2 + x + 4); + + _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2))); + _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2))); + } + + return x; + } +}; #elif CV_NEON diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 2a69003641..7d8b263bda 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
//
 // Redistribution and use in source and binary forms, with or without modification,
@@ -713,6 +714,156 @@ struct ColumnSum :
     std::vector sum;
 };
+template<>
+struct ColumnSum<int, int> : // running sum over ksize consecutive int rows, written out as int
+        public BaseColumnFilter
+{
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
+    {
+        ksize = _ksize;
+        anchor = _anchor;
+        scale = _scale;
+        sumCount = 0;
+    }
+
+    virtual void reset() { sumCount = 0; } // makes the next operator() call rebuild SUM from zero
+
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) // src rows are read as int, dst rows are written as int; dst advances by dststep bytes per output row
+    {
+        int i;
+        int* SUM;
+        bool haveScale = scale != 1;
+        double _scale = scale;
+
+        #if CV_SSE2
+            bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+        #endif
+
+        if( width != (int)sum.size() )
+        {
+            sum.resize(width);
+            sumCount = 0;
+        }
+        SUM = &sum[0];
+        if( sumCount == 0 ) // nothing accumulated yet: fold the first ksize-1 rows into SUM
+        {
+            memset((void*)SUM, 0, width*sizeof(int));
+            for( ; sumCount < ksize - 1; sumCount++, src++ )
+            {
+                const int* Sp = (const int*)src[0];
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    }
+                }
+                #elif CV_NEON
+                for( ; i <= width - 4; i+=4 )
+                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
+                #endif
+                for( ; i < width; i++ ) // scalar tail (and the whole row when no SIMD path ran)
+                    SUM[i] += Sp[i];
+            }
+        }
+        else
+        {
+            CV_Assert( sumCount == ksize-1 );
+            src += ksize-1; // skip past the ksize-1 rows that were already folded into SUM
+        }
+
+        for( ; count--; src++ )
+        {
+            const int* Sp = (const int*)src[0]; // newest row entering the window
+            const int* Sm = (const int*)src[1-ksize]; // row ksize-1 positions back, leaving the window
+            int* D = (int*)dst;
+            if( haveScale )
+            {
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128 scale4 = _mm_set1_ps((float)_scale); // NOTE: this path scales in float, the scalar tail below multiplies in double
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+
+                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        __m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+                        _mm_storeu_si128((__m128i*)(D+i), _s0T);
+                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
+                    }
+                }
+                #elif CV_NEON
+                float32x4_t v_scale = vdupq_n_f32((float)_scale);
+                for( ; i <= width-4; i+=4 )
+                {
+                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+
+                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+                    vst1q_s32(D + i, v_s0d);
+
+                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+                }
+                #endif
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<int>(s0*_scale);
+                    SUM[i] = s0 - Sm[i]; // slide the window: drop the oldest row
+                }
+            }
+            else
+            {
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        _mm_storeu_si128((__m128i*)(D+i), _s0);
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                    }
+                }
+                #elif CV_NEON
+                for( ; i <= width-4; i+=4 )
+                {
+                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+
+                    vst1q_s32(D + i, v_s0);
+                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+                }
+                #endif
+
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = s0; // unscaled: the window sum is the output
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            dst += dststep;
+        }
+    }
+
+    double scale;
+    int sumCount; // number of rows currently folded into sum (0 or ksize-1 between calls)
+    std::vector<int> sum;
+};
+
+
 template<>
 struct ColumnSum :
     public BaseColumnFilter
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index cdef88f6c1..16c7c7ef26 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
// // Redistribution and use in source and binary forms, with or without modification, diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 34505c4ca4..176c9907f3 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst) TEST(Resize, Area_half) { const int size = 1000; - int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 }; + int types[] = { CV_8UC1, CV_8UC4, + CV_16UC1, CV_16UC4, + CV_16SC1, CV_16SC3, CV_16SC4, + CV_32FC1, CV_32FC4 }; cv::RNG rng(17); diff --git a/modules/photo/test/test_cloning.cpp b/modules/photo/test/test_cloning.cpp index 56d166205c..1f86612a4a 100644 --- a/modules/photo/test/test_cloning.cpp +++ b/modules/photo/test/test_cloning.cpp @@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression) p.y = destination.size().height/2; seamlessClone(source, destination, mask, p, result, 1); - - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; SAVE(result); @@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -111,7 +113,9 @@ 
TEST(Photo_SeamlessClone_mixed, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression) string original_path1 = folder + "source1.png"; string original_path2 = folder + "destination1.png"; string original_path3 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat destination = imread(original_path2, IMREAD_COLOR); @@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression) string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/"; string original_path1 = folder + "source1.png"; string original_path2 = folder + "mask.png"; + string reference_path = folder + "reference.png"; Mat source = imread(original_path1, IMREAD_COLOR); Mat mask = imread(original_path2, IMREAD_COLOR); @@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression) SAVE(result); - Mat reference = imread(folder + "reference.png"); + Mat reference = imread(reference_path); + ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; + double error = cvtest::norm(reference, result, NORM_L1); EXPECT_LE(error, numerical_precision); @@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression) string folder = string(cvtest::TS::ptr()->get_data_path()) + 
"cloning/Illumination_Change/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -186,7 +197,9 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
     SAVE(result);
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path; // stop before the norm comparison if the reference image did not load
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
@@ -197,6 +210,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -209,7 +223,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
     SAVE(result);
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 03877c0910..b6a832b6bb 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2998,6 +2998,12 @@ void printVersionInfo(bool useStdOut)
     std::string cpu_features;
+#if CV_POPCNT
+    if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
+#endif
+#if CV_MMX
+    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
+#endif
 #if CV_SSE
     if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
 #endif
@@ -3019,6 +3025,39 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2"; // each name is appended only when its CV_* macro is enabled and checkHardwareSupport() returns true
+#endif
+#if CV_FMA3
+    if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
+#endif
+#if CV_AVX_512F
+    if (checkHardwareSupport(CV_CPU_AVX_512F)) cpu_features += " avx-512f";
+#endif
+#if CV_AVX_512BW
+    if (checkHardwareSupport(CV_CPU_AVX_512BW)) cpu_features += " avx-512bw";
+#endif
+#if CV_AVX_512CD
+    if (checkHardwareSupport(CV_CPU_AVX_512CD)) cpu_features += " avx-512cd";
+#endif
+#if CV_AVX_512DQ
+    if (checkHardwareSupport(CV_CPU_AVX_512DQ)) cpu_features += " avx-512dq";
+#endif
+#if CV_AVX_512ER
+    if (checkHardwareSupport(CV_CPU_AVX_512ER)) cpu_features += " avx-512er";
+#endif
+#if CV_AVX_512IFMA512
+    if (checkHardwareSupport(CV_CPU_AVX_512IFMA512)) cpu_features += " avx-512ifma512";
+#endif
+#if CV_AVX_512PF
+    if (checkHardwareSupport(CV_CPU_AVX_512PF)) cpu_features += " avx-512pf";
+#endif
+#if CV_AVX_512VBMI
+    if (checkHardwareSupport(CV_CPU_AVX_512VBMI)) cpu_features += " avx-512vbmi";
+#endif
+#if CV_AVX_512VL
+    if (checkHardwareSupport(CV_CPU_AVX_512VL)) cpu_features += " avx-512vl";
+#endif
 #if CV_NEON
     if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
 #endif