diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75fcf9659b..2f4fd3323d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,11 +216,14 @@ OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_POPCNT              "Enable POPCNT instructions"                               OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_AVX2                "Enable AVX2 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_FMA3                "Enable FMA3 instructions"                                 OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR IOS) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 2f9068c60d..13559b5c8a 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -128,10 +128,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(ENABLE_SSE2)
     add_extra_compiler_option(-msse2)
   endif()
-  if (ENABLE_NEON)
+  if(ENABLE_NEON)
     add_extra_compiler_option("-mfpu=neon")
   endif()
-  if (ENABLE_VFPV3 AND NOT ENABLE_NEON)
+  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
     add_extra_compiler_option("-mfpu=vfpv3")
   endif()
 
@@ -140,6 +140,13 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(ENABLE_AVX)
       add_extra_compiler_option(-mavx)
     endif()
+    if(ENABLE_AVX2)
+      add_extra_compiler_option(-mavx2)
+
+      if(ENABLE_FMA3)
+        add_extra_compiler_option(-mfma)
+      endif()
+    endif()
 
     # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
     if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
@@ -158,6 +165,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
       if(ENABLE_SSE42)
         add_extra_compiler_option(-msse4.2)
       endif()
+
+      if(ENABLE_POPCNT)
+        add_extra_compiler_option(-mpopcnt)
+      endif()
     endif()
   endif(NOT MINGW)
 
@@ -214,7 +225,10 @@ if(MSVC)
     set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
   endif()
 
-  if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600)
+  if(ENABLE_AVX2 AND NOT MSVC_VERSION LESS 1800)
+    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX2")
+  endif()
+  if(ENABLE_AVX AND NOT MSVC_VERSION LESS 1600 AND NOT OPENCV_EXTRA_FLAGS MATCHES "/arch:")
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /arch:AVX")
   endif()
 
@@ -236,7 +250,7 @@ if(MSVC)
     endif()
   endif()
 
-  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX)
+  if(ENABLE_SSE OR ENABLE_SSE2 OR ENABLE_SSE3 OR ENABLE_SSE4_1 OR ENABLE_AVX OR ENABLE_AVX2)
     set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /Oi")
   endif()
 
@@ -308,6 +322,7 @@ if(MSVC)
   endforeach()
 
   if(NOT ENABLE_NOISY_WARNINGS)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4251") #class 'std::XXX' needs to have dll-interface to be used by clients of YYY
+    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4251) # class 'std::XXX' needs to have dll-interface to be used by clients of YYY
+    ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4324) # 'struct_name' : structure was padded due to __declspec(align())
   endif()
 endif()
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index e43fbbc951..f2acaa3fb4 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -813,4 +814,6 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
 
 } // cv
 
+#include "sse_utils.hpp"
+
 #endif //__OPENCV_CORE_BASE_HPP__
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 06894d7a5d..3fdaa6954d 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -104,17 +105,32 @@
 #endif
 
 /* CPU features and intrinsics support */
-#define CV_CPU_NONE    0
-#define CV_CPU_MMX     1
-#define CV_CPU_SSE     2
-#define CV_CPU_SSE2    3
-#define CV_CPU_SSE3    4
-#define CV_CPU_SSSE3   5
-#define CV_CPU_SSE4_1  6
-#define CV_CPU_SSE4_2  7
-#define CV_CPU_POPCNT  8
-#define CV_CPU_AVX    10
-#define CV_CPU_NEON   11
+#define CV_CPU_NONE             0
+#define CV_CPU_MMX              1
+#define CV_CPU_SSE              2
+#define CV_CPU_SSE2             3
+#define CV_CPU_SSE3             4
+#define CV_CPU_SSSE3            5
+#define CV_CPU_SSE4_1           6
+#define CV_CPU_SSE4_2           7
+#define CV_CPU_POPCNT           8
+
+#define CV_CPU_AVX              10
+#define CV_CPU_AVX2             11
+#define CV_CPU_FMA3             12
+
+#define CV_CPU_AVX_512F         13
+#define CV_CPU_AVX_512BW        14
+#define CV_CPU_AVX_512CD        15
+#define CV_CPU_AVX_512DQ        16
+#define CV_CPU_AVX_512ER        17
+#define CV_CPU_AVX_512IFMA512   18
+#define CV_CPU_AVX_512PF        19
+#define CV_CPU_AVX_512VBMI      20
+#define CV_CPU_AVX_512VL        21
+
+#define CV_CPU_NEON   100
+
 // when adding to this list remember to update the enum in core/utility.cpp
 #define CV_HARDWARE_MAX_FEATURE 255
 
@@ -123,6 +139,7 @@
 
 #if defined __SSE2__ || defined _M_X64  || (defined _M_IX86_FP && _M_IX86_FP >= 2)
 #  include <emmintrin.h>
+#  define CV_MMX 1
 #  define CV_SSE 1
 #  define CV_SSE2 1
 #  if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
@@ -141,7 +158,15 @@
 #    include <nmmintrin.h>
 #    define CV_SSE4_2 1
 #  endif
-#  if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
+#  if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
+#    ifdef _MSC_VER
+#      include <nmmintrin.h>
+#    else
+#      include <popcntintrin.h>
+#    endif
+#    define CV_POPCNT 1
+#  endif
+#  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
 #    include <immintrin.h>
@@ -152,6 +177,13 @@
 #      define __xgetbv() 0
 #    endif
 #  endif
+#  if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
+#    include <immintrin.h>
+#    define CV_AVX2 1
+#    if defined __FMA__
+#      define CV_FMA3 1
+#    endif
+#  endif
 #endif
 
 #if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
@@ -166,6 +198,12 @@
 
 #endif // __CUDACC__
 
+#ifndef CV_POPCNT
+#define CV_POPCNT 0
+#endif
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
 #ifndef CV_SSE
 #  define CV_SSE 0
 #endif
@@ -187,6 +225,40 @@
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA512
+#  define CV_AVX_512IFMA512 0
+#endif
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+
 #ifndef CV_NEON
 #  define CV_NEON 0
 #endif
diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/core/include/opencv2/core/sse_utils.hpp
new file mode 100644
index 0000000000..e0283eb3f3
--- /dev/null
+++ b/modules/core/include/opencv2/core/sse_utils.hpp
@@ -0,0 +1,645 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
+#define __OPENCV_CORE_SSE_UTILS_HPP__
+
+#ifndef __cplusplus
+#  error sse_utils.hpp header must be compiled as C++
+#endif
+
+#if CV_SSE2
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
+
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
+}
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
+
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
+}
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
+    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
+
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
+    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
+    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
+
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
+
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
+
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
+
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
+
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
+    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
+    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
+}
+
+#if CV_SSE4_1
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
+
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
+
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
+
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
+}
+
+#endif // CV_SSE4_1
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
+
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
+
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
+
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
+
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
+    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
+    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
+
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
+    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
+
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
+    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
+    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
+    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
+
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
+    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
+
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
+    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
+}
+
+#endif // CV_SSE2
+
+#endif //__OPENCV_CORE_SSE_UTILS_HPP__
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index 88989ef5cb..f89560a809 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -13,6 +13,7 @@
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -281,16 +282,30 @@ CV_EXPORTS_W int64 getCPUTickCount();
 remember to keep this list identical to the one in cvdef.h
 */
 enum CpuFeatures {
-    CPU_MMX       = 1,
-    CPU_SSE       = 2,
-    CPU_SSE2      = 3,
-    CPU_SSE3      = 4,
-    CPU_SSSE3     = 5,
-    CPU_SSE4_1    = 6,
-    CPU_SSE4_2    = 7,
-    CPU_POPCNT    = 8,
-    CPU_AVX       = 10,
-    CPU_NEON      = 11
+    CPU_MMX             = 1,
+    CPU_SSE             = 2,
+    CPU_SSE2            = 3,
+    CPU_SSE3            = 4,
+    CPU_SSSE3           = 5,
+    CPU_SSE4_1          = 6,
+    CPU_SSE4_2          = 7,
+    CPU_POPCNT          = 8,
+
+    CPU_AVX             = 10,
+    CPU_AVX2            = 11,
+    CPU_FMA3            = 12,
+
+    CPU_AVX_512F        = 13,
+    CPU_AVX_512BW       = 14,
+    CPU_AVX_512CD       = 15,
+    CPU_AVX_512DQ       = 16,
+    CPU_AVX_512ER       = 17,
+    CPU_AVX_512IFMA512  = 18,
+    CPU_AVX_512PF       = 19,
+    CPU_AVX_512VBMI     = 20,
+    CPU_AVX_512VL       = 21,
+
+    CPU_NEON            = 100
 };
 
 /** @brief Returns true if the specified feature is supported by the host hardware.
diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index 3598c8639f..c6c2a1b29f 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -242,3 +242,31 @@ PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM)
 
     SANITY_CHECK(c, 1e-8);
 }
+
+PERF_TEST_P(Size_MatType, divide, TYPICAL_MATS_CORE_ARITHM)
+{
+    Size sz = get<0>(GetParam());
+    int type = get<1>(GetParam());
+    cv::Mat a(sz, type), b(sz, type), c(sz, type);
+    double scale = 0.5;
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    TEST_CYCLE() divide(a, b, c, scale);
+
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(Size_MatType, reciprocal, TYPICAL_MATS_CORE_ARITHM)
+{
+    Size sz = get<0>(GetParam());
+    int type = get<1>(GetParam());
+    cv::Mat b(sz, type), c(sz, type);
+    double scale = 0.5;
+
+    declare.in(b, WARMUP_RNG).out(c);
+
+    TEST_CYCLE() divide(scale, b, c);
+
+    SANITY_CHECK_NOTHING();
+}
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index f881c785b3..c4de2c4bed 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -63,6 +64,10 @@ FUNCTOR_TEMPLATE(VLoadStore128);
 #if CV_SSE2
 FUNCTOR_TEMPLATE(VLoadStore64);
 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
+#if CV_AVX2
+FUNCTOR_TEMPLATE(VLoadStore256);
+FUNCTOR_TEMPLATE(VLoadStore256Aligned);
+#endif
 #endif
 
 #endif
@@ -75,17 +80,28 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
 #endif
     Op op;
 
-    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
-                        src2 += step2/sizeof(src2[0]),
-                        dst += step/sizeof(dst[0]) )
+    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
+                        src2 = (const T *)((const uchar *)src2 + step2),
+                        dst = (T *)((uchar *)dst + step) )
     {
         int x = 0;
 
 #if CV_NEON || CV_SSE2
+#if CV_AVX2
+        if( USE_AVX2 )
+        {
+            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
+            {
+                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
+                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
+                VLoadStore256<T>::store(dst + x, r0);
+            }
+        }
+#else
 #if CV_SSE2
         if( USE_SSE2 )
         {
-#endif
+#endif // CV_SSE2
             for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
             {
                 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
@@ -97,9 +113,13 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
             }
 #if CV_SSE2
         }
-#endif
-#endif
-#if CV_SSE2
+#endif // CV_SSE2
+#endif // CV_AVX2
+#endif // CV_NEON || CV_SSE2
+
+#if CV_AVX2
+        // nothing
+#elif CV_SSE2
         if( USE_SSE2 )
         {
             for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
@@ -110,6 +130,7 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
             }
         }
 #endif
+
 #if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
@@ -136,13 +157,26 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
 #endif
     Op op;
 
-    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
-        src2 += step2/sizeof(src2[0]),
-        dst += step/sizeof(dst[0]) )
+    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
+                        src2 = (const T *)((const uchar *)src2 + step2),
+                        dst = (T *)((uchar *)dst + step) )
     {
         int x = 0;
 
-#if CV_SSE2
+#if CV_AVX2
+        if( USE_AVX2 )
+        {
+            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
+            {
+                for( ; x <= sz.width - 8; x += 8 )
+                {
+                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
+                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
+                    VLoadStore256Aligned<T>::store(dst + x, r0);
+                }
+            }
+        }
+#elif CV_SSE2
         if( USE_SSE2 )
         {
             if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
@@ -158,12 +192,24 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
                 }
             }
         }
-#endif
+#endif // CV_AVX2
+
 #if CV_NEON || CV_SSE2
+#if CV_AVX2
+        if( USE_AVX2 )
+        {
+            for( ; x <= sz.width - 8; x += 8 )
+            {
+                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
+                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
+                VLoadStore256<T>::store(dst + x, r0);
+            }
+        }
+#else
 #if CV_SSE2
         if( USE_SSE2 )
         {
-#endif
+#endif // CV_SSE2
             for( ; x <= sz.width - 8; x += 8 )
             {
                 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
@@ -175,8 +221,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
             }
 #if CV_SSE2
         }
-#endif
-#endif
+#endif // CV_SSE2
+#endif // CV_AVX2
+#endif // CV_NEON || CV_SSE2
+
 #if CV_ENABLE_UNROLLED
         for( ; x <= sz.width - 4; x += 4 )
         {
@@ -204,13 +252,26 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
 #endif
     Op op;
 
-    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
-        src2 += step2/sizeof(src2[0]),
-        dst += step/sizeof(dst[0]) )
+    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
+                        src2 = (const T *)((const uchar *)src2 + step2),
+                        dst = (T *)((uchar *)dst + step) )
     {
         int x = 0;
 
-#if CV_SSE2
+#if CV_AVX2
+        if( USE_AVX2 )
+        {
+            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
+            {
+                for( ; x <= sz.width - 4; x += 4 )
+                {
+                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
+                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
+                    VLoadStore256Aligned<T>::store(dst + x, r0);
+                }
+            }
+        }
+#elif CV_SSE2
         if( USE_SSE2 )
         {
             if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
@@ -243,7 +304,141 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
     }
 }
 
-#if CV_SSE2
+#if CV_AVX2
+
+#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
+    template <>                                                                                  \
+    struct name<template_arg>{                                                                   \
+        typedef register_type reg_type;                                                          \
+        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
+        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
+    }
+
+#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
+    template <>                                                                     \
+    struct name<template_arg>{                                                      \
+        typedef register_type reg_type;                                             \
+        static reg_type load(const template_arg * p) { return load_body (p); }      \
+        static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
+    }
+
+#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
+    template<>                                                                 \
+    struct name<template_arg>                                                  \
+    {                                                                          \
+        VLoadStore256<template_arg>::reg_type operator()(                      \
+                        const VLoadStore256<template_arg>::reg_type & a,       \
+                        const VLoadStore256<template_arg>::reg_type & b) const \
+        {                                                                      \
+            body;                                                              \
+        }                                                                      \
+    }
+
+#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
+    template<>                                                                 \
+    struct name<template_arg>                                                  \
+    {                                                                          \
+        VLoadStore256<template_arg>::reg_type operator()(                      \
+                        const VLoadStore256<template_arg>::reg_type & a,       \
+                        const VLoadStore256<template_arg>::reg_type &  ) const \
+        {                                                                      \
+            body;                                                              \
+        }                                                                      \
+    }
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
+FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
+FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
+FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
+FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
+FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
+FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );
+
+FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
+FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
+FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );
+
+FUNCTOR_TEMPLATE(VAdd);
+FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));
+
+FUNCTOR_TEMPLATE(VSub);
+FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
+
+FUNCTOR_TEMPLATE(VMin);
+FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
+
+FUNCTOR_TEMPLATE(VMax);
+FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));
+
+
+static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
+                                                           0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
+static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
+                                                           0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
+
+FUNCTOR_TEMPLATE(VAbsDiff);
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
+        return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
+        __m256i d = _mm256_subs_epi8(a, b);
+        __m256i m = _mm256_cmpgt_epi8(b, a);
+        return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
+        return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
+        __m256i M = _mm256_max_epi16(a, b);
+        __m256i m = _mm256_min_epi16(a, b);
+        return _mm256_subs_epi16(M, m);
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
+        __m256i d = _mm256_sub_epi32(a, b);
+        __m256i m = _mm256_cmpgt_epi32(b, a);
+        return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
+        return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
+    );
+FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
+        return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
+    );
+
+FUNCTOR_TEMPLATE(VAnd);
+FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
+FUNCTOR_TEMPLATE(VOr);
+FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
+FUNCTOR_TEMPLATE(VXor);
+FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
+FUNCTOR_TEMPLATE(VNot);
+FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
+
+#elif CV_SSE2
 
 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
     template <>                                                                                  \
@@ -2574,6 +2769,263 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2,
     }
 }
 
+template <typename T>
+struct Div_SIMD
+{
+    int operator() (const T *, const T *, T *, int, double) const
+    {
+        return 0;
+    }
+};
+
+#if CV_SSE2
+
+#if CV_SSE4_1
+
+template <>
+struct Div_SIMD<uchar>
+{
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
+    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src1 + x)), v_zero);
+            __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
+            __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
+
+            __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
+            __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
+            __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
+            v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
+            v_src1d = _mm_cvtepi32_pd(v_src1i);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+        }
+
+        return x;
+    }
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct Div_SIMD<schar>
+{
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((const __m128i *)(src1 + x))), 8);
+            __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
+            __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
+
+            __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
+            __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
+            __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
+            v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
+            v_src1d = _mm_cvtepi32_pd(v_src1i);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct Div_SIMD<ushort>
+{
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
+    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128i v_src1i = _mm_unpacklo_epi16(v_src1, v_zero);
+            __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
+            __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src1i = _mm_unpackhi_epi16(v_src1, v_zero);
+            v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
+            v_src1d = _mm_cvtepi32_pd(v_src1i);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
+        }
+
+        return x;
+    }
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct Div_SIMD<short>
+{
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128i v_src1i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16);
+            __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
+            __m128d v_src1d = _mm_cvtepi32_pd(v_src1i);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src1i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16);
+            v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
+            v_src1d = _mm_cvtepi32_pd(v_src1i);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1i, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Div_SIMD<int>
+{
+    bool haveSIMD;
+    Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128d v_src1d = _mm_cvtepi32_pd(v_src1);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+
+            v_src1d = _mm_cvtepi32_pd(_mm_srli_si128(v_src1, 8));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(_mm_mul_pd(v_src1d, v_scale), v_src2d));
+
+            __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+            __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
+        }
+
+        return x;
+    }
+};
+
+#endif
+
 template<typename T> static void
 div_( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, Size size, double scale )
@@ -2582,9 +3034,11 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     step2 /= sizeof(src2[0]);
     step /= sizeof(dst[0]);
 
+    Div_SIMD<T> vop;
+
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
-        int i = 0;
+        int i = vop(src1, src2, dst, size.width, scale);
         #if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
@@ -2621,6 +3075,232 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     }
 }
 
+template <typename T>
+struct Recip_SIMD
+{
+    int operator() (const T *, T *, int, double) const
+    {
+        return 0;
+    }
+};
+
+#if CV_SSE2
+
+#if CV_SSE4_1
+
+template <>
+struct Recip_SIMD<uchar>
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
+    int operator() (const uchar * src2, uchar * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
+            __m128i v_src2 = _mm_unpacklo_epi8(_v_src2, v_zero);
+
+            __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+        }
+
+        return x;
+    }
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct Recip_SIMD<schar>
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const schar * src2, schar * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i _v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
+            __m128i v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _v_src2), 8);
+
+            __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi8(_v_src2, v_zero);
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst_0, v_dst_1), v_zero)));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct Recip_SIMD<ushort>
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
+    int operator() (const ushort * src2, ushort * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128i v_src2i = _mm_unpacklo_epi16(v_src2, v_zero);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src2i = _mm_unpackhi_epi16(v_src2, v_zero);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packus_epi32(v_dst_0, v_dst_1)));
+        }
+
+        return x;
+    }
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct Recip_SIMD<short>
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const short * src2, short * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128i v_src2i = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16);
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2i);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            v_src2i = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16);
+            v_src2d = _mm_cvtepi32_pd(v_src2i);
+            v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2i, 8));
+            v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+            __m128i v_dst_1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+
+            __m128i v_mask = _mm_cmpeq_epi16(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, _mm_packs_epi32(v_dst_0, v_dst_1)));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Recip_SIMD<int>
+{
+    bool haveSIMD;
+    Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2); }
+
+    int operator() (const int * src2, int * dst, int width, double scale) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale);
+        __m128i v_zero = _mm_setzero_si128();
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128d v_src2d = _mm_cvtepi32_pd(v_src2);
+            __m128i v_dst0 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+
+            v_src2d = _mm_cvtepi32_pd(_mm_srli_si128(v_src2, 8));
+            __m128i v_dst1 = _mm_cvtpd_epi32(_mm_div_pd(v_scale, v_src2d));
+
+            __m128i v_dst = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(v_dst0), _mm_castsi128_ps(v_dst1)));
+            __m128i v_mask = _mm_cmpeq_epi32(v_src2, v_zero);
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(v_mask, v_dst));
+        }
+
+        return x;
+    }
+};
+
+#endif
+
 template<typename T> static void
 recip_( const T*, size_t, const T* src2, size_t step2,
         T* dst, size_t step, Size size, double scale )
@@ -2628,9 +3308,11 @@ recip_( const T*, size_t, const T* src2, size_t step2,
     step2 /= sizeof(src2[0]);
     step /= sizeof(dst[0]);
 
+    Recip_SIMD<T> vop;
+
     for( ; size.height--; src2 += step2, dst += step )
     {
-        int i = 0;
+        int i = vop(src2, dst, size.width, scale);
         #if CV_ENABLE_UNROLLED
         for( ; i <= size.width - 4; i += 4 )
         {
@@ -3564,6 +4246,130 @@ struct Cmp_SIMD<float>
     uint8x8_t v_mask;
 };
 
+#elif CV_SSE2
+
+template <>
+struct Cmp_SIMD<schar>
+{
+    explicit Cmp_SIMD(int code_) :
+        code(code_)
+    {
+        CV_Assert(code == CMP_GT || code == CMP_LE ||
+                  code == CMP_EQ || code == CMP_NE);
+
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
+        v_mask = _mm_set1_epi8(-1);
+    }
+
+    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        if (code == CMP_GT)
+            for ( ; x <= width - 16; x += 16)
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
+        else if (code == CMP_LE)
+            for ( ; x <= width - 16; x += 16)
+            {
+                __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
+            }
+        else if (code == CMP_EQ)
+            for ( ; x <= width - 16; x += 16)
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
+        else if (code == CMP_NE)
+            for ( ; x <= width - 16; x += 16)
+            {
+                __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
+            }
+
+        return x;
+    }
+
+    int code;
+    __m128i v_mask;
+    bool haveSSE;
+};
+
+template <>
+struct Cmp_SIMD<int>
+{
+    explicit Cmp_SIMD(int code_) :
+        code(code_)
+    {
+        CV_Assert(code == CMP_GT || code == CMP_LE ||
+                  code == CMP_EQ || code == CMP_NE);
+
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
+        v_mask = _mm_set1_epi32(0xffffffff);
+    }
+
+    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        if (code == CMP_GT)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+            }
+        else if (code == CMP_LE)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
+            }
+        else if (code == CMP_EQ)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+            }
+        else if (code == CMP_NE)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
+            }
+
+        return x;
+    }
+
+    int code;
+    __m128i v_mask;
+    bool haveSSE;
+};
+
 #endif
 
 template<typename T> static void
@@ -3676,7 +4482,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
         {
             int x =0;
             #if CV_SSE2
-            if( USE_SSE2 ){
+            if( USE_SSE2 )
+            {
                 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
                 __m128i c128 = _mm_set1_epi8 (-128);
                 for( ; x <= size.width - 16; x += 16 )
@@ -3692,7 +4499,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
 
                 }
             }
-           #elif CV_NEON
+            #elif CV_NEON
             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
 
             for( ; x <= size.width - 16; x += 16 )
@@ -3714,7 +4521,8 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
         {
             int x = 0;
             #if CV_SSE2
-            if( USE_SSE2 ){
+            if( USE_SSE2 )
+            {
                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
                 for( ; x <= size.width - 16; x += 16 )
                 {
@@ -3724,7 +4532,7 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
                     _mm_storeu_si128((__m128i*)(dst + x), r00);
                 }
             }
-           #elif CV_NEON
+            #elif CV_NEON
             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
 
             for( ; x <= size.width - 16; x += 16 )
@@ -3804,7 +4612,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
         {
             int x =0;
             #if CV_SSE2
-            if( USE_SSE2){//
+            if( USE_SSE2)
+            {
                 __m128i m128 =  code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
                 for( ; x <= size.width - 16; x += 16 )
                 {
@@ -3828,7 +4637,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                     x += 8;
                 }
             }
-           #elif CV_NEON
+            #elif CV_NEON
             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
 
             for( ; x <= size.width - 16; x += 16 )
@@ -3843,8 +4652,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 
                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
             }
-
-           #endif
+            #endif
 
             for( ; x < size.width; x++ ){
                  dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
@@ -3858,7 +4666,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
         {
             int x = 0;
             #if CV_SSE2
-            if( USE_SSE2 ){
+            if( USE_SSE2 )
+            {
                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
                 for( ; x <= size.width - 16; x += 16 )
                 {
@@ -3882,7 +4691,7 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                     x += 8;
                 }
             }
-           #elif CV_NEON
+            #elif CV_NEON
             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
 
             for( ; x <= size.width - 16; x += 16 )
@@ -3897,8 +4706,8 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
 
                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
             }
-           #endif
-           for( ; x < size.width; x++ )
+            #endif
+            for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
         }
     }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 829b984c9f..090acf5508 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -62,8 +63,11 @@ template<typename T> struct VSplit4;
 
 #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
-        void operator()(const data_type* src, data_type* dst0, data_type* dst1){  \
+    struct name<data_type>                                                        \
+    {                                                                             \
+        void operator()(const data_type* src, data_type* dst0,                    \
+                        data_type* dst1) const                                    \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -72,9 +76,11 @@ template<typename T> struct VSplit4;
 
 #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2){                                         \
+                        data_type* dst2) const                                    \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -84,9 +90,11 @@ template<typename T> struct VSplit4;
 
 #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
     template<>                                                                    \
-    struct name<data_type>{                                                       \
+    struct name<data_type>                                                        \
+    {                                                                             \
         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
-                        data_type* dst2, data_type* dst3){                        \
+                        data_type* dst2, data_type* dst3) const                   \
+        {                                                                         \
             reg_type r = load_func(src);                                          \
             store_func(dst0, r.val[0]);                                           \
             store_func(dst1, r.val[1]);                                           \
@@ -96,28 +104,174 @@ template<typename T> struct VSplit4;
     }
 
 SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
-SPLIT2_KERNEL_TEMPLATE(VSplit2, schar ,   int8x16x2_t, vld2q_s8 , vst1q_s8 );
 SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, short ,   int16x8x2_t, vld2q_s16, vst1q_s16);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
-SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
 SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );
 
 SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
-SPLIT3_KERNEL_TEMPLATE(VSplit3, schar ,   int8x16x3_t, vld3q_s8 , vst1q_s8 );
 SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, short ,   int16x8x3_t, vld3q_s16, vst1q_s16);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
-SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
 SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );
 
 SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
-SPLIT4_KERNEL_TEMPLATE(VSplit4, schar ,   int8x16x4_t, vld4q_s8 , vst1q_s8 );
 SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, short ,   int16x8x4_t, vld4q_s16, vst1q_s16);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
-SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
 SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
+
+#elif CV_SSE2
+
+template <typename T>
+struct VSplit2
+{
+    VSplit2() : support(false) { }
+    void operator()(const T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit3
+{
+    VSplit3() : support(false) { }
+    void operator()(const T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VSplit4
+{
+    VSplit4() : support(false) { }
+    void operator()(const T *, T *, T *, T *, T *) const { }
+
+    bool support;
+};
+
+#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit2<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit2()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit3<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit3()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src,                                                 \
+                    data_type * dst0, data_type * dst1, data_type * dst2) const            \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
+                         v_src3, v_src4, v_src5);                                          \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
+template <>                                                                                \
+struct VSplit4<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VSplit4()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
+                    data_type * dst2, data_type * dst3) const                              \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
+        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
+        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
+        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
+        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
+        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
+        reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
+        reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
+                                                                                           \
+        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
+                         v_src4, v_src5, v_src6, v_src7);                                  \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
+        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
+        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
+        _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
+        _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
+SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
+SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
+SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
+
 #endif
 
 template<typename T> static void
@@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vsplit(src + j, dst0 + i, dst1 + i);
         }
+#elif CV_SSE2
+        if (cn == 2)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VSplit2<T> vsplit;
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i);
+            }
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn )
             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
         }
+#elif CV_SSE2
+        if (cn == 3)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VSplit3<T> vsplit;
+
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
+            }
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn )
             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
         }
+#elif CV_SSE2
+        if (cn == 4)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VSplit4<T> vsplit;
+            if (vsplit.support)
+            {
+                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
+                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
+            }
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -265,28 +459,177 @@ template<typename T> struct VMerge4;
     }
 
 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
-MERGE2_KERNEL_TEMPLATE(VMerge2, schar ,   int8x16x2_t, vld1q_s8 , vst2q_s8 );
 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
-MERGE2_KERNEL_TEMPLATE(VMerge2, short ,   int16x8x2_t, vld1q_s16, vst2q_s16);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
-MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );
 
 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
-MERGE3_KERNEL_TEMPLATE(VMerge3, schar ,   int8x16x3_t, vld1q_s8 , vst3q_s8 );
 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
-MERGE3_KERNEL_TEMPLATE(VMerge3, short ,   int16x8x3_t, vld1q_s16, vst3q_s16);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
-MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );
 
 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
-MERGE4_KERNEL_TEMPLATE(VMerge4, schar ,   int8x16x4_t, vld1q_s8 , vst4q_s8 );
 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
-MERGE4_KERNEL_TEMPLATE(VMerge4, short ,   int16x8x4_t, vld1q_s16, vst4q_s16);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
-MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
+
+#elif CV_SSE2
+
+template <typename T>
+struct VMerge2
+{
+    VMerge2() : support(false) { }
+    void operator()(const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VMerge3
+{
+    VMerge3() : support(false) { }
+    void operator()(const T *, const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+template <typename T>
+struct VMerge4
+{
+    VMerge4() : support(false) { }
+    void operator()(const T *, const T *, const T *, const T *, T *) const { }
+
+    bool support;
+};
+
+#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
+template <>                                                                                \
+struct VMerge2<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VMerge2()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(se);                                                \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src0, const data_type * src1,                        \
+                    data_type * dst) const                                                 \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
+                                                                                           \
+        _mm_interleave(v_src0, v_src1, v_src2, v_src3);                                    \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
+template <>                                                                                \
+struct VMerge3<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VMerge3()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(se);                                                \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
+                    data_type * dst) const                                                 \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
+        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
+        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
+                                                                                           \
+        _mm_interleave(v_src0, v_src1, v_src2,                                             \
+                       v_src3, v_src4, v_src5);                                            \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
+template <>                                                                                \
+struct VMerge4<data_type>                                                                  \
+{                                                                                          \
+    enum                                                                                   \
+    {                                                                                      \
+        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
+    };                                                                                     \
+                                                                                           \
+    VMerge4()                                                                              \
+    {                                                                                      \
+        support = checkHardwareSupport(se);                                                \
+    }                                                                                      \
+                                                                                           \
+    void operator()(const data_type * src0, const data_type * src1,                        \
+                    const data_type * src2, const data_type * src3,                        \
+                    data_type * dst) const                                                 \
+    {                                                                                      \
+        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
+        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
+        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
+        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
+        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
+        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
+        reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                   \
+        reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));    \
+                                                                                           \
+        _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                     \
+                       v_src4, v_src5, v_src6, v_src7);                                    \
+                                                                                           \
+        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);                \
+        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);                \
+    }                                                                                      \
+                                                                                           \
+    bool support;                                                                          \
+}
+
+MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
+MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
+MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
+
+#if CV_SSE4_1
+MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
+MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
+MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
+#endif
+
+MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
+MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
+MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
+
 #endif
 
 template<typename T> static void
@@ -314,6 +657,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 2)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VMerge2<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -335,6 +689,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, src2 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 3)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VMerge3<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -357,6 +722,17 @@ merge_( const T** src, T* dst, int len, int cn )
             for( ; i < len - inc_i; i += inc_i, j += inc_j)
                 vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
         }
+#elif CV_SSE2
+        if(cn == 4)
+        {
+            int inc_i = 32/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VMerge4<T> vmerge;
+            if (vmerge.support)
+                for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
+        }
 #endif
         for( ; i < len; i++, j += cn )
         {
@@ -1123,6 +1499,48 @@ struct cvtScaleAbs_SIMD<uchar, uchar, float>
     }
 };
 
+template <>
+struct cvtScaleAbs_SIMD<schar, uchar, float>
+{
+    int operator () (const schar * src, uchar * dst, int width,
+                     float scale, float shift) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
+                v_zero_f = _mm_setzero_ps();
+            __m128i v_zero_i = _mm_setzero_si128();
+
+            for ( ; x <= width - 16; x += 16)
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
+                __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
+                        v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
+                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
+                    _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
+                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
+                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
+                    _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
+                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
+                __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
+                    _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
+                v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
+                __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
+                    _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
+                v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
+
+                __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
+                                                   _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
+                _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
+            }
+        }
+
+        return x;
+    }
+};
+
 template <>
 struct cvtScaleAbs_SIMD<ushort, uchar, float>
 {
@@ -1242,6 +1660,44 @@ struct cvtScaleAbs_SIMD<float, uchar, float>
     }
 };
 
+template <>
+struct cvtScaleAbs_SIMD<double, uchar, float>
+{
+    int operator () (const double * src, uchar * dst, int width,
+                     float scale, float shift) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
+                v_zero_f = _mm_setzero_ps();
+            __m128i v_zero_i = _mm_setzero_si128();
+
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
+                                              _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
+                __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
+                                              _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
+
+                __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
+                v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
+
+                __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
+                v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
+
+                __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
+                                                  _mm_cvtps_epi32(v_dst2));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
+            }
+        }
+
+        return x;
+    }
+};
+
 #elif CV_NEON
 
 template <>
@@ -1489,7 +1945,1582 @@ struct cvtScale_SIMD
     }
 };
 
-#if CV_NEON
+#if CV_SSE2
+
+// from uchar
+
+template <>
+struct cvtScale_SIMD<uchar, uchar, float>
+{
+    int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<uchar, schar, float>
+{
+    int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<uchar, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<uchar, short, float>
+{
+    int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<uchar, int, float>
+{
+    int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
+            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<uchar, float, float>
+{
+    int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_ps(dst + x, v_dst_0);
+            _mm_storeu_ps(dst + x + 4, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<uchar, double, double>
+{
+    int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
+
+            __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+
+            v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
+            v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x + 4, v_dst_0);
+            _mm_storeu_pd(dst + x + 6, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from schar
+
+template <>
+struct cvtScale_SIMD<schar, uchar, float>
+{
+    int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<schar, schar, float>
+{
+    int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<schar, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<schar, short, float>
+{
+    int operator () (const schar * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<schar, int, float>
+{
+    int operator () (const schar * src, int * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
+            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<schar, float, float>
+{
+    int operator () (const schar * src, float * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_ps(dst + x, v_dst_0);
+            _mm_storeu_ps(dst + x + 4, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<schar, double, double>
+{
+    int operator () (const schar * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
+            v_src = _mm_srai_epi16(v_src, 8);
+
+            __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+
+            v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
+            v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x + 4, v_dst_0);
+            _mm_storeu_pd(dst + x + 6, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from ushort
+
+template <>
+struct cvtScale_SIMD<ushort, uchar, float>
+{
+    int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<ushort, schar, float>
+{
+    int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<ushort, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<ushort, short, float>
+{
+    int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<ushort, int, float>
+{
+    int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
+            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<ushort, float, float>
+{
+    int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_ps(dst + x, v_dst_0);
+            _mm_storeu_ps(dst + x + 4, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<ushort, double, double>
+{
+    int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+
+            __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+
+            v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
+            v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x + 4, v_dst_0);
+            _mm_storeu_pd(dst + x + 6, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from short
+
+template <>
+struct cvtScale_SIMD<short, uchar, float>
+{
+    int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<short, schar, float>
+{
+    int operator () (const short * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<short, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<short, short, float>
+{
+    int operator () (const short * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<short, int, float>
+{
+    int operator () (const short * src, int * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
+            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<short, float, float>
+{
+    int operator () (const short * src, float * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
+
+            _mm_storeu_ps(dst + x, v_dst_0);
+            _mm_storeu_ps(dst + x + 4, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<short, double, double>
+{
+    int operator () (const short * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+
+            __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+
+            v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
+            v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
+            v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
+            _mm_storeu_pd(dst + x + 4, v_dst_0);
+            _mm_storeu_pd(dst + x + 6, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from int
+
+template <>
+struct cvtScale_SIMD<int, uchar, float>
+{
+    int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<int, schar, float>
+{
+    int operator () (const int * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<int, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<int, short, float>
+{
+    int operator () (const int * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<int, int, double>
+{
+    int operator () (const int * src, int * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            v_src = _mm_srli_si128(v_src, 8);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
+                                         _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<int, float, double>
+{
+    int operator () (const int * src, float * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            v_src = _mm_srli_si128(v_src, 8);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
+                                                 _mm_cvtpd_ps(v_dst_1)));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<int, double, double>
+{
+    int operator () (const int * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            v_src = _mm_srli_si128(v_src, 8);
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
+
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from float
+
+template <>
+struct cvtScale_SIMD<float, uchar, float>
+{
+    int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_ps(src + x + 4);
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<float, schar, float>
+{
+    int operator () (const float * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_ps(src + x + 4);
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<float, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_ps(src + x + 4);
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<float, short, float>
+{
+    int operator () (const float * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_ps(src + x + 4);
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<float, int, float>
+{
+    int operator () (const float * src, int * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_ps(src + x + 4);
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
+            _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<float, float, float>
+{
+    int operator () (const float * src, float * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+            _mm_storeu_ps(dst + x, v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<float, double, double>
+{
+    int operator () (const float * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128 v_src = _mm_loadu_ps(src + x);
+            __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
+            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
+            __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
+
+            _mm_storeu_pd(dst + x, v_dst_0);
+            _mm_storeu_pd(dst + x + 2, v_dst_1);
+        }
+
+        return x;
+    }
+};
+
+// from double
+
+template <>
+struct cvtScale_SIMD<double, uchar, float>
+{
+    int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
+                                         _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
+                                  _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<double, schar, float>
+{
+    int operator () (const double * src, schar * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128i v_zero = _mm_setzero_si128();
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
+                                         _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
+                                  _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct cvtScale_SIMD<double, ushort, float>
+{
+    cvtScale_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
+                                         _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
+                                  _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
+                                             _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#endif
+
+template <>
+struct cvtScale_SIMD<double, short, float>
+{
+    int operator () (const double * src, short * dst, int width, float scale, float shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
+                                         _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
+            __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
+                                  _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
+            __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
+                                            _mm_cvtps_epi32(v_dst_1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<double, int, double>
+{
+    int operator () (const double * src, int * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128d v_src = _mm_loadu_pd(src + x);
+            __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_pd(src + x + 2);
+            __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
+
+            __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
+                                         _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<double, float, double>
+{
+    int operator () (const double * src, float * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128d v_src = _mm_loadu_pd(src + x);
+            __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
+
+            v_src = _mm_loadu_pd(src + x + 2);
+            __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
+
+            __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
+                                         _mm_cvtpd_ps(v_dst1));
+
+            _mm_storeu_ps(dst + x, v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct cvtScale_SIMD<double, double, double>
+{
+    int operator () (const double * src, double * dst, int width, double scale, double shift) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
+
+        for ( ; x <= width - 2; x += 2)
+        {
+            __m128d v_src = _mm_loadu_pd(src + x);
+            __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
+            _mm_storeu_pd(dst + x, v_dst);
+        }
+
+        return x;
+    }
+};
+
+#elif CV_NEON
 
 // from uchar
 
@@ -2294,26 +4325,44 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
     {
         int x = 0;
 
-         #if CV_SSE2
-            if(USE_SSE2)//~5X
-            {
-                __m128 scale128 = _mm_set1_ps (scale);
-                __m128 shift128 = _mm_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
-                    __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
-                    __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
-                    __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
-                    rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
-                    rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
-                    r0 = _mm_cvtps_epi32(rf0);
-                    r1 = _mm_cvtps_epi32(rf1);
+        #if CV_AVX2
+        if (USE_AVX2)
+        {
+            __m256 scale256 = _mm256_set1_ps(scale);
+            __m256 shift256 = _mm256_set1_ps(shift);
+            const int shuffle = 0xD8;
 
-                    _mm_storeu_si128((__m128i*)(dst + x), r0);
-                    _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
-                }
+            for ( ; x <= size.width - 16; x += 16)
+            {
+                __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
+                v_src = _mm256_permute4x64_epi64(v_src, shuffle);
+                __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
+                __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
+                __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
+                __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
+                _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
+                _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
             }
+        }
+        #endif
+        #if CV_SSE2
+        if (USE_SSE2)//~5X
+        {
+            __m128 scale128 = _mm_set1_ps (scale);
+            __m128 shift128 = _mm_set1_ps (shift);
+            for(; x <= size.width - 8; x += 8 )
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
+
+                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
+                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
+                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
+                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
+
+                _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
+                _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
+            }
+        }
         #elif CV_NEON
         float32x4_t v_shift = vdupq_n_f32(shift);
         for(; x <= size.width - 8; x += 8 )
@@ -2330,24 +4379,6 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
         }
         #endif
 
-        //We will wait Haswell
-        /*
-        #if CV_AVX
-            if(USE_AVX)//2X - bad variant
-            {
-                ////TODO:AVX implementation (optimization?) required
-                __m256 scale256 = _mm256_set1_ps (scale);
-                __m256 shift256 = _mm256_set1_ps (shift);
-                for(; x <= size.width - 8; x += 8 )
-                {
-                    __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
-                    __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
-                    __m256i res = _mm256_cvtps_epi32(r0);
-                    _mm256_storeu_si256 ((__m256i*)(dst+x), res);
-                }
-            }
-        #endif*/
-
         for(; x < size.width; x++ )
             dst[x] = saturate_cast<int>(src[x]*scale + shift);
     }
@@ -2362,7 +4393,180 @@ struct Cvt_SIMD
     }
 };
 
-#if CV_NEON
+#if CV_SSE2
+
+// from double
+
+template <>
+struct Cvt_SIMD<double, uchar>
+{
+    int operator() (const double * src, uchar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+            __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
+            __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
+
+            v_src0 = _mm_movelh_ps(v_src0, v_src1);
+            v_src1 = _mm_movelh_ps(v_src2, v_src3);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                            _mm_cvtps_epi32(v_src1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Cvt_SIMD<double, schar>
+{
+    int operator() (const double * src, schar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+            __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
+            __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
+
+            v_src0 = _mm_movelh_ps(v_src0, v_src1);
+            v_src1 = _mm_movelh_ps(v_src2, v_src3);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                            _mm_cvtps_epi32(v_src1));
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+template <>
+struct Cvt_SIMD<double, ushort>
+{
+    bool haveSIMD;
+    Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
+
+    int operator() (const double * src, ushort * dst, int width) const
+    {
+        int x = 0;
+
+        if (!haveSIMD)
+            return x;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+            __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
+            __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
+
+            v_src0 = _mm_movelh_ps(v_src0, v_src1);
+            v_src1 = _mm_movelh_ps(v_src2, v_src3);
+
+            __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
+                                             _mm_cvtps_epi32(v_src1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct Cvt_SIMD<double, short>
+{
+    int operator() (const double * src, short * dst, int width) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+            __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
+            __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
+
+            v_src0 = _mm_movelh_ps(v_src0, v_src1);
+            v_src1 = _mm_movelh_ps(v_src2, v_src3);
+
+            __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                            _mm_cvtps_epi32(v_src1));
+            _mm_storeu_si128((__m128i *)(dst + x), v_dst);
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Cvt_SIMD<double, int>
+{
+    int operator() (const double * src, int * dst, int width) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+            v_src0 = _mm_movelh_ps(v_src0, v_src1);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Cvt_SIMD<double, float>
+{
+    int operator() (const double * src, float * dst, int width) const
+    {
+        int x = 0;
+
+        if (!USE_SSE2)
+            return x;
+
+        for ( ; x <= width - 4; x += 4)
+        {
+            __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
+            __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
+
+            _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
+        }
+
+        return x;
+    }
+};
+
+
+#elif CV_NEON
 
 // from uchar
 
@@ -2931,8 +5135,9 @@ cvt_<float, short>( const float* src, size_t sstep,
     {
         int x = 0;
         #if   CV_SSE2
-        if(USE_SSE2){
-              for( ; x <= size.width - 8; x += 8 )
+        if(USE_SSE2)
+        {
+            for( ; x <= size.width - 8; x += 8 )
             {
                 __m128 src128 = _mm_loadu_ps (src + x);
                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 301ea80a1f..fe8ffd7718 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -11,6 +11,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 1c045f3faa..13ada1d1d6 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -593,14 +594,46 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
             {
                 const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
                 double *angle = (double*)ptrs[2];
-                for( k = 0; k < len; k++ )
+                k = 0;
+
+#if CV_SSE2
+                if (USE_SSE2)
+                {
+                    for ( ; k <= len - 4; k += 4)
+                    {
+                        __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
+                                                      _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
+                        __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
+                                                      _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
+
+                        _mm_storeu_ps(buf[0] + k, v_dst0);
+                        _mm_storeu_ps(buf[1] + k, v_dst1);
+                    }
+                }
+#endif
+
+                for( ; k < len; k++ )
                 {
                     buf[0][k] = (float)x[k];
                     buf[1][k] = (float)y[k];
                 }
 
                 FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
-                for( k = 0; k < len; k++ )
+                k = 0;
+
+#if CV_SSE2
+                if (USE_SSE2)
+                {
+                    for ( ; k <= len - 4; k += 4)
+                    {
+                        __m128 v_src = _mm_loadu_ps(buf[0] + k);
+                        _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
+                        _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
+                    }
+                }
+#endif
+
+                for( ; k < len; k++ )
                     angle[k] = buf[0][k];
             }
             ptrs[0] += len*esz1;
@@ -698,14 +731,46 @@ void cartToPolar( InputArray src1, InputArray src2,
                 double *angle = (double*)ptrs[3];
 
                 Magnitude_64f(x, y, (double*)ptrs[2], len);
-                for( k = 0; k < len; k++ )
+                k = 0;
+
+#if CV_SSE2
+                if (USE_SSE2)
+                {
+                    for ( ; k <= len - 4; k += 4)
+                    {
+                        __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
+                                                      _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
+                        __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
+                                                      _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
+
+                        _mm_storeu_ps(buf[0] + k, v_dst0);
+                        _mm_storeu_ps(buf[1] + k, v_dst1);
+                    }
+                }
+#endif
+
+                for( ; k < len; k++ )
                 {
                     buf[0][k] = (float)x[k];
                     buf[1][k] = (float)y[k];
                 }
 
                 FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees );
-                for( k = 0; k < len; k++ )
+                k = 0;
+
+#if CV_SSE2
+                if (USE_SSE2)
+                {
+                    for ( ; k <= len - 4; k += 4)
+                    {
+                        __m128 v_src = _mm_loadu_ps(buf[0] + k);
+                        _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
+                        _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
+                    }
+                }
+#endif
+
+                for( ; k < len; k++ )
                     angle[k] = buf[0][k];
             }
             ptrs[0] += len*esz1;
@@ -771,14 +836,77 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
     /*static const double cos_a2 =  1;*/
 
     double k1;
-    int i;
+    int i = 0;
 
     if( !angle_in_degrees )
         k1 = N/(2*CV_PI);
     else
         k1 = N/360.;
 
-    for( i = 0; i < len; i++ )
+#if CV_AVX2
+    if (USE_AVX2)
+    {
+        __m128d v_k1 = _mm_set1_pd(k1);
+        __m128d v_1 = _mm_set1_pd(1);
+        __m128i v_N1 = _mm_set1_epi32(N - 1);
+        __m128i v_N4 = _mm_set1_epi32(N >> 2);
+        __m128d v_sin_a0 = _mm_set1_pd(sin_a0);
+        __m128d v_sin_a2 = _mm_set1_pd(sin_a2);
+        __m128d v_cos_a0 = _mm_set1_pd(cos_a0);
+
+        for ( ; i <= len - 4; i += 4)
+        {
+            __m128 v_angle = _mm_loadu_ps(angle + i);
+
+            // 0-1
+            __m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1);
+            __m128i v_it = _mm_cvtpd_epi32(v_t);
+            v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
+
+            __m128i v_sin_idx = _mm_and_si128(v_it, v_N1);
+            __m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
+
+            __m128d v_t2 = _mm_mul_pd(v_t, v_t);
+            __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
+            __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
+
+            __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
+
+            __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
+                                             _mm_mul_pd(v_cos_a, v_sin_b));
+            __m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
+                                             _mm_mul_pd(v_sin_a, v_sin_b));
+
+            // 2-3
+            v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1);
+            v_it = _mm_cvtpd_epi32(v_t);
+            v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it));
+
+            v_sin_idx = _mm_and_si128(v_it, v_N1);
+            v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1);
+
+            v_t2 = _mm_mul_pd(v_t, v_t);
+            v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t);
+            v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1);
+
+            v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8);
+            v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8);
+
+            __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b),
+                                             _mm_mul_pd(v_cos_a, v_sin_b));
+            __m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b),
+                                             _mm_mul_pd(v_sin_a, v_sin_b));
+
+            _mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0),
+                                                    _mm_cvtpd_ps(v_sin_val_1)));
+            _mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0),
+                                                    _mm_cvtpd_ps(v_cos_val_1)));
+        }
+    }
+#endif
+
+    for( ; i < len; i++ )
     {
         double t = angle[i]*k1;
         int it = cvRound(t);
@@ -914,6 +1042,16 @@ void polarToCart( InputArray src1, InputArray src2,
                         vst1q_f32(x + k, vmulq_f32(vld1q_f32(x + k), v_m));
                         vst1q_f32(y + k, vmulq_f32(vld1q_f32(y + k), v_m));
                     }
+                    #elif CV_SSE2
+                    if (USE_SSE2)
+                    {
+                        for( ; k <= len - 4; k += 4 )
+                        {
+                            __m128 v_m = _mm_loadu_ps(mag + k);
+                            _mm_storeu_ps(x + k, _mm_mul_ps(_mm_loadu_ps(x + k), v_m));
+                            _mm_storeu_ps(y + k, _mm_mul_ps(_mm_loadu_ps(y + k), v_m));
+                        }
+                    }
                     #endif
 
                     for( ; k < len; k++ )
@@ -939,10 +1077,10 @@ void polarToCart( InputArray src1, InputArray src2,
                         x[k] = buf[0][k]*m; y[k] = buf[1][k]*m;
                     }
                 else
-                    for( k = 0; k < len; k++ )
-                    {
-                        x[k] = buf[0][k]; y[k] = buf[1][k];
-                    }
+                {
+                    std::memcpy(x, buf[0], sizeof(float) * len);
+                    std::memcpy(y, buf[1], sizeof(float) * len);
+                }
             }
 
             if( ptrs[0] )
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index b2f36b3292..6c8bad2444 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index ef154400e2..0f85cc5568 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -192,6 +192,7 @@ struct NoVec
 extern volatile bool USE_SSE2;
 extern volatile bool USE_SSE4_2;
 extern volatile bool USE_AVX;
+extern volatile bool USE_AVX2;
 
 enum { BLOCK_SIZE = 1024 };
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index ca56a7c966..87c423dc3b 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -72,7 +73,114 @@ struct Sum_SIMD
     }
 };
 
-#if CV_NEON
+#if CV_SSE2
+
+template <>
+struct Sum_SIMD<schar, int>
+{
+    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;
+
+        for ( ; x <= len - 16; x += 16)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
+            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
+
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+
+            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+        }
+
+        for ( ; x <= len - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
+
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+        }
+
+        int CV_DECL_ALIGNED(16) ar[4];
+        _mm_store_si128((__m128i*)ar, v_sum);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+                dst[j] += ar[j + i];
+
+        return x / cn;
+    }
+};
+
+template <>
+struct Sum_SIMD<int, double>
+{
+    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
+
+        for ( ; x <= len - 4; x += 4)
+        {
+            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
+            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
+            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
+        }
+
+        double CV_DECL_ALIGNED(16) ar[4];
+        _mm_store_pd(ar, v_sum0);
+        _mm_store_pd(ar + 2, v_sum1);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+                dst[j] += ar[j + i];
+
+        return x / cn;
+    }
+};
+
+template <>
+struct Sum_SIMD<float, double>
+{
+    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
+
+        for ( ; x <= len - 4; x += 4)
+        {
+            __m128 v_src = _mm_loadu_ps(src0 + x);
+            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
+            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
+            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
+        }
+
+        double CV_DECL_ALIGNED(16) ar[4];
+        _mm_store_pd(ar, v_sum0);
+        _mm_store_pd(ar + 2, v_sum1);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+                dst[j] += ar[j + i];
+
+        return x / cn;
+    }
+};
+
+
+#elif CV_NEON
 
 template <>
 struct Sum_SIMD<uchar, int>
@@ -396,6 +504,38 @@ static int countNonZero_(const T* src, int len )
     return nz;
 }
 
+#if CV_SSE2
+
+static const uchar * initPopcountTable()
+{
+    static uchar tab[256];
+    static volatile bool initialized = false;
+    if( !initialized )
+    {
+        // we compute inverse popcount table,
+        // since we pass (img[x] == 0) mask as index in the table.
+        unsigned int j = 0u;
+#if CV_POPCNT
+        if (checkHardwareSupport(CV_CPU_POPCNT))
+            for( ; j < 256u; j++ )
+                tab[j] = (uchar)(8 - _mm_popcnt_u32(j));
+#else
+        for( ; j < 256u; j++ )
+        {
+            int val = 0;
+            for( int mask = 1; mask < 256; mask += mask )
+                val += (j & mask) == 0;
+            tab[j] = (uchar)val;
+        }
+#endif
+        initialized = true;
+    }
+
+    return tab;
+}
+
+#endif
+
 static int countNonZero8u( const uchar* src, int len )
 {
     int i=0, nz = 0;
@@ -403,21 +543,7 @@ static int countNonZero8u( const uchar* src, int len )
     if(USE_SSE2)//5x-6x
     {
         __m128i pattern = _mm_setzero_si128 ();
-        static uchar tab[256];
-        static volatile bool initialized = false;
-        if( !initialized )
-        {
-            // we compute inverse popcount table,
-            // since we pass (img[x] == 0) mask as index in the table.
-            for( int j = 0; j < 256; j++ )
-            {
-                int val = 0;
-                for( int mask = 1; mask < 256; mask += mask )
-                    val += (j & mask) == 0;
-                tab[j] = (uchar)val;
-            }
-            initialized = true;
-        }
+        static const uchar * tab = initPopcountTable();
 
         for (; i<=len-16; i+=16)
         {
@@ -467,7 +593,22 @@ static int countNonZero8u( const uchar* src, int len )
 static int countNonZero16u( const ushort* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_NEON
+#if CV_SSE2
+    if (USE_SSE2)
+    {
+        __m128i v_zero = _mm_setzero_si128 ();
+        static const uchar * tab = initPopcountTable();
+
+        for ( ; i <= len - 8; i += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
+            int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_cmpeq_epi16(v_src, v_zero), v_zero));
+            nz += tab[val];
+        }
+
+        src += i;
+    }
+#elif CV_NEON
     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
     uint32x4_t v_nz = vdupq_n_u32(0u);
     uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);
@@ -503,7 +644,27 @@ static int countNonZero16u( const ushort* src, int len )
 static int countNonZero32s( const int* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_NEON
+#if CV_SSE2
+    if (USE_SSE2)
+    {
+        __m128i v_zero = _mm_setzero_si128 ();
+        static const uchar * tab = initPopcountTable();
+
+        for ( ; i <= len - 8; i += 8)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i*)(src + i));
+            __m128i v_dst0 = _mm_cmpeq_epi32(v_src, v_zero);
+
+            v_src = _mm_loadu_si128((const __m128i*)(src + i + 4));
+            __m128i v_dst1 = _mm_cmpeq_epi32(v_src, v_zero);
+
+            int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero));
+            nz += tab[val];
+        }
+
+        src += i;
+    }
+#elif CV_NEON
     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
     uint32x4_t v_nz = vdupq_n_u32(0u);
     int32x4_t v_zero = vdupq_n_s32(0.0f);
@@ -541,7 +702,25 @@ static int countNonZero32s( const int* src, int len )
 static int countNonZero32f( const float* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_NEON
+#if CV_SSE2
+    if (USE_SSE2)
+    {
+        __m128i v_zero_i = _mm_setzero_si128();
+        __m128 v_zero_f = _mm_setzero_ps();
+        static const uchar * tab = initPopcountTable();
+
+        for ( ; i <= len - 8; i += 8)
+        {
+            __m128i v_dst0 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i), v_zero_f));
+            __m128i v_dst1 = _mm_castps_si128(_mm_cmpeq_ps(_mm_loadu_ps(src + i + 4), v_zero_f));
+
+            int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
+            nz += tab[val];
+        }
+
+        src += i;
+    }
+#elif CV_NEON
     int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
     uint32x4_t v_nz = vdupq_n_u32(0u);
     float32x4_t v_zero = vdupq_n_f32(0.0f);
@@ -577,7 +756,34 @@ static int countNonZero32f( const float* src, int len )
 }
 
 static int countNonZero64f( const double* src, int len )
-{ return countNonZero_(src, len); }
+{
+    int i = 0, nz = 0;
+#if CV_SSE2
+    if (USE_SSE2)
+    {
+        __m128i v_zero_i = _mm_setzero_si128();
+        __m128d v_zero_d = _mm_setzero_pd();
+        static const uchar * tab = initPopcountTable();
+
+        for ( ; i <= len - 8; i += 8)
+        {
+            __m128i v_dst0 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i), v_zero_d));
+            __m128i v_dst1 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 2), v_zero_d));
+            __m128i v_dst2 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 4), v_zero_d));
+            __m128i v_dst3 = _mm_castpd_si128(_mm_cmpeq_pd(_mm_loadu_pd(src + i + 6), v_zero_d));
+
+            v_dst0 = _mm_packs_epi32(v_dst0, v_dst1);
+            v_dst1 = _mm_packs_epi32(v_dst2, v_dst3);
+
+            int val = _mm_movemask_epi8(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_zero_i));
+            nz += tab[val];
+        }
+
+        src += i;
+    }
+#endif
+    return nz + countNonZero_(src, len - i);
+}
 
 typedef int (*CountNonZeroFunc)(const uchar*, int);
 
@@ -594,6 +800,137 @@ static CountNonZeroFunc getCountNonZeroTab(int depth)
     return countNonZeroTab[depth];
 }
 
+template <typename T, typename ST, typename SQT>
+struct SumSqr_SIMD
+{
+    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
+    {
+        return 0;
+    }
+};
+
+#if CV_SSE2
+
+template <>
+struct SumSqr_SIMD<uchar, int, int>
+{
+    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
+
+        for ( ; x <= len - 16; x += 16)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
+            __m128i v_half = _mm_unpacklo_epi8(v_src, v_zero);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+
+            v_half = _mm_unpackhi_epi8(v_src, v_zero);
+            v_mullo = _mm_mullo_epi16(v_half, v_half);
+            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_half, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_half, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        for ( ; x <= len - 8; x += 8)
+        {
+            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
+            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_src, v_zero));
+            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_src, v_zero));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        int CV_DECL_ALIGNED(16) ar[8];
+        _mm_store_si128((__m128i*)ar, v_sum);
+        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+            {
+                sum[j] += ar[j + i];
+                sqsum[j] += ar[4 + j + i];
+            }
+
+        return x / cn;
+    }
+};
+
+template <>
+struct SumSqr_SIMD<schar, int, int>
+{
+    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
+    {
+        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
+            return 0;
+
+        int x = 0;
+        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
+
+        for ( ; x <= len - 16; x += 16)
+        {
+            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
+            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_half, v_half);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+
+            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
+            v_mullo = _mm_mullo_epi16(v_half, v_half);
+            v_mulhi = _mm_mulhi_epi16(v_half, v_half);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        for ( ; x <= len - 8; x += 8)
+        {
+            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
+
+            __m128i v_mullo = _mm_mullo_epi16(v_src, v_src);
+            __m128i v_mulhi = _mm_mulhi_epi16(v_src, v_src);
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
+            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpacklo_epi16(v_mullo, v_mulhi));
+            v_sqsum = _mm_add_epi32(v_sqsum, _mm_unpackhi_epi16(v_mullo, v_mulhi));
+        }
+
+        int CV_DECL_ALIGNED(16) ar[8];
+        _mm_store_si128((__m128i*)ar, v_sum);
+        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);
+
+        for (int i = 0; i < 4; i += cn)
+            for (int j = 0; j < cn; ++j)
+            {
+                sum[j] += ar[j + i];
+                sqsum[j] += ar[4 + j + i];
+            }
+
+        return x / cn;
+    }
+};
+
+#endif
+
 template<typename T, typename ST, typename SQT>
 static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn )
 {
@@ -601,14 +938,15 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
 
     if( !mask )
     {
-        int i;
-        int k = cn % 4;
+        SumSqr_SIMD<T, ST, SQT> vop;
+        int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4;
+        src += i * cn;
 
         if( k == 1 )
         {
             ST s0 = sum[0];
             SQT sq0 = sqsum[0];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v = src[0];
                 s0 += v; sq0 += (SQT)v*v;
@@ -620,7 +958,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
         {
             ST s0 = sum[0], s1 = sum[1];
             SQT sq0 = sqsum[0], sq1 = sqsum[1];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0 = src[0], v1 = src[1];
                 s0 += v0; sq0 += (SQT)v0*v0;
@@ -633,7 +971,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
         {
             ST s0 = sum[0], s1 = sum[1], s2 = sum[2];
             SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0 = src[0], v1 = src[1], v2 = src[2];
                 s0 += v0; sq0 += (SQT)v0*v0;
@@ -649,7 +987,7 @@ static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int le
             src = src0 + k;
             ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3];
             SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3];
-            for( i = 0; i < len; i++, src += cn )
+            for( ; i < len; i++, src += cn )
             {
                 T v0, v1;
                 v0 = src[0], v1 = src[1];
@@ -924,7 +1262,6 @@ cv::Scalar cv::sum( InputArray _src )
         }
     }
 #endif
-
     SumFunc func = getSumFunc(depth);
 
     CV_Assert( cn <= 4 && func != 0 );
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index daf13a2dda..b6f3466f71 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -89,6 +90,22 @@
             pop ebx
         }
     }
+    static void __cpuidex(int* cpuid_data, int, int)
+    {
+        __asm
+        {
+            push edi
+            mov edi, cpuid_data
+            mov eax, 7
+            mov ecx, 0
+            cpuid
+            mov [edi], eax
+            mov [edi + 4], ebx
+            mov [edi + 8], ecx
+            mov [edi + 12], edx
+            pop edi
+        }
+    }
   #endif
 #endif
 
@@ -208,7 +225,7 @@ struct HWFeatures
     enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
 
     HWFeatures(void)
-     {
+    {
         memset( have, 0, sizeof(have) );
         x86_family = 0;
     }
@@ -252,10 +269,54 @@ struct HWFeatures
             f.have[CV_CPU_SSE2]   = (cpuid_data[3] & (1<<26)) != 0;
             f.have[CV_CPU_SSE3]   = (cpuid_data[2] & (1<<0)) != 0;
             f.have[CV_CPU_SSSE3]  = (cpuid_data[2] & (1<<9)) != 0;
+            f.have[CV_CPU_FMA3]  = (cpuid_data[2] & (1<<12)) != 0;
             f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
             f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
             f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
             f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+
+            // make the second call to the cpuid command in order to get
+            // information about extended features like AVX2
+        #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+            __cpuidex(cpuid_data, 7, 0);
+        #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+            #ifdef __x86_64__
+            asm __volatile__
+            (
+             "movl $7, %%eax\n\t"
+             "movl $0, %%ecx\n\t"
+             "cpuid\n\t"
+             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
+             :
+             : "cc"
+            );
+            #else
+            asm volatile
+            (
+             "pushl %%eax\n\t"
+             "pushl %%edx\n\t"
+             "movl $7,%%eax\n\t"
+             "movl $0,%%ecx\n\t"
+             "cpuid\n\t"
+             "popl %%edx\n\t"
+             "popl %%eax\n\t"
+             : "=b"(cpuid_data[1]), "=c"(cpuid_data[2])
+             :
+             : "cc"
+            );
+            #endif
+        #endif
+            f.have[CV_CPU_AVX2]   = (cpuid_data[1] & (1<<5)) != 0;
+
+            f.have[CV_CPU_AVX_512F]       = (cpuid_data[1] & (1<<16)) != 0;
+            f.have[CV_CPU_AVX_512DQ]      = (cpuid_data[1] & (1<<17)) != 0;
+            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
+            f.have[CV_CPU_AVX_512PF]      = (cpuid_data[1] & (1<<26)) != 0;
+            f.have[CV_CPU_AVX_512ER]      = (cpuid_data[1] & (1<<27)) != 0;
+            f.have[CV_CPU_AVX_512CD]      = (cpuid_data[1] & (1<<28)) != 0;
+            f.have[CV_CPU_AVX_512BW]      = (cpuid_data[1] & (1<<30)) != 0;
+            f.have[CV_CPU_AVX_512VL]      = (cpuid_data[1] & (1<<31)) != 0;
+            f.have[CV_CPU_AVX_512VBMI]    = (cpuid_data[2] &  (1<<1)) != 0;
         }
 
     #if defined ANDROID || defined __linux__
@@ -318,6 +379,7 @@ IPPInitializer ippInitializer;
 volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
 volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
 volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
+volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
 
 void setUseOptimized( bool flag )
 {
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index 443111f48d..ffc20777b5 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -10,8 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index d0d3847bed..0541819f89 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -1577,7 +1577,7 @@ PARAM_TEST_CASE(ConvertScaleAbs, MatDepth, Channels, bool)
 
         Size roiSize = randomSize(1, MAX_VALUE);
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, stype, 2, 11); // FIXIT: Test with minV, maxV
+        randomSubMat(src, src_roi, roiSize, srcBorder, stype, -11, 11); // FIXIT: Test with minV, maxV
 
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, 5, 16);
diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index d2e8b39aa3..23dc4576ba 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 /
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 1311d5abb9..233218b3e2 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 18a91d9544..06fc73153f 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index f0a8fd8584..b900cf1845 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -965,6 +966,11 @@ struct Gray2RGB5x5
         #if CV_NEON
         v_n7 = vdup_n_u8(~7);
         v_n3 = vdup_n_u8(~3);
+        #elif CV_SSE2
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        v_n7 = _mm_set1_epi16(~7);
+        v_n3 = _mm_set1_epi16(~3);
+        v_zero = _mm_setzero_si128();
         #endif
     }
 
@@ -982,6 +988,26 @@ struct Gray2RGB5x5
                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
                 vst1q_u16((ushort *)dst + i, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; i <= n - 16; i += 16 )
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
+
+                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
+                    __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
+                                    _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
+                                                 _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
+                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
+
+                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
+                    v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
+                            _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
+                                         _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
+                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
+                }
+            }
             #endif
             for ( ; i < n; i++ )
             {
@@ -998,6 +1024,26 @@ struct Gray2RGB5x5
                 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
                 vst1q_u16((ushort *)dst + i, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; i <= n - 16; i += 8 )
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
+
+                    __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
+                    __m128i v_dst = _mm_or_si128(v_src_p,
+                                    _mm_or_si128(_mm_slli_epi32(v_src_p, 5),
+                                                 _mm_slli_epi16(v_src_p, 10)));
+                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
+
+                    v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
+                    v_dst = _mm_or_si128(v_src_p,
+                            _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
+                                         _mm_slli_epi16(v_src_p, 10)));
+                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
+                }
+            }
             #endif
             for( ; i < n; i++ )
             {
@@ -1010,6 +1056,9 @@ struct Gray2RGB5x5
 
     #if CV_NEON
     uint8x8_t v_n7, v_n3;
+    #elif CV_SSE2
+    __m128i v_n7, v_n3, v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -1042,6 +1091,14 @@ struct RGB5x52Gray
         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
         v_f8 = vdupq_n_u16(0xf8);
         v_fc = vdupq_n_u16(0xfc);
+        #elif CV_SSE2
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        v_b2y = _mm_set1_epi16(B2Y);
+        v_g2y = _mm_set1_epi16(G2Y);
+        v_r2y = _mm_set1_epi16(R2Y);
+        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
+        v_f8 = _mm_set1_epi16(0xf8);
+        v_fc = _mm_set1_epi16(0xfc);
         #endif
     }
 
@@ -1067,6 +1124,42 @@ struct RGB5x52Gray
 
                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                __m128i v_zero = _mm_setzero_si128();
+
+                for ( ; i <= n - 8; i += 8)
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
+                    __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
+                            v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc),
+                            v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8);
+
+                    __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
+                    __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
+                    __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
+                    __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
+                    __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
+                    __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
+
+                    __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
+                                                   _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
+                    v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
+                                           _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
+
+                    __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
+                                                   _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
+                    v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
+                                           _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
+
+                    v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
+                    v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
+
+                    __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
+                    _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
+                }
+            }
             #endif
             for ( ; i < n; i++)
             {
@@ -1095,6 +1188,42 @@ struct RGB5x52Gray
 
                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                __m128i v_zero = _mm_setzero_si128();
+
+                for ( ; i <= n - 8; i += 8)
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
+                    __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
+                            v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8),
+                            v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8);
+
+                    __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
+                    __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
+                    __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
+                    __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
+                    __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
+                    __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
+
+                    __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
+                                                   _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
+                    v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
+                                           _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
+
+                    __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
+                                                   _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
+                    v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
+                                           _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
+
+                    v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
+                    v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
+
+                    __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
+                    _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
+                }
+            }
             #endif
             for ( ; i < n; i++)
             {
@@ -1111,6 +1240,11 @@ struct RGB5x52Gray
     uint16x4_t v_b2y, v_g2y, v_r2y;
     uint32x4_t v_delta;
     uint16x8_t v_f8, v_fc;
+    #elif CV_SSE2
+    bool haveSIMD;
+    __m128i v_b2y, v_g2y, v_r2y;
+    __m128i v_delta;
+    __m128i v_f8, v_fc;
     #endif
 };
 
@@ -1327,6 +1461,219 @@ struct RGB2Gray<float>
     float32x4_t v_cb, v_cg, v_cr;
 };
 
+#elif CV_SSE2
+
+#if CV_SSE4_1
+
+template <>
+struct RGB2Gray<ushort>
+{
+    typedef ushort channel_type;
+
+    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
+        srccn(_srccn)
+    {
+        static const int coeffs0[] = { R2Y, G2Y, B2Y };
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
+        if( blueIdx == 0 )
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_cb = _mm_set1_epi16((short)coeffs[0]);
+        v_cg = _mm_set1_epi16((short)coeffs[1]);
+        v_cr = _mm_set1_epi16((short)coeffs[2]);
+        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    // 16s x 8
+    void process(__m128i v_b, __m128i v_g, __m128i v_r,
+                 __m128i & v_gray) const
+    {
+        __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr);
+        __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg);
+        __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb);
+        __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr);
+        __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg);
+        __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb);
+
+        __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r),
+                                        _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
+        v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0);
+        v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift);
+
+        __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r),
+                                        _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
+        v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1);
+        v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift);
+
+        v_gray = _mm_packus_epi32(v_gray0, v_gray1);
+    }
+
+    void operator()(const ushort* src, ushort* dst, int n) const
+    {
+        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
+
+        if (scn == 3 && haveSIMD)
+        {
+            for ( ; i <= n - 16; i += 16, src += scn * 16)
+            {
+                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
+                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
+                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
+                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
+                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
+                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
+
+                _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128i v_gray0;
+                process(v_r0, v_g0, v_b0,
+                        v_gray0);
+
+                __m128i v_gray1;
+                process(v_r1, v_g1, v_b1,
+                        v_gray1);
+
+                _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
+                _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
+            }
+        }
+        else if (scn == 4 && haveSIMD)
+        {
+            for ( ; i <= n - 16; i += 16, src += scn * 16)
+            {
+                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
+                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
+                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
+                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
+                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
+                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
+                __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
+                __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
+
+                _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
+
+                __m128i v_gray0;
+                process(v_r0, v_g0, v_b0,
+                        v_gray0);
+
+                __m128i v_gray1;
+                process(v_r1, v_g1, v_b1,
+                        v_gray1);
+
+                _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
+                _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
+            }
+        }
+
+        for( ; i < n; i++, src += scn)
+            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
+    }
+
+    int srccn, coeffs[3];
+    __m128i v_cb, v_cg, v_cr;
+    __m128i v_delta;
+    bool haveSIMD;
+};
+
+#endif // CV_SSE4_1
+
+template <>
+struct RGB2Gray<float>
+{
+    typedef float channel_type;
+
+    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
+        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
+        if(blueIdx == 0)
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_cb = _mm_set1_ps(coeffs[0]);
+        v_cg = _mm_set1_ps(coeffs[1]);
+        v_cr = _mm_set1_ps(coeffs[2]);
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    void process(__m128 v_b, __m128 v_g, __m128 v_r,
+                 __m128 & v_gray) const
+    {
+        v_gray = _mm_mul_ps(v_r, v_cr);
+        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
+        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
+    }
+
+    void operator()(const float * src, float * dst, int n) const
+    {
+        int scn = srccn, i = 0;
+        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
+
+        if (scn == 3 && haveSIMD)
+        {
+            for ( ; i <= n - 8; i += 8, src += scn * 8)
+            {
+                __m128 v_r0 = _mm_loadu_ps(src);
+                __m128 v_r1 = _mm_loadu_ps(src + 4);
+                __m128 v_g0 = _mm_loadu_ps(src + 8);
+                __m128 v_g1 = _mm_loadu_ps(src + 12);
+                __m128 v_b0 = _mm_loadu_ps(src + 16);
+                __m128 v_b1 = _mm_loadu_ps(src + 20);
+
+                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128 v_gray0;
+                process(v_r0, v_g0, v_b0,
+                        v_gray0);
+
+                __m128 v_gray1;
+                process(v_r1, v_g1, v_b1,
+                        v_gray1);
+
+                _mm_storeu_ps(dst + i, v_gray0);
+                _mm_storeu_ps(dst + i + 4, v_gray1);
+            }
+        }
+        else if (scn == 4 && haveSIMD)
+        {
+            for ( ; i <= n - 8; i += 8, src += scn * 8)
+            {
+                __m128 v_r0 = _mm_loadu_ps(src);
+                __m128 v_r1 = _mm_loadu_ps(src + 4);
+                __m128 v_g0 = _mm_loadu_ps(src + 8);
+                __m128 v_g1 = _mm_loadu_ps(src + 12);
+                __m128 v_b0 = _mm_loadu_ps(src + 16);
+                __m128 v_b1 = _mm_loadu_ps(src + 20);
+                __m128 v_a0 = _mm_loadu_ps(src + 24);
+                __m128 v_a1 = _mm_loadu_ps(src + 28);
+
+                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
+
+                __m128 v_gray0;
+                process(v_r0, v_g0, v_b0,
+                        v_gray0);
+
+                __m128 v_gray1;
+                process(v_r1, v_g1, v_b1,
+                        v_gray1);
+
+                _mm_storeu_ps(dst + i, v_gray0);
+                _mm_storeu_ps(dst + i + 4, v_gray1);
+            }
+        }
+
+        for ( ; i < n; i++, src += scn)
+            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
+    }
+
+    int srccn;
+    float coeffs[3];
+    __m128 v_cb, v_cg, v_cr;
+    bool haveSIMD;
+};
+
 #else
 
 template<> struct RGB2Gray<ushort>
@@ -1449,6 +1796,103 @@ struct RGB2YCrCb_f<float>
     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
 };
 
+#elif CV_SSE2
+
+template <>
+struct RGB2YCrCb_f<float>
+{
+    typedef float channel_type;
+
+    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
+        srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if (blueIdx==0)
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = _mm_set1_ps(coeffs[0]);
+        v_c1 = _mm_set1_ps(coeffs[1]);
+        v_c2 = _mm_set1_ps(coeffs[2]);
+        v_c3 = _mm_set1_ps(coeffs[3]);
+        v_c4 = _mm_set1_ps(coeffs[4]);
+        v_delta = _mm_set1_ps(ColorChannel<float>::half());
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    void process(__m128 v_r, __m128 v_g, __m128 v_b,
+                 __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const
+    {
+        v_y = _mm_mul_ps(v_r, v_c0);
+        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1));
+        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2));
+
+        v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta);
+        v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta);
+    }
+
+    void operator()(const float * src, float * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        const float delta = ColorChannel<float>::half();
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        n *= 3;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 24; i += 24, src += 8 * scn)
+            {
+                __m128 v_r0 = _mm_loadu_ps(src);
+                __m128 v_r1 = _mm_loadu_ps(src + 4);
+                __m128 v_g0 = _mm_loadu_ps(src + 8);
+                __m128 v_g1 = _mm_loadu_ps(src + 12);
+                __m128 v_b0 = _mm_loadu_ps(src + 16);
+                __m128 v_b1 = _mm_loadu_ps(src + 20);
+
+                if (scn == 4)
+                {
+                    __m128 v_a0 = _mm_loadu_ps(src + 24);
+                    __m128 v_a1 = _mm_loadu_ps(src + 28);
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
+                                        v_b0, v_b1, v_a0, v_a1);
+                }
+                else
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128 v_y0, v_cr0, v_cb0;
+                process(v_r0, v_g0, v_b0,
+                        v_y0, v_cr0, v_cb0);
+
+                __m128 v_y1, v_cr1, v_cb1;
+                process(v_r1, v_g1, v_b1,
+                        v_y1, v_cr1, v_cb1);
+
+                _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+
+                _mm_storeu_ps(dst + i, v_y0);
+                _mm_storeu_ps(dst + i + 4, v_y1);
+                _mm_storeu_ps(dst + i + 8, v_cr0);
+                _mm_storeu_ps(dst + i + 12, v_cr1);
+                _mm_storeu_ps(dst + i + 16, v_cb0);
+                _mm_storeu_ps(dst + i + 20, v_cb1);
+            }
+        }
+
+        for ( ; i < n; i += 3, src += scn)
+        {
+            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
+            float Cr = (src[bidx^2] - Y)*C3 + delta;
+            float Cb = (src[bidx] - Y)*C4 + delta;
+            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
+        }
+    }
+    int srccn, blueIdx;
+    float coeffs[5];
+    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
+    bool haveSIMD;
+};
+
 #endif
 
 template<typename _Tp> struct RGB2YCrCb_i
@@ -1699,7 +2143,288 @@ struct RGB2YCrCb_i<ushort>
     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
 };
 
-#endif
+#elif CV_SSE4_1
+
+template <>
+struct RGB2YCrCb_i<uchar>
+{
+    typedef uchar channel_type;
+
+    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+        : srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if (blueIdx==0)
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = _mm_set1_epi32(coeffs[0]);
+        v_c1 = _mm_set1_epi32(coeffs[1]);
+        v_c2 = _mm_set1_epi32(coeffs[2]);
+        v_c3 = _mm_set1_epi32(coeffs[3]);
+        v_c4 = _mm_set1_epi32(coeffs[4]);
+        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
+        v_delta = _mm_set1_epi32(ColorChannel<uchar>::half()*(1 << yuv_shift));
+        v_delta = _mm_add_epi32(v_delta, v_delta2);
+        v_zero = _mm_setzero_si128();
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    // 16u x 8
+    void process(__m128i v_r, __m128i v_g, __m128i v_b,
+                 __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
+    {
+        __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
+        __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
+        __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
+
+        __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
+                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
+                                     _mm_mullo_epi32(v_b_p, v_c2)));
+        v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
+
+        __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
+        __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
+        v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
+        v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
+
+        v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
+        v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
+        v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
+
+        __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
+                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
+                                     _mm_mullo_epi32(v_b_p, v_c2)));
+        v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
+
+        __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
+        __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
+        v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
+        v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
+
+        v_y = _mm_packs_epi32(v_y0, v_y1);
+        v_cr = _mm_packs_epi32(v_cr0, v_cr1);
+        v_cb = _mm_packs_epi32(v_cb0, v_cb1);
+    }
+
+    void operator()(const uchar * src, uchar * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
+        n *= 3;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 96; i += 96, src += scn * 32)
+            {
+                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
+                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16));
+                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32));
+                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48));
+                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
+                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));
+
+                if (scn == 4)
+                {
+                    __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96));
+                    __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112));
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1,
+                                          v_b0, v_b1, v_a0, v_a1);
+                }
+                else
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
+                process(_mm_unpacklo_epi8(v_r0, v_zero),
+                        _mm_unpacklo_epi8(v_g0, v_zero),
+                        _mm_unpacklo_epi8(v_b0, v_zero),
+                        v_y0, v_cr0, v_cb0);
+
+                __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
+                process(_mm_unpackhi_epi8(v_r0, v_zero),
+                        _mm_unpackhi_epi8(v_g0, v_zero),
+                        _mm_unpackhi_epi8(v_b0, v_zero),
+                        v_y1, v_cr1, v_cb1);
+
+                __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1);
+                __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1);
+                __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1);
+
+                process(_mm_unpacklo_epi8(v_r1, v_zero),
+                        _mm_unpacklo_epi8(v_g1, v_zero),
+                        _mm_unpacklo_epi8(v_b1, v_zero),
+                        v_y0, v_cr0, v_cb0);
+
+                process(_mm_unpackhi_epi8(v_r1, v_zero),
+                        _mm_unpackhi_epi8(v_g1, v_zero),
+                        _mm_unpackhi_epi8(v_b1, v_zero),
+                        v_y1, v_cr1, v_cb1);
+
+                __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1);
+                __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
+                __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);
+
+                _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
+
+                _mm_storeu_si128((__m128i *)(dst + i), v_y_0);
+                _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
+                _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0);
+                _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1);
+                _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0);
+                _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1);
+            }
+        }
+
+        for ( ; i < n; i += 3, src += scn)
+        {
+            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+            dst[i] = saturate_cast<uchar>(Y);
+            dst[i+1] = saturate_cast<uchar>(Cr);
+            dst[i+2] = saturate_cast<uchar>(Cb);
+        }
+    }
+
+    int srccn, blueIdx, coeffs[5];
+    __m128i v_c0, v_c1, v_c2;
+    __m128i v_c3, v_c4, v_delta, v_delta2;
+    __m128i v_zero;
+    bool haveSIMD;
+};
+
+template <>
+struct RGB2YCrCb_i<ushort>
+{
+    typedef ushort channel_type;
+
+    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
+        : srccn(_srccn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
+        if (blueIdx==0)
+            std::swap(coeffs[0], coeffs[2]);
+
+        v_c0 = _mm_set1_epi32(coeffs[0]);
+        v_c1 = _mm_set1_epi32(coeffs[1]);
+        v_c2 = _mm_set1_epi32(coeffs[2]);
+        v_c3 = _mm_set1_epi32(coeffs[3]);
+        v_c4 = _mm_set1_epi32(coeffs[4]);
+        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
+        v_delta = _mm_set1_epi32(ColorChannel<ushort>::half()*(1 << yuv_shift));
+        v_delta = _mm_add_epi32(v_delta, v_delta2);
+        v_zero = _mm_setzero_si128();
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    // 16u x 8
+    void process(__m128i v_r, __m128i v_g, __m128i v_b,
+                 __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
+    {
+        __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
+        __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
+        __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
+
+        __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
+                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
+                                     _mm_mullo_epi32(v_b_p, v_c2)));
+        v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
+
+        __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
+        __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
+        v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
+        v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
+
+        v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
+        v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
+        v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
+
+        __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
+                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
+                                     _mm_mullo_epi32(v_b_p, v_c2)));
+        v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
+
+        __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
+        __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
+        v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
+        v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
+
+        v_y = _mm_packus_epi32(v_y0, v_y1);
+        v_cr = _mm_packus_epi32(v_cr0, v_cr1);
+        v_cb = _mm_packus_epi32(v_cb0, v_cb1);
+    }
+
+    void operator()(const ushort * src, ushort * dst, int n) const
+    {
+        int scn = srccn, bidx = blueIdx, i = 0;
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
+        int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
+        n *= 3;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 48; i += 48, src += scn * 16)
+            {
+                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
+                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
+                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
+                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
+                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
+                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
+
+                if (scn == 4)
+                {
+                    __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
+                    __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
+
+                    _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1,
+                                           v_b0, v_b1, v_a0, v_a1);
+                }
+                else
+                    _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
+                process(v_r0, v_g0, v_b0,
+                        v_y0, v_cr0, v_cb0);
+
+                __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
+                process(v_r1, v_g1, v_b1,
+                        v_y1, v_cr1, v_cb1);
+
+                _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+
+                _mm_storeu_si128((__m128i *)(dst + i), v_y0);
+                _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
+                _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0);
+                _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1);
+                _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0);
+                _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1);
+            }
+        }
+
+        for ( ; i < n; i += 3, src += scn)
+        {
+            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
+            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
+            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
+            dst[i] = saturate_cast<ushort>(Y);
+            dst[i+1] = saturate_cast<ushort>(Cr);
+            dst[i+2] = saturate_cast<ushort>(Cb);
+        }
+    }
+
+    int srccn, blueIdx, coeffs[5];
+    __m128i v_c0, v_c1, v_c2;
+    __m128i v_c3, v_c4, v_delta, v_delta2;
+    __m128i v_zero;
+    bool haveSIMD;
+};
+
+#endif // CV_SSE4_1
 
 template<typename _Tp> struct YCrCb2RGB_f
 {
@@ -1809,6 +2534,118 @@ struct YCrCb2RGB_f<float>
     float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
 };
 
+#elif CV_SSE2
+
+template <>
+struct YCrCb2RGB_f<float>
+{
+    typedef float channel_type;
+
+    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = _mm_set1_ps(coeffs[0]);
+        v_c1 = _mm_set1_ps(coeffs[1]);
+        v_c2 = _mm_set1_ps(coeffs[2]);
+        v_c3 = _mm_set1_ps(coeffs[3]);
+        v_delta = _mm_set1_ps(ColorChannel<float>::half());
+        v_alpha = _mm_set1_ps(ColorChannel<float>::max());
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    void process(__m128 v_y, __m128 v_cr, __m128 v_cb,
+                 __m128 & v_r, __m128 & v_g, __m128 & v_b) const
+    {
+        v_cb = _mm_sub_ps(v_cb, v_delta);
+        v_cr = _mm_sub_ps(v_cr, v_delta);
+
+        v_b = _mm_mul_ps(v_cb, v_c3);
+        v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1));
+        v_r = _mm_mul_ps(v_cr, v_c0);
+
+        v_b = _mm_add_ps(v_b, v_y);
+        v_g = _mm_add_ps(v_g, v_y);
+        v_r = _mm_add_ps(v_r, v_y);
+
+        if (blueIdx == 0)
+            std::swap(v_b, v_r);
+    }
+
+    void operator()(const float* src, float* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
+            {
+                __m128 v_y0 = _mm_loadu_ps(src + i);
+                __m128 v_y1 = _mm_loadu_ps(src + i + 4);
+                __m128 v_cr0 = _mm_loadu_ps(src + i + 8);
+                __m128 v_cr1 = _mm_loadu_ps(src + i + 12);
+                __m128 v_cb0 = _mm_loadu_ps(src + i + 16);
+                __m128 v_cb1 = _mm_loadu_ps(src + i + 20);
+
+                _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+
+                __m128 v_r0, v_g0, v_b0;
+                process(v_y0, v_cr0, v_cb0,
+                        v_r0, v_g0, v_b0);
+
+                __m128 v_r1, v_g1, v_b1;
+                process(v_y1, v_cr1, v_cb1,
+                        v_r1, v_g1, v_b1);
+
+                __m128 v_a0 = v_alpha, v_a1 = v_alpha;
+
+                if (dcn == 3)
+                    _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+                else
+                    _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1,
+                                      v_b0, v_b1, v_a0, v_a1);
+
+                _mm_storeu_ps(dst, v_r0);
+                _mm_storeu_ps(dst + 4, v_r1);
+                _mm_storeu_ps(dst + 8, v_g0);
+                _mm_storeu_ps(dst + 12, v_g1);
+                _mm_storeu_ps(dst + 16, v_b0);
+                _mm_storeu_ps(dst + 20, v_b1);
+
+                if (dcn == 4)
+                {
+                    _mm_storeu_ps(dst + 24, v_a0);
+                    _mm_storeu_ps(dst + 28, v_a1);
+                }
+            }
+        }
+
+        for ( ; i < n; i += 3, dst += dcn)
+        {
+            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
+
+            float b = Y + (Cb - delta)*C3;
+            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
+            float r = Y + (Cr - delta)*C0;
+
+            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    float coeffs[4];
+
+    __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
+    bool haveSIMD;
+};
+
 #endif
 
 template<typename _Tp> struct YCrCb2RGB_i
@@ -2096,7 +2933,185 @@ struct YCrCb2RGB_i<ushort>
     uint16x4_t v_alpha2;
 };
 
-#endif
+#elif CV_SSE2
+
+template <>
+struct YCrCb2RGB_i<uchar>
+{
+    typedef uchar channel_type;
+
+    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
+        : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        static const int coeffs0[] = {22987, -11698, -5636, 29049};
+        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
+
+        v_c0 = _mm_set1_epi16((short)coeffs[0]);
+        v_c1 = _mm_set1_epi16((short)coeffs[1]);
+        v_c2 = _mm_set1_epi16((short)coeffs[2]);
+        v_c3 = _mm_set1_epi16((short)coeffs[3]);
+        v_delta = _mm_set1_epi16(ColorChannel<uchar>::half());
+        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
+        v_zero = _mm_setzero_si128();
+
+        uchar alpha = ColorChannel<uchar>::max();
+        v_alpha = _mm_set1_epi8(*(char *)&alpha);
+
+        useSSE = coeffs[0] <= std::numeric_limits<short>::max();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    // 16s x 8
+    void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
+                 __m128i & v_r, __m128i & v_g, __m128i & v_b) const
+    {
+        v_cr = _mm_sub_epi16(v_cr, v_delta);
+        v_cb = _mm_sub_epi16(v_cb, v_delta);
+
+        __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero);
+
+        __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3);
+        __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2);
+        __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1);
+        __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0);
+
+        __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3);
+        __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2);
+        __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1);
+        __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0);
+
+        __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
+        __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2),
+                                                                  _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
+                                      yuv_shift);
+        __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
+
+        v_r0 = _mm_add_epi32(v_r0, v_y_p);
+        v_g0 = _mm_add_epi32(v_g0, v_y_p);
+        v_b0 = _mm_add_epi32(v_b0, v_y_p);
+
+        v_y_p = _mm_unpackhi_epi16(v_y, v_zero);
+
+        __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
+        __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2),
+                                                                  _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
+                                      yuv_shift);
+        __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
+
+        v_r1 = _mm_add_epi32(v_r1, v_y_p);
+        v_g1 = _mm_add_epi32(v_g1, v_y_p);
+        v_b1 = _mm_add_epi32(v_b1, v_y_p);
+
+        v_r = _mm_packs_epi32(v_r0, v_r1);
+        v_g = _mm_packs_epi32(v_g0, v_g1);
+        v_b = _mm_packs_epi32(v_b0, v_b1);
+    }
+
+    void operator()(const uchar* src, uchar* dst, int n) const
+    {
+        int dcn = dstcn, bidx = blueIdx, i = 0;
+        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
+        n *= 3;
+
+        if (haveSIMD && useSSE)
+        {
+            for ( ; i <= n - 96; i += 96, dst += dcn * 32)
+            {
+                __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i));
+                __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16));
+                __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32));
+                __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48));
+                __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
+                __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));
+
+                _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
+
+                __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
+                process(_mm_unpacklo_epi8(v_y0, v_zero),
+                        _mm_unpacklo_epi8(v_cr0, v_zero),
+                        _mm_unpacklo_epi8(v_cb0, v_zero),
+                        v_r_0, v_g_0, v_b_0);
+
+                __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero;
+                process(_mm_unpackhi_epi8(v_y0, v_zero),
+                        _mm_unpackhi_epi8(v_cr0, v_zero),
+                        _mm_unpackhi_epi8(v_cb0, v_zero),
+                        v_r_1, v_g_1, v_b_1);
+
+                __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1);
+                __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1);
+                __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1);
+
+                process(_mm_unpacklo_epi8(v_y1, v_zero),
+                        _mm_unpacklo_epi8(v_cr1, v_zero),
+                        _mm_unpacklo_epi8(v_cb1, v_zero),
+                        v_r_0, v_g_0, v_b_0);
+
+                process(_mm_unpackhi_epi8(v_y1, v_zero),
+                        _mm_unpackhi_epi8(v_cr1, v_zero),
+                        _mm_unpackhi_epi8(v_cb1, v_zero),
+                        v_r_1, v_g_1, v_b_1);
+
+                __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1);
+                __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1);
+                __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1);
+
+                if (bidx == 0)
+                {
+                    std::swap(v_r0, v_b0);
+                    std::swap(v_r1, v_b1);
+                }
+
+                __m128i v_a0 = v_alpha, v_a1 = v_alpha;
+
+                if (dcn == 3)
+                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+                else
+                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1,
+                                        v_b0, v_b1, v_a0, v_a1);
+
+                _mm_storeu_si128((__m128i *)(dst), v_r0);
+                _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
+                _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
+                _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
+                _mm_storeu_si128((__m128i *)(dst + 64), v_b0);
+                _mm_storeu_si128((__m128i *)(dst + 80), v_b1);
+
+                if (dcn == 4)
+                {
+                    _mm_storeu_si128((__m128i *)(dst + 96), v_a0);
+                    _mm_storeu_si128((__m128i *)(dst + 112), v_a1);
+                }
+            }
+        }
+
+        for ( ; i < n; i += 3, dst += dcn)
+        {
+            uchar Y = src[i];
+            uchar Cr = src[i+1];
+            uchar Cb = src[i+2];
+
+            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
+            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
+            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
+
+            dst[bidx] = saturate_cast<uchar>(b);
+            dst[1] = saturate_cast<uchar>(g);
+            dst[bidx^2] = saturate_cast<uchar>(r);
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    int coeffs[4];
+    bool useSSE, haveSIMD;
+
+    __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
+    __m128i v_delta, v_alpha, v_zero;
+};
+
+#endif // CV_SSE2
 
 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
 
@@ -2219,6 +3234,118 @@ struct RGB2XYZ_f<float>
     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
 };
 
+#elif CV_SSE2
+
+template <>
+struct RGB2XYZ_f<float>
+{
+    typedef float channel_type;
+
+    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
+    {
+        memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
+        if(blueIdx == 0)
+        {
+            std::swap(coeffs[0], coeffs[2]);
+            std::swap(coeffs[3], coeffs[5]);
+            std::swap(coeffs[6], coeffs[8]);
+        }
+
+        v_c0 = _mm_set1_ps(coeffs[0]);
+        v_c1 = _mm_set1_ps(coeffs[1]);
+        v_c2 = _mm_set1_ps(coeffs[2]);
+        v_c3 = _mm_set1_ps(coeffs[3]);
+        v_c4 = _mm_set1_ps(coeffs[4]);
+        v_c5 = _mm_set1_ps(coeffs[5]);
+        v_c6 = _mm_set1_ps(coeffs[6]);
+        v_c7 = _mm_set1_ps(coeffs[7]);
+        v_c8 = _mm_set1_ps(coeffs[8]);
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    void process(__m128 v_r, __m128 v_g, __m128 v_b,
+                 __m128 & v_x, __m128 & v_y, __m128 & v_z) const
+    {
+        v_x = _mm_mul_ps(v_r, v_c0);
+        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1));
+        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2));
+
+        v_y = _mm_mul_ps(v_r, v_c3);
+        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4));
+        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5));
+
+        v_z = _mm_mul_ps(v_r, v_c6);
+        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7));
+        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8));
+    }
+
+    void operator()(const float* src, float* dst, int n) const
+    {
+        int scn = srccn, i = 0;
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+        n *= 3;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 24; i += 24, src += 8 * scn)
+            {
+                __m128 v_r0 = _mm_loadu_ps(src);
+                __m128 v_r1 = _mm_loadu_ps(src + 4);
+                __m128 v_g0 = _mm_loadu_ps(src + 8);
+                __m128 v_g1 = _mm_loadu_ps(src + 12);
+                __m128 v_b0 = _mm_loadu_ps(src + 16);
+                __m128 v_b1 = _mm_loadu_ps(src + 20);
+
+                if (scn == 4)
+                {
+                    __m128 v_a0 = _mm_loadu_ps(src + 24);
+                    __m128 v_a1 = _mm_loadu_ps(src + 28);
+
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
+                                        v_b0, v_b1, v_a0, v_a1);
+                }
+                else
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                __m128 v_x0, v_y0, v_z0;
+                process(v_r0, v_g0, v_b0,
+                        v_x0, v_y0, v_z0);
+
+                __m128 v_x1, v_y1, v_z1;
+                process(v_r1, v_g1, v_b1,
+                        v_x1, v_y1, v_z1);
+
+                _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
+
+                _mm_storeu_ps(dst + i, v_x0);
+                _mm_storeu_ps(dst + i + 4, v_x1);
+                _mm_storeu_ps(dst + i + 8, v_y0);
+                _mm_storeu_ps(dst + i + 12, v_y1);
+                _mm_storeu_ps(dst + i + 16, v_z0);
+                _mm_storeu_ps(dst + i + 20, v_z1);
+            }
+        }
+
+        for ( ; i < n; i += 3, src += scn)
+        {
+            float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
+            float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
+            float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
+            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
+        }
+    }
+
+    int srccn;
+    float coeffs[9];
+    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+    bool haveSIMD;
+};
+
+
 #endif
 
 template<typename _Tp> struct RGB2XYZ_i
@@ -2249,6 +3376,7 @@ template<typename _Tp> struct RGB2XYZ_i
             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
         n *= 3;
+
         for(int i = 0; i < n; i += 3, src += scn)
         {
             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
@@ -2542,6 +3670,130 @@ template<typename _Tp> struct XYZ2RGB_f
     float coeffs[9];
 };
 
+#if CV_SSE2
+
+template <>
+struct XYZ2RGB_f<float>
+{
+    typedef float channel_type;
+
+    XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
+    : dstcn(_dstcn), blueIdx(_blueIdx)
+    {
+        memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
+        if(blueIdx == 0)
+        {
+            std::swap(coeffs[0], coeffs[6]);
+            std::swap(coeffs[1], coeffs[7]);
+            std::swap(coeffs[2], coeffs[8]);
+        }
+
+        v_c0 = _mm_set1_ps(coeffs[0]);
+        v_c1 = _mm_set1_ps(coeffs[1]);
+        v_c2 = _mm_set1_ps(coeffs[2]);
+        v_c3 = _mm_set1_ps(coeffs[3]);
+        v_c4 = _mm_set1_ps(coeffs[4]);
+        v_c5 = _mm_set1_ps(coeffs[5]);
+        v_c6 = _mm_set1_ps(coeffs[6]);
+        v_c7 = _mm_set1_ps(coeffs[7]);
+        v_c8 = _mm_set1_ps(coeffs[8]);
+
+        v_alpha = _mm_set1_ps(ColorChannel<float>::max());
+
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    void process(__m128 v_x, __m128 v_y, __m128 v_z,
+                 __m128 & v_r, __m128 & v_g, __m128 & v_b) const
+    {
+        v_b = _mm_mul_ps(v_x, v_c0);
+        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1));
+        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2));
+
+        v_g = _mm_mul_ps(v_x, v_c3);
+        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4));
+        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5));
+
+        v_r = _mm_mul_ps(v_x, v_c6);
+        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7));
+        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8));
+    }
+
+    void operator()(const float* src, float* dst, int n) const
+    {
+        int dcn = dstcn;
+        float alpha = ColorChannel<float>::max();
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+        n *= 3;
+        int i = 0;
+
+        if (haveSIMD)
+        {
+            for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
+            {
+                __m128 v_x0 = _mm_loadu_ps(src + i);
+                __m128 v_x1 = _mm_loadu_ps(src + i + 4);
+                __m128 v_y0 = _mm_loadu_ps(src + i + 8);
+                __m128 v_y1 = _mm_loadu_ps(src + i + 12);
+                __m128 v_z0 = _mm_loadu_ps(src + i + 16);
+                __m128 v_z1 = _mm_loadu_ps(src + i + 20);
+
+                _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
+
+                __m128 v_r0, v_g0, v_b0;
+                process(v_x0, v_y0, v_z0,
+                        v_r0, v_g0, v_b0);
+
+                __m128 v_r1, v_g1, v_b1;
+                process(v_x1, v_y1, v_z1,
+                        v_r1, v_g1, v_b1);
+
+                __m128 v_a0 = v_alpha, v_a1 = v_alpha;
+
+                if (dcn == 4)
+                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1,
+                                      v_r0, v_r1, v_a0, v_a1);
+                else
+                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
+
+                _mm_storeu_ps(dst, v_b0);
+                _mm_storeu_ps(dst + 4, v_b1);
+                _mm_storeu_ps(dst + 8, v_g0);
+                _mm_storeu_ps(dst + 12, v_g1);
+                _mm_storeu_ps(dst + 16, v_r0);
+                _mm_storeu_ps(dst + 20, v_r1);
+
+                if (dcn == 4)
+                {
+                    _mm_storeu_ps(dst + 24, v_a0);
+                    _mm_storeu_ps(dst + 28, v_a1);
+                }
+            }
+
+        }
+
+        for( ; i < n; i += 3, dst += dcn)
+        {
+            float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2;
+            float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5;
+            float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8;
+            dst[0] = B; dst[1] = G; dst[2] = R;
+            if( dcn == 4 )
+                dst[3] = alpha;
+        }
+    }
+    int dstcn, blueIdx;
+    float coeffs[9];
+
+    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
+    __m128 v_alpha;
+    bool haveSIMD;
+};
+
+#endif // CV_SSE2
+
 
 template<typename _Tp> struct XYZ2RGB_i
 {
@@ -3056,14 +4308,49 @@ struct HSV2RGB_b
         v_scale_inv = vdupq_n_f32(1.f/255.f);
         v_scale = vdupq_n_f32(255.f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+        #elif CV_SSE2
+        v_scale_inv = _mm_set1_ps(1.f/255.f);
+        v_scale = _mm_set1_ps(255.0f);
+        v_zero = _mm_setzero_si128();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    // 16s x 8
+    void process(__m128i v_r, __m128i v_g, __m128i v_b,
+                 float * buf) const
+    {
+        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
+        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
+        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
+
+        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
+        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
+        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
+
+        v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
+        v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
+
+        v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
+        v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
+
+        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+        _mm_store_ps(buf, v_r0);
+        _mm_store_ps(buf + 4, v_r1);
+        _mm_store_ps(buf + 8, v_g0);
+        _mm_store_ps(buf + 12, v_g1);
+        _mm_store_ps(buf + 16, v_b0);
+        _mm_store_ps(buf + 20, v_b1);
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
         {
@@ -3089,6 +4376,41 @@ struct HSV2RGB_b
                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
+                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
+                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
+                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
+                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
+                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
+
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                    process(_mm_unpacklo_epi8(v_r0, v_zero),
+                            _mm_unpacklo_epi8(v_g0, v_zero),
+                            _mm_unpacklo_epi8(v_b0, v_zero),
+                            buf + j);
+
+                    process(_mm_unpackhi_epi8(v_r0, v_zero),
+                            _mm_unpackhi_epi8(v_g0, v_zero),
+                            _mm_unpackhi_epi8(v_b0, v_zero),
+                            buf + j + 24);
+
+                    process(_mm_unpacklo_epi8(v_r1, v_zero),
+                            _mm_unpacklo_epi8(v_g1, v_zero),
+                            _mm_unpacklo_epi8(v_b1, v_zero),
+                            buf + j + 48);
+
+                    process(_mm_unpackhi_epi8(v_r1, v_zero),
+                            _mm_unpackhi_epi8(v_g1, v_zero),
+                            _mm_unpackhi_epi8(v_b1, v_zero),
+                            buf + j + 72);
+                }
+            }
             #endif
 
             for( ; j < dn*3; j += 3 )
@@ -3129,6 +4451,28 @@ struct HSV2RGB_b
                     vst3_u8(dst, v_dst);
                 }
             }
+            #elif CV_SSE2
+            if (dcn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
+                {
+                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
+                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
+                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
+                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
+
+                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                                     _mm_cvtps_epi32(v_src1));
+                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
+                                                     _mm_cvtps_epi32(v_src3));
+
+                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    dst -= jr, j -= jr;
+            }
             #endif
 
             for( ; j < dn*3; j += 3, dst += dcn )
@@ -3147,6 +4491,10 @@ struct HSV2RGB_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale_inv, v_scale;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -3218,13 +4566,42 @@ struct RGB2HLS_b
         v_scale_inv = vdupq_n_f32(1.f/255.f);
         v_scale = vdupq_n_f32(255.f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+        #elif CV_SSE2
+        v_scale_inv = _mm_set1_ps(1.f/255.f);
+        v_scale = _mm_set1_ps(255.f);
+        v_zero = _mm_setzero_si128();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    void process(const float * buf,
+                 __m128i & v_h, __m128i & v_l, __m128i & v_s) const
+    {
+        __m128 v_h0f = _mm_load_ps(buf);
+        __m128 v_h1f = _mm_load_ps(buf + 4);
+        __m128 v_l0f = _mm_load_ps(buf + 8);
+        __m128 v_l1f = _mm_load_ps(buf + 12);
+        __m128 v_s0f = _mm_load_ps(buf + 16);
+        __m128 v_s1f = _mm_load_ps(buf + 20);
+
+        _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f);
+
+        v_l0f = _mm_mul_ps(v_l0f, v_scale);
+        v_l1f = _mm_mul_ps(v_l1f, v_scale);
+        v_s0f = _mm_mul_ps(v_s0f, v_scale);
+        v_s1f = _mm_mul_ps(v_s1f, v_scale);
+
+        v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f));
+        v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
+        v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f));
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, scn = srccn;
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
         {
@@ -3262,6 +4639,26 @@ struct RGB2HLS_b
                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (scn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
+
+                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
+                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
+                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
+
+                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
+                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
+                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    src -= jr, j -= jr;
+            }
             #endif
             for( ; j < dn*3; j += 3, src += scn )
             {
@@ -3286,6 +4683,43 @@ struct RGB2HLS_b
                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
                 vst3_u8(dst + j, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_h_0, v_l_0, v_s_0;
+                    process(buf + j,
+                            v_h_0, v_l_0, v_s_0);
+
+                    __m128i v_h_1, v_l_1, v_s_1;
+                    process(buf + j + 24,
+                            v_h_1, v_l_1, v_s_1);
+
+                    __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1);
+                    __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
+                    __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1);
+
+                    process(buf + j + 48,
+                            v_h_0, v_l_0, v_s_0);
+
+                    process(buf + j + 72,
+                            v_h_1, v_l_1, v_s_1);
+
+                    __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1);
+                    __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
+                    __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);
+
+                    _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
+
+                    _mm_storeu_si128((__m128i *)(dst + j), v_h0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
+                    _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1);
+                    _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1);
+                }
+            }
             #endif
             for( ; j < dn*3; j += 3 )
             {
@@ -3301,6 +4735,10 @@ struct RGB2HLS_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale, v_scale_inv;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -3380,14 +4818,49 @@ struct HLS2RGB_b
         v_scale_inv = vdupq_n_f32(1.f/255.f);
         v_scale = vdupq_n_f32(255.f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+        #elif CV_SSE2
+        v_scale_inv = _mm_set1_ps(1.f/255.f);
+        v_scale = _mm_set1_ps(255.f);
+        v_zero = _mm_setzero_si128();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    // 16s x 8
+    void process(__m128i v_r, __m128i v_g, __m128i v_b,
+                 float * buf) const
+    {
+        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
+        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
+        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
+
+        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
+        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
+        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
+
+        v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
+        v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
+
+        v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
+        v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
+
+        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+        _mm_store_ps(buf, v_r0);
+        _mm_store_ps(buf + 4, v_r1);
+        _mm_store_ps(buf + 8, v_g0);
+        _mm_store_ps(buf + 12, v_g1);
+        _mm_store_ps(buf + 16, v_b0);
+        _mm_store_ps(buf + 20, v_b1);
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
         {
@@ -3413,6 +4886,41 @@ struct HLS2RGB_b
                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
+                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
+                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
+                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
+                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
+                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
+
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                    process(_mm_unpacklo_epi8(v_r0, v_zero),
+                            _mm_unpacklo_epi8(v_g0, v_zero),
+                            _mm_unpacklo_epi8(v_b0, v_zero),
+                            buf + j);
+
+                    process(_mm_unpackhi_epi8(v_r0, v_zero),
+                            _mm_unpackhi_epi8(v_g0, v_zero),
+                            _mm_unpackhi_epi8(v_b0, v_zero),
+                            buf + j + 24);
+
+                    process(_mm_unpacklo_epi8(v_r1, v_zero),
+                            _mm_unpacklo_epi8(v_g1, v_zero),
+                            _mm_unpacklo_epi8(v_b1, v_zero),
+                            buf + j + 48);
+
+                    process(_mm_unpackhi_epi8(v_r1, v_zero),
+                            _mm_unpackhi_epi8(v_g1, v_zero),
+                            _mm_unpackhi_epi8(v_b1, v_zero),
+                            buf + j + 72);
+                }
+            }
             #endif
             for( ; j < dn*3; j += 3 )
             {
@@ -3452,7 +4960,30 @@ struct HLS2RGB_b
                     vst3_u8(dst, v_dst);
                 }
             }
+            #elif CV_SSE2
+            if (dcn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
+                {
+                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
+                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
+                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
+                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
+
+                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                                     _mm_cvtps_epi32(v_src1));
+                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
+                                                     _mm_cvtps_epi32(v_src3));
+
+                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    dst -= jr, j -= jr;
+            }
             #endif
+
             for( ; j < dn*3; j += 3, dst += dcn )
             {
                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
@@ -3469,6 +5000,10 @@ struct HLS2RGB_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale, v_scale_inv;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -3784,14 +5319,52 @@ struct Lab2RGB_b
         v_scale = vdupq_n_f32(255.f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
         v_128 = vdupq_n_f32(128.0f);
+        #elif CV_SSE2
+        v_scale_inv = _mm_set1_ps(100.f/255.f);
+        v_scale = _mm_set1_ps(255.f);
+        v_128 = _mm_set1_ps(128.0f);
+        v_zero = _mm_setzero_si128();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    // 16s x 8
+    void process(__m128i v_r, __m128i v_g, __m128i v_b,
+                 float * buf) const
+    {
+        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
+        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
+        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
+
+        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
+        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
+        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
+
+        v_r0 = _mm_mul_ps(v_r0, v_scale_inv);
+        v_r1 = _mm_mul_ps(v_r1, v_scale_inv);
+
+        v_g0 = _mm_sub_ps(v_g0, v_128);
+        v_g1 = _mm_sub_ps(v_g1, v_128);
+        v_b0 = _mm_sub_ps(v_b0, v_128);
+        v_b1 = _mm_sub_ps(v_b1, v_128);
+
+        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+        _mm_store_ps(buf, v_r0);
+        _mm_store_ps(buf + 4, v_r1);
+        _mm_store_ps(buf + 8, v_g0);
+        _mm_store_ps(buf + 12, v_g1);
+        _mm_store_ps(buf + 16, v_b0);
+        _mm_store_ps(buf + 20, v_b1);
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
         {
@@ -3817,6 +5390,41 @@ struct Lab2RGB_b
                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
+                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
+                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
+                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
+                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
+                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
+
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                    process(_mm_unpacklo_epi8(v_r0, v_zero),
+                            _mm_unpacklo_epi8(v_g0, v_zero),
+                            _mm_unpacklo_epi8(v_b0, v_zero),
+                            buf + j);
+
+                    process(_mm_unpackhi_epi8(v_r0, v_zero),
+                            _mm_unpackhi_epi8(v_g0, v_zero),
+                            _mm_unpackhi_epi8(v_b0, v_zero),
+                            buf + j + 24);
+
+                    process(_mm_unpacklo_epi8(v_r1, v_zero),
+                            _mm_unpacklo_epi8(v_g1, v_zero),
+                            _mm_unpacklo_epi8(v_b1, v_zero),
+                            buf + j + 48);
+
+                    process(_mm_unpackhi_epi8(v_r1, v_zero),
+                            _mm_unpackhi_epi8(v_g1, v_zero),
+                            _mm_unpackhi_epi8(v_b1, v_zero),
+                            buf + j + 72);
+                }
+            }
             #endif
 
             for( ; j < dn*3; j += 3 )
@@ -3857,6 +5465,28 @@ struct Lab2RGB_b
                     vst3_u8(dst, v_dst);
                 }
             }
+            #elif CV_SSE2
+            if (dcn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
+                {
+                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
+                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
+                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
+                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
+
+                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                                     _mm_cvtps_epi32(v_src1));
+                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
+                                                     _mm_cvtps_epi32(v_src3));
+
+                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    dst -= jr, j -= jr;
+            }
             #endif
 
             for( ; j < dn*3; j += 3, dst += dcn )
@@ -3876,6 +5506,10 @@ struct Lab2RGB_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv, v_128;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale, v_scale_inv, v_128;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -4050,13 +5684,48 @@ struct RGB2Luv_b
         v_coeff3 = vdupq_n_f32(0.9732824427480916f);
         v_coeff4 = vdupq_n_f32(136.259541984732824f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+        #elif CV_SSE2
+        v_zero = _mm_setzero_si128();
+        v_scale_inv = _mm_set1_ps(1.f/255.f);
+        v_scale = _mm_set1_ps(2.55f);
+        v_coeff1 = _mm_set1_ps(0.72033898305084743f);
+        v_coeff2 = _mm_set1_ps(96.525423728813564f);
+        v_coeff3 = _mm_set1_ps(0.9732824427480916f);
+        v_coeff4 = _mm_set1_ps(136.259541984732824f);
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    void process(const float * buf,
+                 __m128i & v_l, __m128i & v_u, __m128i & v_v) const
+    {
+        __m128 v_l0f = _mm_load_ps(buf);
+        __m128 v_l1f = _mm_load_ps(buf + 4);
+        __m128 v_u0f = _mm_load_ps(buf + 8);
+        __m128 v_u1f = _mm_load_ps(buf + 12);
+        __m128 v_v0f = _mm_load_ps(buf + 16);
+        __m128 v_v1f = _mm_load_ps(buf + 20);
+
+        _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f);
+
+        v_l0f = _mm_mul_ps(v_l0f, v_scale);
+        v_l1f = _mm_mul_ps(v_l1f, v_scale);
+        v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2);
+        v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2);
+        v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4);
+        v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4);
+
+        v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
+        v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
+        v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f));
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, scn = srccn;
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
         {
@@ -4094,6 +5763,26 @@ struct RGB2Luv_b
                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (scn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
+
+                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
+                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
+                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
+
+                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
+                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
+                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    src -= jr, j -= jr;
+            }
             #endif
             for( ; j < dn*3; j += 3, src += scn )
             {
@@ -4119,6 +5808,43 @@ struct RGB2Luv_b
 
                 vst3_u8(dst + j, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_l_0, v_u_0, v_v_0;
+                    process(buf + j,
+                            v_l_0, v_u_0, v_v_0);
+
+                    __m128i v_l_1, v_u_1, v_v_1;
+                    process(buf + j + 24,
+                            v_l_1, v_u_1, v_v_1);
+
+                    __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
+                    __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1);
+                    __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1);
+
+                    process(buf + j + 48,
+                            v_l_0, v_u_0, v_v_0);
+
+                    process(buf + j + 72,
+                            v_l_1, v_u_1, v_v_1);
+
+                    __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
+                    __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
+                    __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);
+
+                    _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
+
+                    _mm_storeu_si128((__m128i *)(dst + j), v_l0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
+                    _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1);
+                    _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0);
+                    _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1);
+                }
+            }
             #endif
 
             for( ; j < dn*3; j += 3 )
@@ -4136,6 +5862,10 @@ struct RGB2Luv_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
@@ -4156,14 +5886,55 @@ struct Luv2RGB_b
         v_140 = vdupq_n_f32(140.f);
         v_scale = vdupq_n_f32(255.f);
         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
+        #elif CV_SSE2
+        v_scale_inv = _mm_set1_ps(100.f/255.f);
+        v_coeff1 = _mm_set1_ps(1.388235294117647f);
+        v_coeff2 = _mm_set1_ps(1.027450980392157f);
+        v_134 = _mm_set1_ps(134.f);
+        v_140 = _mm_set1_ps(140.f);
+        v_scale = _mm_set1_ps(255.f);
+        v_zero = _mm_setzero_si128();
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
         #endif
     }
 
+    #if CV_SSE2
+    // 16s x 8
+    void process(__m128i v_l, __m128i v_u, __m128i v_v,
+                 float * buf) const
+    {
+        __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
+        __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero));
+        __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero));
+
+        __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero));
+        __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
+        __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));
+
+        v_l0 = _mm_mul_ps(v_l0, v_scale_inv);
+        v_l1 = _mm_mul_ps(v_l1, v_scale_inv);
+
+        v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134);
+        v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134);
+        v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
+        v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);
+
+        _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
+
+        _mm_store_ps(buf, v_l0);
+        _mm_store_ps(buf + 4, v_l1);
+        _mm_store_ps(buf + 8, v_u0);
+        _mm_store_ps(buf + 12, v_u1);
+        _mm_store_ps(buf + 16, v_v0);
+        _mm_store_ps(buf + 20, v_v1);
+    }
+    #endif
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         int i, j, dcn = dstcn;
         uchar alpha = ColorChannel<uchar>::max();
-        float buf[3*BLOCK_SIZE];
+        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
 
         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
         {
@@ -4189,6 +5960,41 @@ struct Luv2RGB_b
                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
                 vst3q_f32(buf + j + 12, v_dst);
             }
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                for ( ; j <= (dn - 32) * 3; j += 96)
+                {
+                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
+                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
+                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
+                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
+                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
+                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
+
+                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                    process(_mm_unpacklo_epi8(v_r0, v_zero),
+                            _mm_unpacklo_epi8(v_g0, v_zero),
+                            _mm_unpacklo_epi8(v_b0, v_zero),
+                            buf + j);
+
+                    process(_mm_unpackhi_epi8(v_r0, v_zero),
+                            _mm_unpackhi_epi8(v_g0, v_zero),
+                            _mm_unpackhi_epi8(v_b0, v_zero),
+                            buf + j + 24);
+
+                    process(_mm_unpacklo_epi8(v_r1, v_zero),
+                            _mm_unpacklo_epi8(v_g1, v_zero),
+                            _mm_unpacklo_epi8(v_b1, v_zero),
+                            buf + j + 48);
+
+                    process(_mm_unpackhi_epi8(v_r1, v_zero),
+                            _mm_unpackhi_epi8(v_g1, v_zero),
+                            _mm_unpackhi_epi8(v_b1, v_zero),
+                            buf + j + 72);
+                }
+            }
             #endif
             for( ; j < dn*3; j += 3 )
             {
@@ -4228,6 +6034,28 @@ struct Luv2RGB_b
                     vst3_u8(dst, v_dst);
                 }
             }
+            #elif CV_SSE2
+            if (dcn == 3 && haveSIMD)
+            {
+                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
+                {
+                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
+                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
+                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
+                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
+
+                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
+                                                     _mm_cvtps_epi32(v_src1));
+                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
+                                                     _mm_cvtps_epi32(v_src3));
+
+                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
+                }
+
+                int jr = j % 3;
+                if (jr)
+                    dst -= jr, j -= jr;
+            }
             #endif
 
             for( ; j < dn*3; j += 3, dst += dcn )
@@ -4247,6 +6075,10 @@ struct Luv2RGB_b
     #if CV_NEON
     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
     uint8x8_t v_alpha;
+    #elif CV_SSE2
+    __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
+    __m128i v_zero;
+    bool haveSIMD;
     #endif
 };
 
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 85f2063b28..358cd5802b 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -270,6 +271,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
 #ifdef HAVE_TEGRA_OPTIMIZATION
     if (tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType))
         return;
+#elif CV_SSE2
+    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
 #endif
 
     int depth = src.depth();
@@ -318,6 +321,33 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
 
             vst3q_f32(cov_data + j * 3, v_dst);
         }
+        #elif CV_SSE2
+        if (haveSSE2)
+        {
+            for( ; j <= size.width - 8; j += 8 )
+            {
+                __m128 v_dx_0 = _mm_loadu_ps(dxdata + j);
+                __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4);
+                __m128 v_dy_0 = _mm_loadu_ps(dydata + j);
+                __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4);
+
+                __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0);
+                __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0);
+                __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0);
+                __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1);
+                __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1);
+                __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1);
+
+                _mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1);
+
+                _mm_storeu_ps(cov_data + j * 3, v_dx2_0);
+                _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1);
+                _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0);
+                _mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1);
+                _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0);
+                _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1);
+            }
+        }
         #endif
 
         for( ; j < size.width; j++ )
diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp
index 0b7afb8ea6..cec450dc71 100644
--- a/modules/imgproc/src/demosaicing.cpp
+++ b/modules/imgproc/src/demosaicing.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 9acdc11415..ec8de4d815 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -2284,15 +2284,20 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
 
     CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() );
 
+#if CV_SSE2
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         const float* h1 = it.planes[0].ptr<float>();
         const float* h2 = it.planes[1].ptr<float>();
         len = it.planes[0].rows*it.planes[0].cols*H1.channels();
+        j = 0;
 
         if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT))
         {
-            for( j = 0; j < len; j++ )
+            for( ; j < len; j++ )
             {
                 double a = h1[j] - h2[j];
                 double b = (method == CV_COMP_CHISQR) ? h1[j] : h1[j] + h2[j];
@@ -2302,7 +2307,51 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_CORREL )
         {
-            for( j = 0; j < len; j++ )
+            #if CV_SSE2
+            if (haveSIMD)
+            {
+                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1;
+                __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1;
+
+                for ( ; j <= len - 4; j += 4)
+                {
+                    __m128 v_a = _mm_loadu_ps(h1 + j);
+                    __m128 v_b = _mm_loadu_ps(h2 + j);
+
+                    // 0-1
+                    __m128d v_ad = _mm_cvtps_pd(v_a);
+                    __m128d v_bd = _mm_cvtps_pd(v_b);
+                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
+                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
+                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
+                    v_s1 = _mm_add_pd(v_s1, v_ad);
+                    v_s2 = _mm_add_pd(v_s2, v_bd);
+
+                    // 2-3
+                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
+                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
+                    v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd));
+                    v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad));
+                    v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd));
+                    v_s1 = _mm_add_pd(v_s1, v_ad);
+                    v_s2 = _mm_add_pd(v_s2, v_bd);
+                }
+
+                double CV_DECL_ALIGNED(16) ar[10];
+                _mm_store_pd(ar, v_s12);
+                _mm_store_pd(ar + 2, v_s11);
+                _mm_store_pd(ar + 4, v_s22);
+                _mm_store_pd(ar + 6, v_s1);
+                _mm_store_pd(ar + 8, v_s2);
+
+                s12 += ar[0] + ar[1];
+                s11 += ar[2] + ar[3];
+                s22 += ar[4] + ar[5];
+                s1 += ar[6] + ar[7];
+                s2 += ar[8] + ar[9];
+            }
+            #endif
+            for( ; j < len; j++ )
             {
                 double a = h1[j];
                 double b = h2[j];
@@ -2316,7 +2365,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_INTERSECT )
         {
-            j = 0;
             #if CV_NEON
             float32x4_t v_result = vdupq_n_f32(0.0f);
             for( ; j <= len - 4; j += 4 )
@@ -2324,13 +2372,61 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
             float CV_DECL_ALIGNED(16) ar[4];
             vst1q_f32(ar, v_result);
             result += ar[0] + ar[1] + ar[2] + ar[3];
+            #elif CV_SSE2
+            if (haveSIMD)
+            {
+                __m128d v_result = _mm_setzero_pd();
+                for ( ; j <= len - 4; j += 4)
+                {
+                    __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j),
+                                              _mm_loadu_ps(h2 + j));
+                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
+                    v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
+                    v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src));
+                }
+
+                double CV_DECL_ALIGNED(16) ar[2];
+                _mm_store_pd(ar, v_result);
+                result += ar[0] + ar[1];
+            }
             #endif
             for( ; j < len; j++ )
                 result += std::min(h1[j], h2[j]);
         }
         else if( method == CV_COMP_BHATTACHARYYA )
         {
-            for( j = 0; j < len; j++ )
+            #if CV_SSE2
+            if (haveSIMD)
+            {
+                __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1;
+                for ( ; j <= len - 4; j += 4)
+                {
+                    __m128 v_a = _mm_loadu_ps(h1 + j);
+                    __m128 v_b = _mm_loadu_ps(h2 + j);
+
+                    __m128d v_ad = _mm_cvtps_pd(v_a);
+                    __m128d v_bd = _mm_cvtps_pd(v_b);
+                    v_s1 = _mm_add_pd(v_s1, v_ad);
+                    v_s2 = _mm_add_pd(v_s2, v_bd);
+                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
+
+                    v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8)));
+                    v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8)));
+                    v_s1 = _mm_add_pd(v_s1, v_ad);
+                    v_s2 = _mm_add_pd(v_s2, v_bd);
+                    v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd)));
+                }
+
+                double CV_DECL_ALIGNED(16) ar[6];
+                _mm_store_pd(ar, v_s1);
+                _mm_store_pd(ar + 2, v_s2);
+                _mm_store_pd(ar + 4, v_result);
+                s1 += ar[0] + ar[1];
+                s2 += ar[2] + ar[3];
+                result += ar[4] + ar[5];
+            }
+            #endif
+            for( ; j < len; j++ )
             {
                 double a = h1[j];
                 double b = h2[j];
@@ -2341,7 +2437,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method )
         }
         else if( method == CV_COMP_KL_DIV )
         {
-            for( j = 0; j < len; j++ )
+            for( ; j < len; j++ )
             {
                 double p = h1[j];
                 double q = h2[j];
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c4bb3baa9f..fe126fbbd1 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -1962,9 +1963,9 @@ private:
 struct ResizeAreaFastVec_SIMD_32f
 {
     ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
-        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+        cn(_cn), step(_step)
     {
-        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
     }
 
     int operator() (const float * S, float * D, int w) const
@@ -2004,7 +2005,6 @@ struct ResizeAreaFastVec_SIMD_32f
     }
 
 private:
-    int scale_x, scale_y;
     int cn;
     bool fast_mode;
     int step;
@@ -2199,8 +2199,146 @@ private:
     bool use_simd;
 };
 
-typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
-typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;
+class ResizeAreaFastVec_SIMD_16s
+{
+public:
+    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
+        cn(_cn), step(_step)
+    {
+        use_simd = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator() (const short* S, short* D, int w) const
+    {
+        if (!use_simd)
+            return 0;
+
+        int dx = 0;
+        const short* S0 = (const short*)S;
+        const short* S1 = (const short*)((const uchar*)(S) + step);
+        __m128i masklow = _mm_set1_epi32(0x0000ffff);
+        __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2);
+
+        if (cn == 1)
+        {
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
+                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
+                __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
+                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_srai_epi32(s0, 2);
+                s0 = _mm_packs_epi32(s0, zero);
+
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+        else if (cn == 3)
+            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
+                __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
+                __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
+                __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
+
+                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
+                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        else
+        {
+            CV_Assert(cn == 4);
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
+                __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
+                __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
+                __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int cn;
+    int step;
+    bool use_simd;
+};
+
+struct ResizeAreaFastVec_SIMD_32f
+{
+    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
+        cn(_cn), step(_step)
+    {
+        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
+        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator() (const float * S, float * D, int w) const
+    {
+        if (!fast_mode)
+            return 0;
+
+        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
+        int dx = 0;
+
+        __m128 v_025 = _mm_set1_ps(0.25f);
+
+        if (cn == 1)
+        {
+            const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
+                       v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
+
+                __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
+                                           _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
+                __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
+                                           _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
+
+                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
+            }
+        }
+        else if (cn == 4)
+        {
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
+                __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
+
+                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int cn;
+    bool fast_mode;
+    int step;
+};
 
 #else
 
@@ -4678,6 +4816,13 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
         size.height = 1;
     }
 
+#if CV_SSE2
+    bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+#if CV_SSE4_1
+    bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+#endif
+
     const float scale = 1.f/INTER_TAB_SIZE;
     int x, y;
     for( y = 0; y < size.height; y++ )
@@ -4708,6 +4853,29 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
 
                     vst2q_s16(dst1 + (x << 1), v_dst);
                 }
+                #elif CV_SSE4_1
+                if (useSSE4_1)
+                {
+                    for( ; x <= size.width - 16; x += 16 )
+                    {
+                        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
+                                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
+                        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
+                                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
+
+                        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
+                                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
+                        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
+                                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
+
+                        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
+
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
+                    }
+                }
                 #endif
                 for( ; x < size.width; x++ )
                 {
@@ -4742,6 +4910,52 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                               vandq_s32(v_ix1, v_mask)));
                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
                 }
+                #elif CV_SSE4_1
+                if (useSSE4_1)
+                {
+                    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
+                    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
+
+                    for( ; x <= size.width - 16; x += 16 )
+                    {
+                        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
+                        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
+                        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
+                        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
+
+                        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
+                                                          _mm_srai_epi32(v_ix1, INTER_BITS));
+                        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
+                                                          _mm_srai_epi32(v_iy1, INTER_BITS));
+                        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
+                                                        _mm_and_si128(v_ix0, v_its1));
+                        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
+                                                        _mm_and_si128(v_ix1, v_its1));
+                        _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
+
+                        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
+                        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
+                        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
+                        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
+
+                        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
+                                                          _mm_srai_epi32(v_ix1, INTER_BITS));
+                        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
+                                                          _mm_srai_epi32(v_iy1, INTER_BITS));
+                        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
+                                                _mm_and_si128(v_ix0, v_its1));
+                        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
+                                                _mm_and_si128(v_ix1, v_its1));
+                        _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
+
+                        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
+
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
+                    }
+                }
                 #endif
                 for( ; x < size.width; x++ )
                 {
@@ -4761,6 +4975,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 for( ; x <= (size.width << 1) - 8; x += 8 )
                     vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
                                                      vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
+                #elif CV_SSE2
+                for( ; x <= (size.width << 1) - 8; x += 8 )
+                {
+                    _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
+                                                                            _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
+                }
                 #endif
                 for( ; x < size.width; x++ )
                 {
@@ -4796,6 +5016,30 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                               vandq_s32(v_ix1, v_mask)));
                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
                 }
+                #elif CV_SSE4_1
+                if (useSSE4_1)
+                {
+                    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
+                    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
+                    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
+
+                    for( ; x <= size.width - 4; x += 4 )
+                    {
+                        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
+                        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
+
+                        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
+                                                         _mm_srai_epi32(v_src1, INTER_BITS));
+                        _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
+
+                        // x0 y0 x1 y1 . . .
+                        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
+                                                 _mm_and_si128(v_src1, v_its1));
+                        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
+                                                      _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
+                        _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
+                    }
+                }
                 #endif
                 for( ; x < size.width; x++ )
                 {
@@ -4841,6 +5085,44 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                 vst1q_f32(dst1f + x + 4, v_dst1);
                 vst1q_f32(dst2f + x + 4, v_dst2);
             }
+            #elif CV_SSE2
+            __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
+            __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
+            __m128 v_scale = _mm_set1_ps(scale);
+
+            for( ; x <= size.width - 16; x += 16)
+            {
+                __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
+                __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
+                __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
+                __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
+
+                _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
+
+                __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
+                __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
+                _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
+                                                    _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
+                _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
+                                                    _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
+                v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
+                _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
+                                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
+                _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
+                                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
+
+                v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
+                v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
+                _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
+                                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
+                _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
+                                                        _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
+                v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
+                _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
+                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
+                _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
+                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
+            }
             #endif
             for( ; x < size.width; x++ )
             {
@@ -4882,6 +5164,27 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
             }
+            #elif CV_SSE2
+            if (useSSE2)
+            {
+                __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
+                __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
+                __m128 v_scale = _mm_set1_ps(scale);
+
+                for ( ; x <= size.width - 8; x += 8)
+                {
+                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
+                    __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
+                    __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask);
+                    __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS);
+
+                    __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale);
+                    _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add));
+
+                    v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale);
+                    _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add));
+                }
+            }
             #endif
             for( ; x < size.width; x++ )
             {
@@ -4919,7 +5222,10 @@ public:
         const int AB_SCALE = 1 << AB_BITS;
         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
     #if CV_SSE2
-        bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+    #endif
+    #if CV_SSE4_1
+        bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
     #endif
 
         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
@@ -4957,6 +5263,31 @@ public:
 
                             vst2q_s16(xy + (x1 << 1), v_dst);
                         }
+                        #elif CV_SSE4_1
+                        if (useSSE4_1)
+                        {
+                            __m128i v_X0 = _mm_set1_epi32(X0);
+                            __m128i v_Y0 = _mm_set1_epi32(Y0);
+                            for ( ; x1 <= bw - 16; x1 += 16)
+                            {
+                                __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
+                                                               _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
+                                __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
+                                                               _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
+
+                                __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
+                                                               _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
+                                __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
+                                                               _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
+
+                                _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
+
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
+                            }
+                        }
                         #endif
                         for( ; x1 < bw; x1++ )
                         {
@@ -4971,7 +5302,7 @@ public:
                         short* alpha = A + y1*bw;
                         x1 = 0;
                     #if CV_SSE2
-                        if( useSIMD )
+                        if( useSSE2 )
                         {
                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
@@ -5364,6 +5695,20 @@ public:
         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
 
+        #if CV_SSE4_1
+        bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+        __m128d v_M0 = _mm_set1_pd(M[0]);
+        __m128d v_M3 = _mm_set1_pd(M[3]);
+        __m128d v_M6 = _mm_set1_pd(M[6]);
+        __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
+        __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
+        __m128d v_2 = _mm_set1_pd(2),
+                v_zero = _mm_setzero_pd(),
+                v_1 = _mm_set1_pd(1),
+                v_its = _mm_set1_pd(INTER_TAB_SIZE);
+        __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
+        #endif
+
         for( y = range.start; y < range.end; y += bh0 )
         {
             for( x = 0; x < width; x += bw0 )
@@ -5382,7 +5727,120 @@ public:
                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
 
                     if( interpolation == INTER_NEAREST )
-                        for( x1 = 0; x1 < bw; x1++ )
+                    {
+                        x1 = 0;
+
+                        #if CV_SSE4_1
+                        if (haveSSE4_1)
+                        {
+                            __m128d v_X0d = _mm_set1_pd(X0);
+                            __m128d v_Y0d = _mm_set1_pd(Y0);
+                            __m128d v_W0 = _mm_set1_pd(W0);
+                            __m128d v_x1 = _mm_set_pd(1, 0);
+
+                            for( ; x1 <= bw - 16; x1 += 16 )
+                            {
+                                // 0-3
+                                __m128i v_X0, v_Y0;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 4-8
+                                __m128i v_X1, v_Y1;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 8-11
+                                __m128i v_X2, v_Y2;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 12-15
+                                __m128i v_X3, v_Y3;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // convert to 16s
+                                v_X0 = _mm_packs_epi32(v_X0, v_X1);
+                                v_X1 = _mm_packs_epi32(v_X2, v_X3);
+                                v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
+                                v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
+
+                                _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
+
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
+                            }
+                        }
+                        #endif
+
+                        for( ; x1 < bw; x1++ )
                         {
                             double W = W0 + M[6]*x1;
                             W = W ? 1./W : 0;
@@ -5394,10 +5852,136 @@ public:
                             xy[x1*2] = saturate_cast<short>(X);
                             xy[x1*2+1] = saturate_cast<short>(Y);
                         }
+                    }
                     else
                     {
                         short* alpha = A + y1*bw;
-                        for( x1 = 0; x1 < bw; x1++ )
+                        x1 = 0;
+
+                        #if CV_SSE4_1
+                        if (haveSSE4_1)
+                        {
+                            __m128d v_X0d = _mm_set1_pd(X0);
+                            __m128d v_Y0d = _mm_set1_pd(Y0);
+                            __m128d v_W0 = _mm_set1_pd(W0);
+                            __m128d v_x1 = _mm_set_pd(1, 0);
+
+                            for( ; x1 <= bw - 16; x1 += 16 )
+                            {
+                                // 0-3
+                                __m128i v_X0, v_Y0;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 4-8
+                                __m128i v_X1, v_Y1;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 8-11
+                                __m128i v_X2, v_Y2;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // 12-15
+                                __m128i v_X3, v_Y3;
+                                {
+                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
+                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
+                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
+                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
+                                    v_x1 = _mm_add_pd(v_x1, v_2);
+
+                                    v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
+                                    v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
+                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
+                                }
+
+                                // store alpha
+                                __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
+                                                                 _mm_and_si128(v_X0, v_itsi1));
+                                __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
+                                                                 _mm_and_si128(v_X1, v_itsi1));
+                                _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
+
+                                v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
+                                                         _mm_and_si128(v_X2, v_itsi1));
+                                v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
+                                                         _mm_and_si128(v_X3, v_itsi1));
+                                _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
+
+                                // convert to 16s
+                                v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
+                                v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
+                                v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
+                                v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
+
+                                _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
+
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
+                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
+                            }
+                        }
+                        #endif
+
+                        for( ; x1 < bw; x1++ )
                         {
                             double W = W0 + M[6]*x1;
                             W = W ? INTER_TAB_SIZE/W : 0;
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index e510530afd..4271b942ae 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -183,13 +184,336 @@ struct PyrDownVec_32f
     }
 };
 
-typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
-typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
+#if CV_SSE4_1
+
+struct PyrDownVec_32s16u
+{
+    PyrDownVec_32s16u()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
+    }
+
+    int operator()(int** src, ushort* dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+        __m128i v_delta = _mm_set1_epi32(128);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
+            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
+            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
+                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
+            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
+                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
+            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
+                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
+
+            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
+            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
+
+            v_r10 = _mm_slli_epi32(v_r10, 2);
+            __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
+
+            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
+            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
+            v_r11 = _mm_slli_epi32(v_r11, 2);
+            __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+#else
+
+typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
+
+#endif // CV_SSE4_1
+
+struct PyrDownVec_32s16s
+{
+    PyrDownVec_32s16s()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator()(int** src, short* dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
+        __m128i v_delta = _mm_set1_epi32(128);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
+            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
+            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
+                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
+            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
+                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
+            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
+                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
+
+            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
+            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
+
+            v_r10 = _mm_slli_epi32(v_r10, 2);
+            __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
+
+            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
+            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
+            v_r11 = _mm_slli_epi32(v_r11, 2);
+            __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
+        }
+
+        return x;
+    }
+
+    bool haveSSE;
+};
+
+struct PyrUpVec_32s8u
+{
+    int operator()(int** src, uchar** dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!checkHardwareSupport(CV_CPU_SSE2))
+            return x;
+
+        uchar *dst0 = dst[0], *dst1 = dst[1];
+        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
+        __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
+
+        for( ; x <= width - 16; x += 16 )
+        {
+            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
+            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
+            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
+
+            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
+            __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
+            __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
+
+            v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
+                                   _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
+            v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
+                                   _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
+            v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
+                                   _mm_loadu_si128((__m128i const *)(row2 + x + 12)));
+
+            v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
+            __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
+            __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
+
+            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
+                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
+            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
+                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
+        }
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
+            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
+            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
+                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
+
+            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
+            __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
+            __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
+
+            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
+            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
+        }
+
+        return x;
+    }
+};
+
+struct PyrUpVec_32s16s
+{
+    int operator()(int** src, short** dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!checkHardwareSupport(CV_CPU_SSE2))
+            return x;
+
+        short *dst0 = dst[0], *dst1 = dst[1];
+        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
+        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
+            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
+            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
+            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
+            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
+            v_2r1 = _mm_slli_epi32(v_r1, 1);
+            v_4r1 = _mm_slli_epi32(v_r1, 2);
+            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            _mm_storeu_si128((__m128i *)(dst0 + x),
+                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
+                                _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
+            _mm_storeu_si128((__m128i *)(dst1 + x),
+                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
+                                _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
+        }
+
+        for( ; x <= width - 4; x += 4 )
+        {
+            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
+            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
+
+            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            _mm_storel_epi64((__m128i *)(dst0 + x),
+                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
+            _mm_storel_epi64((__m128i *)(dst1 + x),
+                _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
+        }
+
+        return x;
+    }
+};
+
+#if CV_SSE4_1
+
+struct PyrUpVec_32s16u
+{
+    int operator()(int** src, ushort** dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!checkHardwareSupport(CV_CPU_SSE4_1))
+            return x;
+
+        ushort *dst0 = dst[0], *dst1 = dst[1];
+        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
+        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
+            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
+            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
+            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
+            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
+            v_2r1 = _mm_slli_epi32(v_r1, 1);
+            v_4r1 = _mm_slli_epi32(v_r1, 2);
+            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            _mm_storeu_si128((__m128i *)(dst0 + x),
+                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
+                                 _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
+            _mm_storeu_si128((__m128i *)(dst1 + x),
+                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
+                                 _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
+        }
+
+        for( ; x <= width - 4; x += 4 )
+        {
+            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
+                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
+                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
+            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
+
+            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
+            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
+
+            _mm_storel_epi64((__m128i *)(dst0 + x),
+                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
+            _mm_storel_epi64((__m128i *)(dst1 + x),
+                _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
+        }
+
+        return x;
+    }
+};
+
+#else
 
-typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
-typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
 typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
-typedef PyrUpNoVec<float, float> PyrUpVec_32f;
+
+#endif // CV_SSE4_1
+
+struct PyrUpVec_32f
+{
+    int operator()(float** src, float** dst, int, int width) const
+    {
+        int x = 0;
+
+        if (!checkHardwareSupport(CV_CPU_SSE2))
+            return x;
+
+        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
+        float *dst0 = dst[0], *dst1 = dst[1];
+        __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
+               v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            __m128 v_r0 = _mm_loadu_ps(row0 + x);
+            __m128 v_r1 = _mm_loadu_ps(row1 + x);
+            __m128 v_r2 = _mm_loadu_ps(row2 + x);
+
+            _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
+            _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
+
+            v_r0 = _mm_loadu_ps(row0 + x + 4);
+            v_r1 = _mm_loadu_ps(row1 + x + 4);
+            v_r2 = _mm_loadu_ps(row2 + x + 4);
+
+            _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
+            _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
+        }
+
+        return x;
+    }
+};
 
 #elif CV_NEON
 
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 2a69003641..7d8b263bda 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -713,6 +714,156 @@ struct ColumnSum<int, ushort> :
     std::vector<int> sum;
 };
 
+template<>
+struct ColumnSum<int, int> :
+        public BaseColumnFilter
+{
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
+    {
+        ksize = _ksize;
+        anchor = _anchor;
+        scale = _scale;
+        sumCount = 0;
+    }
+
+    virtual void reset() { sumCount = 0; }
+
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int i;
+        int* SUM;
+        bool haveScale = scale != 1;
+        double _scale = scale;
+
+        #if CV_SSE2
+            bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+        #endif
+
+        if( width != (int)sum.size() )
+        {
+            sum.resize(width);
+            sumCount = 0;
+        }
+        SUM = &sum[0];
+        if( sumCount == 0 )
+        {
+            memset((void*)SUM, 0, width*sizeof(int));
+            for( ; sumCount < ksize - 1; sumCount++, src++ )
+            {
+                const int* Sp = (const int*)src[0];
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
+                    }
+                }
+                #elif CV_NEON
+                for( ; i <= width - 4; i+=4 )
+                    vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
+                #endif
+                for( ; i < width; i++ )
+                    SUM[i] += Sp[i];
+            }
+        }
+        else
+        {
+            CV_Assert( sumCount == ksize-1 );
+            src += ksize-1;
+        }
+
+        for( ; count--; src++ )
+        {
+            const int* Sp = (const int*)src[0];
+            const int* Sm = (const int*)src[1-ksize];
+            int* D = (int*)dst;
+            if( haveScale )
+            {
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128 scale4 = _mm_set1_ps((float)_scale);
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sm   = _mm_loadu_si128((const __m128i*)(Sm+i));
+
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+
+                        _mm_storeu_si128((__m128i*)(D+i), _s0T);
+                        _mm_storeu_si128((__m128i*)(SUM+i),_mm_sub_epi32(_s0,_sm));
+                    }
+                }
+                #elif CV_NEON
+                float32x4_t v_scale = vdupq_n_f32((float)_scale);
+                for( ; i <= width-4; i+=4 )
+                {
+                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+
+                    int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+                    vst1q_s32(D + i, v_s0d);
+
+                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+                }
+                #endif
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<int>(s0*_scale);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            else
+            {
+                i = 0;
+                #if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i <= width-4; i+=4 )
+                    {
+                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        _mm_storeu_si128((__m128i*)(D+i), _s0);
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                    }
+                }
+                #elif CV_NEON
+                for( ; i <= width-4; i+=4 )
+                {
+                    int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+
+                    vst1q_s32(D + i, v_s0);
+                    vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+                }
+                #endif
+
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = s0;
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            dst += dststep;
+        }
+    }
+
+    double scale;
+    int sumCount;
+    std::vector<int> sum;
+};
+
+
 template<>
 struct ColumnSum<int, float> :
         public BaseColumnFilter
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index cdef88f6c1..16c7c7ef26 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
index 34505c4ca4..176c9907f3 100644
--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
 TEST(Resize, Area_half)
 {
     const int size = 1000;
-    int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 };
+    int types[] = { CV_8UC1, CV_8UC4,
+                    CV_16UC1, CV_16UC4,
+                    CV_16SC1, CV_16SC3, CV_16SC4,
+                    CV_32FC1, CV_32FC4 };
 
     cv::RNG rng(17);
 
diff --git a/modules/photo/test/test_cloning.cpp b/modules/photo/test/test_cloning.cpp
index 56d166205c..1f86612a4a 100644
--- a/modules/photo/test/test_cloning.cpp
+++ b/modules/photo/test/test_cloning.cpp
@@ -64,6 +64,7 @@ TEST(Photo_SeamlessClone_normal, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -79,8 +80,8 @@ TEST(Photo_SeamlessClone_normal, regression)
     p.y = destination.size().height/2;
     seamlessClone(source, destination, mask, p, result, 1);
 
-
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
 
     SAVE(result);
 
@@ -94,6 +95,7 @@ TEST(Photo_SeamlessClone_mixed, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -111,7 +113,9 @@ TEST(Photo_SeamlessClone_mixed, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -123,6 +127,7 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "destination1.png";
     string original_path3 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat destination = imread(original_path2, IMREAD_COLOR);
@@ -140,7 +145,9 @@ TEST(Photo_SeamlessClone_featureExchange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -151,6 +158,7 @@ TEST(Photo_SeamlessClone_colorChange, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/color_change/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -163,7 +171,9 @@ TEST(Photo_SeamlessClone_colorChange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -174,6 +184,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Illumination_Change/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -186,7 +197,7 @@ TEST(Photo_SeamlessClone_illuminationChange, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
@@ -197,6 +208,7 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
     string folder = string(cvtest::TS::ptr()->get_data_path()) + "cloning/Texture_Flattening/";
     string original_path1 = folder + "source1.png";
     string original_path2 = folder + "mask.png";
+    string reference_path = folder + "reference.png";
 
     Mat source = imread(original_path1, IMREAD_COLOR);
     Mat mask = imread(original_path2, IMREAD_COLOR);
@@ -209,7 +221,9 @@ TEST(Photo_SeamlessClone_textureFlattening, regression)
 
     SAVE(result);
 
-    Mat reference = imread(folder + "reference.png");
+    Mat reference = imread(reference_path);
+    ASSERT_FALSE(reference.empty()) << "Could not load reference image " << reference_path;
+
     double error = cvtest::norm(reference, result, NORM_L1);
     EXPECT_LE(error, numerical_precision);
 
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 03877c0910..b6a832b6bb 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2998,6 +2998,12 @@ void printVersionInfo(bool useStdOut)
 
     std::string cpu_features;
 
+#if CV_POPCNT
+    if (checkHardwareSupport(CV_CPU_POPCNT)) cpu_features += " popcnt";
+#endif
+#if CV_MMX
+    if (checkHardwareSupport(CV_CPU_MMX)) cpu_features += " mmx";
+#endif
 #if CV_SSE
     if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
 #endif
@@ -3019,6 +3025,39 @@ void printVersionInfo(bool useStdOut)
 #if CV_AVX
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
+#if CV_AVX2
+    if (checkHardwareSupport(CV_CPU_AVX2)) cpu_features += " avx2";
+#endif
+#if CV_FMA3
+    if (checkHardwareSupport(CV_CPU_FMA3)) cpu_features += " fma3";
+#endif
+#if CV_AVX_512F
+    if (checkHardwareSupport(CV_CPU_AVX_512F) cpu_features += " avx-512f";
+#endif
+#if CV_AVX_512BW
+    if (checkHardwareSupport(CV_CPU_AVX_512BW) cpu_features += " avx-512bw";
+#endif
+#if CV_AVX_512CD
+    if (checkHardwareSupport(CV_CPU_AVX_512CD) cpu_features += " avx-512cd";
+#endif
+#if CV_AVX_512DQ
+    if (checkHardwareSupport(CV_CPU_AVX_512DQ) cpu_features += " avx-512dq";
+#endif
+#if CV_AVX_512ER
+    if (checkHardwareSupport(CV_CPU_AVX_512ER) cpu_features += " avx-512er";
+#endif
+#if CV_AVX_512IFMA512
+    if (checkHardwareSupport(CV_CPU_AVX_512IFMA512) cpu_features += " avx-512ifma512";
+#endif
+#if CV_AVX_512PF
+    if (checkHardwareSupport(CV_CPU_AVX_512PF) cpu_features += " avx-512pf";
+#endif
+#if CV_AVX_512VBMI
+    if (checkHardwareSupport(CV_CPU_AVX_512VBMI) cpu_features += " avx-512vbmi";
+#endif
+#if CV_AVX_512VL
+    if (checkHardwareSupport(CV_CPU_AVX_512VL) cpu_features += " avx-512vl";
+#endif
 #if CV_NEON
     if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
 #endif