From 2057f2c4521a62cf214196cf79550edb823dfe81 Mon Sep 17 00:00:00 2001
From: Vladimir Dudnik <no@email>
Date: Mon, 18 Apr 2011 21:24:57 +0000
Subject: [PATCH] fixed build issues related to changes in IPP calls.

---
 modules/core/src/arithm.cpp  | 552 +++++++++++++++++++++--------------
 modules/core/src/matmul.cpp  | 137 ++++-----
 modules/core/src/precomp.hpp |  14 +-
 modules/core/src/system.cpp  |   3 +-
 4 files changed, 404 insertions(+), 302 deletions(-)
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index d0fc0f44a4..dc9ad1c640 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -56,7 +56,7 @@ struct IPPArithmInitializer
 {
     IPPArithmInitializer(void)
     {
-        IppStatus status = ippStaticInit();
+        ippStaticInit();
     }
 };
 
@@ -64,19 +64,19 @@ IPPArithmInitializer ippArithmInitializer;
 #endif
 
 struct NOP {};
-    
+
 template<typename T, class Op, class Op8>
 void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
 {
     Op8 op8;
     Op op;
-    
+
     for( ; sz.height--; src1 += step1/sizeof(src1[0]),
                         src2 += step2/sizeof(src2[0]),
                         dst += step/sizeof(dst[0]) )
     {
         int x = 0;
-        
+
     #if CV_SSE2
         if( USE_SSE2 )
         {
@@ -97,7 +97,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
             }
         }
     #endif
-        
+
         for( ; x <= sz.width - 4; x += 4 )
         {
             T v0 = op(src1[x], src2[x]);
@@ -107,7 +107,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-        
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
@@ -119,13 +119,13 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
 {
     Op16 op16;
     Op op;
-    
+
     for( ; sz.height--; src1 += step1/sizeof(src1[0]),
         src2 += step2/sizeof(src2[0]),
         dst += step/sizeof(dst[0]) )
     {
         int x = 0;
-        
+
     #if CV_SSE2
         if( USE_SSE2 )
         {
@@ -147,7 +147,7 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
         }
         else
     #endif
-        
+
         for( ; x <= sz.width - 4; x += 4 )
         {
             T v0 = op(src1[x], src2[x]);
@@ -157,26 +157,26 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-        
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
 }
 
-    
+
 template<class Op, class Op32>
 void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
                int* dst, size_t step, Size sz)
 {
     Op32 op32;
     Op op;
-    
+
     for( ; sz.height--; src1 += step1/sizeof(src1[0]),
         src2 += step2/sizeof(src2[0]),
         dst += step/sizeof(dst[0]) )
     {
         int x = 0;
-        
+
 #if CV_SSE2
         if( USE_SSE2 )
         {
@@ -202,7 +202,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
                 }
         }
 #endif
-        
+
         for( ; x <= sz.width - 4; x += 4 )
         {
             int v0 = op(src1[x], src2[x]);
@@ -212,26 +212,26 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-        
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
 }
 
-    
+
 template<class Op, class Op32>
 void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
                float* dst, size_t step, Size sz)
 {
     Op32 op32;
     Op op;
-    
+
     for( ; sz.height--; src1 += step1/sizeof(src1[0]),
         src2 += step2/sizeof(src2[0]),
         dst += step/sizeof(dst[0]) )
     {
         int x = 0;
-        
+
     #if CV_SSE2
         if( USE_SSE2 )
         {
@@ -266,7 +266,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2,
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-        
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
@@ -278,13 +278,13 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
 {
     Op64 op64;
     Op op;
-    
+
     for( ; sz.height--; src1 += step1/sizeof(src1[0]),
         src2 += step2/sizeof(src2[0]),
         dst += step/sizeof(dst[0]) )
     {
         int x = 0;
-        
+
     #if CV_SSE2
         if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
             for( ; x <= sz.width - 4; x += 4 )
@@ -307,14 +307,14 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step
             v1 = op(src1[x+3], src2[x+3]);
             dst[x+2] = v0; dst[x+3] = v1;
         }
-        
+
         for( ; x < sz.width; x++ )
             dst[x] = op(src1[x], src2[x]);
     }
 }
-    
+
 #if CV_SSE2
-    
+
 struct _VAdd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu8(a,b); }};
 struct _VSub8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu8(a,b); }};
 struct _VMin8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }};
@@ -410,7 +410,7 @@ struct _VAbsDiff32s
         __m128i m = _mm_cmpgt_epi32(b, a);
         return _mm_sub_epi32(_mm_xor_si128(d, m), m);
     }
-};    
+};
 
 struct _VAdd32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_add_ps(a,b); }};
 struct _VSub32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_sub_ps(a,b); }};
@@ -429,7 +429,7 @@ struct _VAdd64f { __m128d operator()(const __m128d& a, const __m128d& b) const {
 struct _VSub64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_sub_pd(a,b); }};
 struct _VMin64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_min_pd(a,b); }};
 struct _VMax64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_max_pd(a,b); }};
-    
+
 static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
 struct _VAbsDiff64f
 {
@@ -437,13 +437,13 @@ struct _VAbsDiff64f
     {
         return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
     }
-};    
-    
+};
+
 struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }};
 struct _VOr8u  { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }};
 struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }};
 struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_andnot_si128(_mm_setzero_si128(),a); }};
-    
+
 #endif
 
 #if CV_SSE2
@@ -451,12 +451,12 @@ struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { re
 #else
 #define IF_SIMD(op) NOP
 #endif
-    
+
 template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
 { return CV_FAST_CAST_8U(a + b); }
 template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
 { return CV_FAST_CAST_8U(a - b); }
-    
+
 template<typename T> struct OpAbsDiff
 {
     typedef T type1;
@@ -470,7 +470,7 @@ template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
 
 template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
 { return saturate_cast<schar>(std::abs(a - b)); }
-    
+
 template<typename T, typename WT=T> struct OpAbsDiffS
 {
     typedef T type1;
@@ -510,19 +510,19 @@ template<typename T> struct OpNot
     typedef T rtype;
     T operator()( T a, T ) const { return ~a; }
 };
-    
+
 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
 {
     if( sz.height == 1 )
         step1 = step2 = step = sz.width*elemSize;
 }
-    
+
 static void add8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
+           ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),  // each pointer is followed by its step (size_t narrowed to int); trailing 0 is presumably the IPP scale factor (NOTE(review): confirm)
            (vBinOp8<uchar, OpAdd<uchar>, IF_SIMD(_VAdd8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -538,7 +538,7 @@ static void add16u( const ushort* src1, size_t step1,
                     ushort* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
+           ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
             (vBinOp16<ushort, OpAdd<ushort>, IF_SIMD(_VAdd16u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -547,7 +547,7 @@ static void add16s( const short* src1, size_t step1,
                     short* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
+           ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0),
            (vBinOp16<short, OpAdd<short>, IF_SIMD(_VAdd16s)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -563,7 +563,7 @@ static void add32f( const float* src1, size_t step1,
                     float* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAdd_32f_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0),
+           ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
            (vBinOp32f<OpAdd<float>, IF_SIMD(_VAdd32f)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -579,7 +579,7 @@ static void sub8u( const uchar* src1, size_t step1,
                    uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
+           ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
            (vBinOp8<uchar, OpSub<uchar>, IF_SIMD(_VSub8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -595,7 +595,7 @@ static void sub16u( const ushort* src1, size_t step1,
                     ushort* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
+           ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
            (vBinOp16<ushort, OpSub<ushort>, IF_SIMD(_VSub16u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -604,7 +604,7 @@ static void sub16s( const short* src1, size_t step1,
                     short* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
+           ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0),
            (vBinOp16<short, OpSub<short>, IF_SIMD(_VSub16s)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -620,7 +620,7 @@ static void sub32f( const float* src1, size_t step1,
                    float* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiSub_32f_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0),
+           ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz),
            (vBinOp32f<OpSub<float>, IF_SIMD(_VSub32f)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -629,18 +629,36 @@ static void sub64f( const double* src1, size_t step1,
                     double* dst, size_t step, Size sz, void* )
 {
     vBinOp64f<OpSub<double>, IF_SIMD(_VSub64f)>(src1, step1, src2, step2, dst, step, sz);
-}    
+}
 
 template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
 template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
-    
+
 static void max8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMaxEvery_8u call per row
+  {
+    uchar* s1 = (uchar*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    uchar* s2 = (uchar*)src2;
+    uchar* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMaxEvery_8u(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 += step1;  // steps are byte counts, which for uchar equals the element stride
+      s2 += step2;
+      d  += step;
+    }
+  }
+#else  // non-IPP build: generic vBinOp8 path with OpMax<uchar>
+  vBinOp8<uchar, OpMax<uchar>, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+
+// The former single-call form, IF_IPP(... ippiMaxEvery_8u_C1R ...), is replaced by the
+// explicit row loop over ippsMaxEvery_8u above; builds where ARITHM_USE_IPP is not 1
+// compute OpMax<uchar> per element through vBinOp8.
 }
 
 static void max8s( const schar* src1, size_t step1,
@@ -654,18 +672,34 @@ static void max16u( const ushort* src1, size_t step1,
                     const ushort* src2, size_t step2,
                     ushort* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMaxEvery_16u call per row
+  {
+    ushort* s1 = (ushort*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    ushort* s2 = (ushort*)src2;
+    ushort* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMaxEvery_16u(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 = (ushort*)((uchar*)s1 + step1);  // step is in bytes, so advance through a uchar* view
+      s2 = (ushort*)((uchar*)s2 + step2);
+      d  = (ushort*)((uchar*)d + step);
+    }
+  }
+#else  // non-IPP build: generic vBinOp16 path with OpMax<ushort>
+  vBinOp16<ushort, OpMax<ushort>, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+
+// The former single-call form, IF_IPP(... ippiMaxEvery_16u_C1R ...), is replaced by the
+// explicit row loop over ippsMaxEvery_16u above; builds where ARITHM_USE_IPP is not 1
+// compute OpMax<ushort> per element through vBinOp16.
 }
 
 static void max16s( const short* src1, size_t step1,
                     const short* src2, size_t step2,
                     short* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMaxEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp16<short, OpMax<short>, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz)));
+    vBinOp16<short, OpMax<short>, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz);  // no IPP branch for signed 16-bit: vBinOp16 with OpMax<short> is always used
 }
 
 static void max32s( const int* src1, size_t step1,
@@ -679,9 +713,26 @@ static void max32f( const float* src1, size_t step1,
                     const float* src2, size_t step2,
                     float* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMaxEvery_32f call per row
+  {
+    float* s1 = (float*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    float* s2 = (float*)src2;
+    float* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMaxEvery_32f(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 = (float*)((uchar*)s1 + step1);  // step is in bytes, so advance through a uchar* view
+      s2 = (float*)((uchar*)s2 + step2);
+      d  = (float*)((uchar*)d + step);
+    }
+  }
+#else  // non-IPP build: generic vBinOp32f path with OpMax<float>
+  vBinOp32f<OpMax<float>, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+// The former single-call form, IF_IPP(... ippiMaxEvery_32f_C1R ...), is replaced by the
+// explicit row loop over ippsMaxEvery_32f above; builds where ARITHM_USE_IPP is not 1
+// compute OpMax<float> per element through vBinOp32f.
 }
 
 static void max64f( const double* src1, size_t step1,
@@ -695,9 +746,27 @@ static void min8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMinEvery_8u call per row
+  {
+    uchar* s1 = (uchar*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    uchar* s2 = (uchar*)src2;
+    uchar* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMinEvery_8u(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 += step1;  // steps are byte counts, which for uchar equals the element stride
+      s2 += step2;
+      d  += step;
+    }
+  }
+#else  // non-IPP build: generic vBinOp8 path with OpMin<uchar>
+  vBinOp8<uchar, OpMin<uchar>, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+
+// The former single-call form, IF_IPP(... ippiMinEvery_8u_C1R ...), is replaced by the
+// explicit row loop over ippsMinEvery_8u above; builds where ARITHM_USE_IPP is not 1
+// compute OpMin<uchar> per element through vBinOp8.
 }
 
 static void min8s( const schar* src1, size_t step1,
@@ -711,18 +780,34 @@ static void min16u( const ushort* src1, size_t step1,
                     const ushort* src2, size_t step2,
                     ushort* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMinEvery_16u call per row
+  {
+    ushort* s1 = (ushort*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    ushort* s2 = (ushort*)src2;
+    ushort* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMinEvery_16u(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 = (ushort*)((uchar*)s1 + step1);  // step is in bytes, so advance through a uchar* view
+      s2 = (ushort*)((uchar*)s2 + step2);
+      d  = (ushort*)((uchar*)d + step);
+    }
+  }
+#else  // non-IPP build: generic vBinOp16 path with OpMin<ushort>
+  vBinOp16<ushort, OpMin<ushort>, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+
+// The former single-call form, IF_IPP(... ippiMinEvery_16u_C1R ...), is replaced by the
+// explicit row loop over ippsMinEvery_16u above; builds where ARITHM_USE_IPP is not 1
+// compute OpMin<ushort> per element through vBinOp16.
 }
 
 static void min16s( const short* src1, size_t step1,
                     const short* src2, size_t step2,
                     short* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMinEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp16<short, OpMin<short>, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz)));
+    vBinOp16<short, OpMin<short>, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz);  // no IPP branch for signed 16-bit: vBinOp16 with OpMin<short> is always used
 }
 
 static void min32s( const int* src1, size_t step1,
@@ -736,9 +821,26 @@ static void min32f( const float* src1, size_t step1,
                     const float* src2, size_t step2,
                     float* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz)));
+#if (ARITHM_USE_IPP == 1)  // IPP build: one ippsMinEvery_32f call per row
+  {
+    float* s1 = (float*)src1;  // row cursors; the casts drop const (NOTE(review): confirm the IPP call needs non-const inputs)
+    float* s2 = (float*)src2;
+    float* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step);  // when sz.height == 1, all steps become sz.width*sizeof(dst[0])
+    for(int i = 0; i < sz.height; i++)
+    {
+      ippsMinEvery_32f(s1, s2, d, sz.width);  // length argument is sz.width (one row); the return value is ignored
+      s1 = (float*)((uchar*)s1 + step1);  // step is in bytes, so advance through a uchar* view
+      s2 = (float*)((uchar*)s2 + step2);
+      d  = (float*)((uchar*)d + step);
+    }
+  }
+#else  // non-IPP build: generic vBinOp32f path with OpMin<float>
+  vBinOp32f<OpMin<float>, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz);
+#endif
+// The former single-call form, IF_IPP(... ippiMinEvery_32f_C1R ...), is replaced by the
+// explicit row loop over ippsMinEvery_32f above; builds where ARITHM_USE_IPP is not 1
+// compute OpMin<float> per element through vBinOp32f.
 }
 
 static void min64f( const double* src1, size_t step1,
@@ -746,14 +848,14 @@ static void min64f( const double* src1, size_t step1,
                     double* dst, size_t step, Size sz, void* )
 {
     vBinOp64f<OpMin<double>, IF_SIMD(_VMin64f)>(src1, step1, src2, step2, dst, step, sz);
-}    
+}
 
 static void absdiff8u( const uchar* src1, size_t step1,
                        const uchar* src2, size_t step2,
                        uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),  // each pointer is followed by its step (size_t narrowed to int); sz is reinterpreted as IppiSize
            (vBinOp8<uchar, OpAbsDiff<uchar>, IF_SIMD(_VAbsDiff8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -769,7 +871,7 @@ static void absdiff16u( const ushort* src1, size_t step1,
                         ushort* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
            (vBinOp16<ushort, OpAbsDiff<ushort>, IF_SIMD(_VAbsDiff16u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -777,9 +879,7 @@ static void absdiff16s( const short* src1, size_t step1,
                         const short* src2, size_t step2,
                         short* dst, size_t step, Size sz, void* )
 {
-    IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAbsDiff_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
-           (vBinOp16<short, OpAbsDiff<short>, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz)));
+    vBinOp16<short, OpAbsDiff<short>, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz);  // no IPP branch for signed 16-bit: vBinOp16 with OpAbsDiff<short> is always used
 }
 
 static void absdiff32s( const int* src1, size_t step1,
@@ -794,7 +894,7 @@ static void absdiff32f( const float* src1, size_t step1,
                         float* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
            (vBinOp32f<OpAbsDiff<float>, IF_SIMD(_VAbsDiff32f)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -803,15 +903,15 @@ static void absdiff64f( const double* src1, size_t step1,
                         double* dst, size_t step, Size sz, void* )
 {
     vBinOp64f<OpAbsDiff<double>, IF_SIMD(_VAbsDiff64f)>(src1, step1, src2, step2, dst, step, sz);
-}    
-    
+}
+
 
 static void and8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),  // each pointer is followed by its step (size_t narrowed to int); sz is reinterpreted as IppiSize
            (vBinOp8<uchar, OpAnd<uchar>, IF_SIMD(_VAnd8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -820,7 +920,7 @@ static void or8u( const uchar* src1, size_t step1,
                   uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
            (vBinOp8<uchar, OpOr<uchar>, IF_SIMD(_VOr8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
 
@@ -829,23 +929,23 @@ static void xor8u( const uchar* src1, size_t step1,
                    uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz),
+           ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz),
            (vBinOp8<uchar, OpXor<uchar>, IF_SIMD(_VXor8u)>(src1, step1, src2, step2, dst, step, sz)));
-}    
+}
 
 static void not8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* )
 {
     IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step);
-           ippiNot_8u_C1R(src1, (int)step1, dst, (IppiSize&)sz),
+           ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, (IppiSize&)sz),  // unary: the IPP call takes only src1 and dst with their steps; src2/step2 appear only in the vBinOp8 expression below
            (vBinOp8<uchar, OpNot<uchar>, IF_SIMD(_VNot8u)>(src1, step1, src2, step2, dst, step, sz)));
 }
-    
+
 /****************************************************************************************\
 *                                   logical operations                                   *
 \****************************************************************************************/
-    
+
 static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
 {
     if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
@@ -856,7 +956,7 @@ static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
     return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
         (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
 }
-    
+
 static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
 {
     int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
@@ -872,9 +972,9 @@ static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, si
     }
     for( size_t i = esz; i < blocksize*esz; i++ )
         scbuf[i] = scbuf[i - esz];
-    
+
 }
-    
+
 void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _dst,
                const InputArray& _mask, const BinaryFunc* tab, bool bitwise)
 {
@@ -883,7 +983,7 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
     bool haveMask = !_mask.empty(), haveScalar = false;
     BinaryFunc func;
     int c;
-    
+
     if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 &&
         src1.size() == src2.size() && src1.type() == src2.type() && !haveMask )
     {
@@ -899,12 +999,12 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
             func = tab[src1.depth()];
             c = src1.channels();
         }
-            
+
         Size sz = getContinuousSize(src1, src2, dst, c);
         func(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, 0);
         return;
     }
-    
+
     if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
         src1.size != src2.size || src1.type() != src2.type() )
     {
@@ -917,13 +1017,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
                       "nor 'array op scalar', nor 'scalar op array'" );
         haveScalar = true;
     }
-    
+
     size_t esz = src1.elemSize();
     size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
     int cn = src1.channels();
     BinaryFunc copymask = 0;
     Mat mask;
-    
+
     if( haveMask )
     {
         mask = _mask.getMat();
@@ -931,13 +1031,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         CV_Assert( mask.size == src1.size );
         copymask = getCopyMaskFunc(esz);
     }
-    
+
     AutoBuffer<uchar> _buf;
     uchar *scbuf = 0, *maskbuf = 0;
-    
+
     _dst.create(src1.dims, src1.size, src1.type());
     Mat dst = _dst.getMat();
-    
+
     if( bitwise )
     {
         func = *tab;
@@ -948,35 +1048,35 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         func = tab[src1.depth()];
         c = cn;
     }
-    
+
     if( !haveScalar )
     {
         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
         uchar* ptrs[4];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size, blocksize = total;
-        
+
         if( haveMask )
         {
             blocksize = std::min(blocksize, blocksize0);
             _buf.allocate(blocksize*esz);
             maskbuf = _buf;
         }
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
             for( size_t j = 0; j < total; j += blocksize )
             {
                 int bsz = (int)std::min(total - j, blocksize);
-                
-                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );                
+
+                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
                 if( haveMask )
                 {
                     copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
                     ptrs[3] += bsz;
                 }
-                
+
                 bsz *= (int)esz;
                 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
             }
@@ -986,41 +1086,41 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
     {
         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
         uchar* ptrs[3];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size, blocksize = std::min(total, blocksize0);
-        
+
         _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
         scbuf = _buf;
         maskbuf = alignPtr(scbuf + blocksize*esz, 16);
-        
+
         convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
             for( size_t j = 0; j < total; j += blocksize )
             {
                 int bsz = (int)std::min(total - j, blocksize);
-                
+
                 func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 );
                 if( haveMask )
                 {
                     copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
                     ptrs[2] += bsz;
                 }
-                
+
                 bsz *= (int)esz;
                 ptrs[0] += bsz; ptrs[1] += bsz;
             }
         }
     }
 }
-    
+
 static BinaryFunc maxTab[] =
 {
     (BinaryFunc)max8u, (BinaryFunc)max8s, (BinaryFunc)max16u, (BinaryFunc)max16s,
     (BinaryFunc)max32s, (BinaryFunc)max32f, (BinaryFunc)max64f, 0
-};    
+};
 
 static BinaryFunc minTab[] =
 {
@@ -1029,7 +1129,7 @@ static BinaryFunc minTab[] =
 };
 
 }
-    
+
 void cv::bitwise_and(const InputArray& a, const InputArray& b, OutputArray c, const InputArray& mask)
 {
     BinaryFunc f = and8u;
@@ -1068,26 +1168,26 @@ void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
 {
     OutputArray _dst(dst);
     binary_op(src1, src2, _dst, InputArray(), maxTab, false );
-}    
-    
+}
+
 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
 {
     OutputArray _dst(dst);
     binary_op(src1, src2, _dst, InputArray(), minTab, false );
 }
-    
+
 void cv::max(const Mat& src1, double src2, Mat& dst)
 {
     OutputArray _dst(dst);
     binary_op(src1, src2, _dst, InputArray(), maxTab, false );
-}    
+}
 
 void cv::min(const Mat& src1, double src2, Mat& dst)
 {
     OutputArray _dst(dst);
     binary_op(src1, src2, _dst, InputArray(), minTab, false );
 }
-    
+
 /****************************************************************************************\
 *                                      add/subtract                                      *
 \****************************************************************************************/
@@ -1101,7 +1201,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
     int kind1 = _src1.kind(), kind2 = _src2.kind();
     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
     bool haveMask = !_mask.empty();
-    
+
     if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 &&
         src1.size() == src2.size() && src1.type() == src2.type() &&
         !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) ||
@@ -1113,9 +1213,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
         return;
     }
-    
+
     bool haveScalar = false, swapped12 = false;
-    
+
     if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
         src1.size != src2.size || src1.channels() != src2.channels() )
     {
@@ -1131,10 +1231,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
                      "nor 'array op scalar', nor 'scalar op array'" );
         haveScalar = true;
     }
-    
+
     int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth(), wtype;
     BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0;
-    
+
     if( dtype < 0 )
     {
         if( _dst.fixedType() )
@@ -1149,7 +1249,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         }
     }
     dtype = CV_MAT_DEPTH(dtype);
-    
+
     if( depth1 == depth2 && dtype == depth1 )
         wtype = dtype;
     else if( !muldiv )
@@ -1157,7 +1257,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
                 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
         wtype = std::max(wtype, dtype);
-    
+
         // when the result of addition should be converted to an integer type,
         // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation,
         // instead of converting the other input to floating-point and then converting the operation result back to integers.
@@ -1169,20 +1269,20 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         wtype = std::max(depth1, std::max(depth2, CV_32F));
         wtype = std::max(wtype, dtype);
     }
-    
+
     cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype);
     cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype);
     cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
-    
+
     dtype = CV_MAKETYPE(dtype, cn);
     wtype = CV_MAKETYPE(wtype, cn);
-    
+
     size_t esz1 = src1.elemSize(), esz2 = src2.elemSize();
     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
     size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
     BinaryFunc copymask = 0;
     Mat mask;
-    
+
     if( haveMask )
     {
         mask = _mask.getMat();
@@ -1190,23 +1290,23 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         CV_Assert( mask.size == src1.size );
         copymask = getCopyMaskFunc(dsz);
     }
-    
+
     AutoBuffer<uchar> _buf;
     uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
     size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0);
-    
+
     _dst.create(src1.dims, src1.size, src1.type());
     Mat dst = _dst.getMat();
     BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
-    
+
     if( !haveScalar )
     {
         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
         uchar* ptrs[4];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size, blocksize = total;
-        
+
         if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
             blocksize = std::min(blocksize, blocksize0);
 
@@ -1221,7 +1321,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
             buf = alignPtr(buf + blocksize*wsz, 16);
         if( haveMask )
             maskbuf = buf;
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
             for( size_t j = 0; j < total; j += blocksize )
@@ -1242,7 +1342,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
                     cvtsrc2( sptr2, 0, 0, 0, buf2, 0, bszn, 0 );
                     sptr2 = buf2;
                 }
-                
+
                 if( !haveMask && !cvtdst )
                     func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
                 else
@@ -1270,10 +1370,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
     {
         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
         uchar* ptrs[3];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size, blocksize = std::min(total, blocksize0);
-        
+
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf;
         if( cvtsrc1 )
@@ -1284,9 +1384,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
             buf = alignPtr(buf + blocksize*wsz, 16);
         if( haveMask )
             maskbuf = buf;
-        
+
         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
             for( size_t j = 0; j < total; j += blocksize )
@@ -1296,16 +1396,16 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
                 const uchar *sptr1 = ptrs[0];
                 const uchar* sptr2 = buf2;
                 uchar* dptr = ptrs[1];
-                
+
                 if( cvtsrc1 )
                 {
                     cvtsrc1( sptr1, 0, 0, 0, buf1, 0, bszn, 0 );
                     sptr1 = buf1;
                 }
-                
+
                 if( swapped12 )
                     std::swap(sptr1, sptr2);
-                
+
                 if( !haveMask && !cvtdst )
                     func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata );
                 else
@@ -1330,13 +1430,13 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d
         }
     }
 }
- 
+
 static BinaryFunc addTab[] =
 {
     (BinaryFunc)add8u, (BinaryFunc)add8s, (BinaryFunc)add16u, (BinaryFunc)add16s,
     (BinaryFunc)add32s, (BinaryFunc)add32f, (BinaryFunc)add64f, 0
 };
-    
+
 static BinaryFunc subTab[] =
 {
     (BinaryFunc)sub8u, (BinaryFunc)sub8s, (BinaryFunc)sub16u, (BinaryFunc)sub16s,
@@ -1348,10 +1448,10 @@ static BinaryFunc absdiffTab[] =
     (BinaryFunc)absdiff8u, (BinaryFunc)absdiff8s, (BinaryFunc)absdiff16u,
     (BinaryFunc)absdiff16s, (BinaryFunc)absdiff32s, (BinaryFunc)absdiff32f,
     (BinaryFunc)absdiff64f, 0
-};    
+};
 
 }
-    
+
 void cv::add( const InputArray& src1, const InputArray& src2, OutputArray dst,
           const InputArray& mask, int dtype )
 {
@@ -1367,7 +1467,7 @@ void cv::subtract( const InputArray& src1, const InputArray& src2, OutputArray d
 void cv::absdiff( const InputArray& src1, const InputArray& src2, OutputArray dst )
 {
     arithm_op(src1, src2, dst, InputArray(), -1, absdiffTab);
-}    
+}
 
 /****************************************************************************************\
 *                                    multiply/divide                                     *
@@ -1437,7 +1537,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
     step1 /= sizeof(src1[0]);
     step2 /= sizeof(src2[0]);
     step /= sizeof(dst[0]);
-    
+
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int i = 0;
@@ -1450,12 +1550,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                 double d = scale/(a * b);
                 b *= d;
                 a *= d;
-                
+
                 T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
                 T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
                 T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
                 T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));
-                
+
                 dst[i] = z0; dst[i+1] = z1;
                 dst[i+2] = z2; dst[i+3] = z3;
             }
@@ -1465,12 +1565,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2,
                 T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
                 T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
                 T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;
-                
+
                 dst[i] = z0; dst[i+1] = z1;
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-        
+
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
     }
@@ -1482,7 +1582,7 @@ recip_( const T*, size_t, const T* src2, size_t step2,
 {
     step2 /= sizeof(src2[0]);
     step /= sizeof(dst[0]);
-    
+
     for( ; size.height--; src2 += step2, dst += step )
     {
         int i = 0;
@@ -1495,12 +1595,12 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                 double d = scale/(a * b);
                 b *= d;
                 a *= d;
-                
+
                 T z0 = saturate_cast<T>(src2[i+1] * b);
                 T z1 = saturate_cast<T>(src2[i] * b);
                 T z2 = saturate_cast<T>(src2[i+3] * a);
                 T z3 = saturate_cast<T>(src2[i+2] * a);
-                
+
                 dst[i] = z0; dst[i+1] = z1;
                 dst[i+2] = z2; dst[i+3] = z3;
             }
@@ -1515,13 +1615,13 @@ recip_( const T*, size_t, const T* src2, size_t step2,
                 dst[i+2] = z2; dst[i+3] = z3;
             }
         }
-        
+
         for( ; i < size.width; i++ )
             dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
     }
 }
-    
-    
+
+
 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* scale)
 {
@@ -1551,7 +1651,7 @@ static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2
 {
     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
 }
-    
+
 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
                     float* dst, size_t step, Size sz, void* scale)
 {
@@ -1563,7 +1663,7 @@ static void mul64f( const double* src1, size_t step1, const double* src2, size_t
 {
     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
 }
-    
+
 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                    uchar* dst, size_t step, Size sz, void* scale)
 {
@@ -1650,8 +1750,8 @@ static void recip64f( const double* src1, size_t step1, const double* src2, size
 {
     recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
 }
-    
-    
+
+
 static BinaryFunc mulTab[] =
 {
     (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
@@ -1673,9 +1773,9 @@ static BinaryFunc recipTab[] =
     (BinaryFunc)recip64f, 0
 };
 
-    
+
 }
-    
+
 void cv::multiply(const InputArray& src1, const InputArray& src2,
                   OutputArray dst, double scale, int dtype)
 {
@@ -1692,8 +1792,8 @@ void cv::divide(double scale, const InputArray& src2,
                 OutputArray dst, int dtype)
 {
     arithm_op(src2, src2, dst, InputArray(), dtype, recipTab, true, &scale);
-}    
-    
+}
+
 /****************************************************************************************\
 *                                      addWeighted                                       *
 \****************************************************************************************/
@@ -1739,34 +1839,34 @@ addWeighted8u( const uchar* src1, size_t step1,
 {
     const double* scalars = (const double*)_scalars;
     float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
-    
+
     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
     {
         int x = 0;
-        
+
 #if CV_SSE2
         if( USE_SSE2 )
         {
             __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
             __m128i z = _mm_setzero_si128();
-            
+
             for( ; x <= size.width - 8; x += 8 )
             {
                 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
                 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
-                
+
                 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
                 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
                 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
                 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
-                
+
                 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
                 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
                 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
-                
+
                 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
                 u = _mm_packus_epi16(u, u);
-                
+
                 _mm_storel_epi64((__m128i*)(dst + x), u);
             }
         }
@@ -1837,9 +1937,9 @@ static BinaryFunc addWeightedTab[] =
     (BinaryFunc)addWeighted16s, (BinaryFunc)addWeighted32s, (BinaryFunc)addWeighted32f,
     (BinaryFunc)addWeighted64f, 0
 };
-    
+
 }
-    
+
 void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& src2,
                       double beta, double gamma, OutputArray dst, int dtype )
 {
@@ -1847,7 +1947,7 @@ void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& sr
     arithm_op(src1, src2, dst, InputArray(), dtype, addWeightedTab, true, scalars);
 }
 
-    
+
 /****************************************************************************************\
 *                                          compare                                       *
 \****************************************************************************************/
@@ -1867,7 +1967,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
         std::swap(step1, step2);
         code = code == CMP_GE ? CMP_LE : CMP_GT;
     }
-    
+
     if( code == CMP_GT || code == CMP_LE )
     {
         int m = code == CMP_GT ? 0 : 255;
@@ -1884,7 +1984,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] > src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-            
+
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
         }
@@ -1905,14 +2005,14 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
                 t1 = -(src1[x+3] == src2[x+3]) ^ m;
                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
             }
-            
+
             for( ; x < size.width; x++ )
                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
         }
     }
 }
-    
-    
+
+
 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size size, void* _cmpop)
 {
@@ -1953,8 +2053,8 @@ static void cmp64f(const double* src1, size_t step1, const double* src2, size_t
                   uchar* dst, size_t step, Size size, void* _cmpop)
 {
     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
-}    
-    
+}
+
 static BinaryFunc cmpTab[] =
 {
     (BinaryFunc)cmp8u, (BinaryFunc)cmp8s, (BinaryFunc)cmp16u,
@@ -1962,7 +2062,7 @@ static BinaryFunc cmpTab[] =
     (BinaryFunc)cmp64f, 0
 };
 
-    
+
 static double getMinVal(int depth)
 {
     static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
@@ -1973,18 +2073,18 @@ static double getMaxVal(int depth)
 {
     static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
     return tab[depth];
-}    
-   
 }
-    
+
+}
+
 void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _dst, int op)
 {
     CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
                op == CMP_NE || op == CMP_GE || op == CMP_GT );
-    
+
     int kind1 = _src1.kind(), kind2 = _src2.kind();
     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
-    
+
     if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
     {
         _dst.create(src1.size(), CV_8UC1);
@@ -1993,9 +2093,9 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
         cmpTab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, &op);
         return;
     }
-    
+
     bool haveScalar = false;
-    
+
     if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 ||
         src1.size != src2.size || src1.type() != src2.type() )
     {
@@ -2012,26 +2112,26 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
                      "nor 'array op scalar', nor 'scalar op array'" );
         haveScalar = true;
     }
-    
+
     int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
     if( cn != 1 )
         CV_Error( CV_StsUnsupportedFormat, "compare() can only process single-channel arrays" );
-        
+
     size_t esz = src1.elemSize();
     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
-    
+
     _dst.create(src1.dims, src1.size, CV_8U);
     Mat dst = _dst.getMat();
     BinaryFunc func = cmpTab[depth1];
-    
+
     if( !haveScalar )
     {
         const Mat* arrays[] = { &src1, &src2, &dst, 0 };
         uchar* ptrs[3];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size;
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
             func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
     }
@@ -2039,10 +2139,10 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
     {
         const Mat* arrays[] = { &src1, &dst, 0 };
         uchar* ptrs[2];
-        
+
         NAryMatIterator it(arrays, ptrs);
         size_t total = it.size, blocksize = std::min(total, blocksize0);
-        
+
         AutoBuffer<uchar> _buf(blocksize*esz);
         uchar *buf = _buf;
 
@@ -2057,13 +2157,13 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
                 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
                 return;
             }
-            
+
             if( fval > getMaxVal(depth1) )
             {
                 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
                 return;
             }
-            
+
             int ival = cvRound(fval);
             if( fval != ival )
             {
@@ -2079,7 +2179,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
             }
             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
         }
-        
+
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
             for( size_t j = 0; j < total; j += blocksize )
@@ -2092,7 +2192,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _
         }
     }
 }
-    
+
 /****************************************************************************************\
 *                                        inRange                                         *
 \****************************************************************************************/
@@ -2108,7 +2208,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
     step1 /= sizeof(src1[0]);
     step2 /= sizeof(src2[0]);
     step3 /= sizeof(src3[0]);
-    
+
     for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
     {
         int x = 0;
@@ -2122,13 +2222,13 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
             t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
             dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
         }
-            
+
         for( ; x < size.width; x++ )
             dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
     }
 }
 
-    
+
 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                       const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
 {
@@ -2169,7 +2269,7 @@ static void inRange64f(const double* src1, size_t step1, const double* src2, siz
                        const double* src3, size_t step3, uchar* dst, size_t step, Size size)
 {
     inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
-}    
+}
 
 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
 {
@@ -2187,14 +2287,14 @@ static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
     else
         for( i = j = 0; i < len; i++, j += cn )
             dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
-    
+
     for( ; k < cn; k += 4 )
     {
         for( i = 0, j = k; i < len; i++, j += cn )
             dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
     }
 }
-    
+
 typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                              const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
 
@@ -2204,7 +2304,7 @@ static InRangeFunc inRangeTab[] =
     (InRangeFunc)inRange16s, (InRangeFunc)inRange32s, (InRangeFunc)inRange32f,
     (InRangeFunc)inRange64f, 0
 };
-    
+
 }
 
 void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
@@ -2212,9 +2312,9 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
 {
     int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
     Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
-    
+
     bool lbScalar = false, ubScalar = false;
-    
+
     if( (lkind == InputArray::MATX && skind != InputArray::MATX) ||
         src.size != lb.size || src.type() != lb.type() )
     {
@@ -2223,7 +2323,7 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
                      "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
         lbScalar = true;
     }
-    
+
     if( (ukind == InputArray::MATX && skind != InputArray::MATX) ||
         src.size != ub.size || src.type() != ub.type() )
     {
@@ -2232,47 +2332,47 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
                      "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
         ubScalar = true;
     }
-    
+
     CV_Assert( ((int)lbScalar ^ (int)ubScalar) == 0 );
-    
+
     int cn = src.channels(), depth = src.depth();
-    
+
     size_t esz = src.elemSize();
     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
-    
+
     _dst.create(src.dims, src.size, CV_8U);
     Mat dst = _dst.getMat();
     InRangeFunc func = inRangeTab[depth];
-    
+
     const Mat* arrays_sc[] = { &src, &dst, 0 };
     const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
     uchar* ptrs[4];
-    
+
     NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
     size_t total = it.size, blocksize = std::min(total, blocksize0);
-    
+
     AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
     uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
     buf = alignPtr(buf + blocksize*cn, 16);
-    
+
     if( lbScalar && ubScalar )
     {
         lbuf = buf;
         ubuf = buf = alignPtr(buf + blocksize*esz, 16);
-        
+
         CV_Assert( lb.type() == ub.type() );
         int scdepth = lb.depth();
-        
+
         if( scdepth != depth && depth < CV_32S )
         {
             int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
             int* iubuf = ilbuf + cn;
-            
+
             BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
             sccvtfunc(lb.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0);
             sccvtfunc(ub.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0);
             int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
-            
+
             for( int k = 0; k < cn; k++ )
             {
                 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
@@ -2281,11 +2381,11 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb,
             lb = Mat(cn, 1, CV_32S, ilbuf);
             ub = Mat(cn, 1, CV_32S, iubuf);
         }
-        
+
         convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
         convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
     }
-    
+
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         for( size_t j = 0; j < total; j += blocksize )
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 9d0d957dc4..1af4950249 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -646,8 +646,8 @@ static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
 {
     GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
 }
-    
-    
+
+
 static void GEMMStore_32f( const float* c_data, size_t c_step,
           const double* d_buf, size_t d_buf_step,
           float* d_data, size_t d_step, Size d_size,
@@ -664,7 +664,7 @@ static void GEMMStore_64f( const double* c_data, size_t c_step,
 {
     GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
 }
-    
+
 
 static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
                           const Complexd* d_buf, size_t d_buf_step,
@@ -1130,7 +1130,7 @@ void cv::gemm( const InputArray& matA, const InputArray& matB, double alpha,
         int dm0, dn0, dk0;
         size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
         int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
-        
+
         if( !is_a_t )
             a_step0 = A.step, a_step1 = elem_size;
         else
@@ -1273,7 +1273,7 @@ template<typename T, typename WT> static void
 transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
 {
     int x;
-    
+
     if( scn == 2 && dcn == 2 )
     {
         for( x = 0; x < len*2; x += 2 )
@@ -1352,7 +1352,7 @@ load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, _
 }
 
 #endif
-    
+
 static void
 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 {
@@ -1379,7 +1379,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
         __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0);
         __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0);
         int x = 0;
-        
+
         for( ; x <= (len - 8)*3; x += 8*3 )
         {
             __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1;
@@ -1470,14 +1470,14 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
         return;
     }
 #endif
-    
+
     transform_(src, dst, m, len, scn, dcn);
 }
 
 static void
 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SSE2    
+#if CV_SSE2
     if( USE_SSE2 && scn == 3 && dcn == 3 )
     {
         __m128 m0, m1, m2, m3;
@@ -1536,11 +1536,11 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
         return;
     }
 #endif
-    
+
     transform_(src, dst, m, len, scn, dcn);
 }
-    
-    
+
+
 static void
 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
 {
@@ -1574,12 +1574,12 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
             }
             return;
         }
-        
+
         if( scn == 4 && dcn == 4 )
         {
             __m128 m0, m1, m2, m3, m4;
             load4x4Matrix(m, m0, m1, m2, m3, m4);
-        
+
             for( ; x < len*4; x += 4 )
             {
                 __m128 x0 = _mm_loadu_ps(src + x);
@@ -1616,18 +1616,18 @@ transform_32s(const int* src, int* dst, const double* m, int len, int scn, int d
 {
     transform_(src, dst, m, len, scn, dcn);
 }
-    
+
 static void
 transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
 {
     transform_(src, dst, m, len, scn, dcn);
-}    
-    
+}
+
 template<typename T, typename WT> static void
 diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
 {
     int x;
-    
+
     if( cn == 2 )
     {
         for( x = 0; x < len*2; x += 2 )
@@ -1674,8 +1674,8 @@ static void
 diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
 {
     diagtransform_(src, dst, m, len, scn, dcn);
-}    
-    
+}
+
 static void
 diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
 {
@@ -1686,8 +1686,8 @@ static void
 diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
 {
     diagtransform_(src, dst, m, len, scn, dcn);
-}    
-    
+}
+
 static void
 diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
 {
@@ -1704,17 +1704,17 @@ static void
 diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
 {
     diagtransform_(src, dst, m, len, scn, dcn);
-}    
-    
+}
+
 static void
 diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
 {
     diagtransform_(src, dst, m, len, scn, dcn);
-}    
-    
-    
+}
+
+
 typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );
-    
+
 static TransformFunc transformTab[] =
 {
     (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
@@ -1728,23 +1728,23 @@ static TransformFunc diagTransformTab[] =
     (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
     (TransformFunc)diagtransform_64f, 0
 };
-    
+
 }
-    
+
 void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx )
 {
     Mat src = _src.getMat(), m = _mtx.getMat();
     int depth = src.depth(), scn = src.channels(), dcn = m.rows;
     CV_Assert( scn == m.cols || scn + 1 == m.cols );
     bool isDiag = false;
-    
+
     _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
     Mat dst = _dst.getMat();
 
     int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
     AutoBuffer<double> _mbuf;
     double* mbuf = _mbuf;
-    
+
     if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
     {
         _mbuf.allocate(dcn*(scn+1));
@@ -1791,12 +1791,12 @@ void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray&
 
     TransformFunc func = isDiag ? diagTransformTab[depth] : transformTab[depth];
     CV_Assert( func != 0 );
-    
+
     const Mat* arrays[] = {&src, &dst, 0};
     uchar* ptrs[2];
     NAryMatIterator it(arrays, ptrs);
     size_t i, total = it.size;
-    
+
     for( i = 0; i < it.nplanes; i++, ++it )
         func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
 }
@@ -1813,7 +1813,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
 {
     const double eps = FLT_EPSILON;
     int i;
-    
+
     if( scn == 2 && dcn == 2 )
     {
         for( i = 0; i < len*2; i += 2 )
@@ -1837,7 +1837,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
         {
             T x = src[i], y = src[i + 1], z = src[i + 2];
             double w = x*m[12] + y*m[13] + z*m[14] + m[15];
-            
+
             if( fabs(w) > eps )
             {
                 w = 1./w;
@@ -1855,7 +1855,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
         {
             T x = src[0], y = src[1], z = src[2];
             double w = x*m[8] + y*m[9] + z*m[10] + m[11];
-            
+
             if( fabs(w) > eps )
             {
                 w = 1./w;
@@ -1893,7 +1893,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn,
     }
 }
 
-    
+
 static void
 perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
 {
@@ -1905,22 +1905,22 @@ perspectiveTransform_64f(const double* src, double* dst, const double* m, int le
 {
     perspectiveTransform_(src, dst, m, len, scn, dcn);
 }
-    
+
 }
-    
+
 void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx )
 {
     Mat src = _src.getMat(), m = _mtx.getMat();
     int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
     CV_Assert( scn + 1 == m.cols && (depth == CV_32F || depth == CV_64F));
-    
+
     _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
     Mat dst = _dst.getMat();
-    
+
     const int mtype = CV_64F;
     AutoBuffer<double> _mbuf;
     double* mbuf = _mbuf;
-    
+
     if( !m.isContinuous() || m.type() != mtype )
     {
         _mbuf.allocate((dcn+1)*(scn+1));
@@ -1930,20 +1930,20 @@ void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const I
     }
     else
         mbuf = (double*)m.data;
-    
+
     TransformFunc func = depth == CV_32F ?
         (TransformFunc)perspectiveTransform_32f :
         (TransformFunc)perspectiveTransform_64f;
     CV_Assert( func != 0 );
-    
+
     const Mat* arrays[] = {&src, &dst, 0};
     uchar* ptrs[2];
     NAryMatIterator it(arrays, ptrs);
     size_t i, total = it.size;
-    
+
     for( i = 0; i < it.nplanes; i++, ++it )
         func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
-}    
+}
 
 /****************************************************************************************\
 *                                       ScaleAdd                                         *
@@ -2000,7 +2000,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
         dst[i] = src1[i]*alpha + src2[i];
 }
 
-    
+
 static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
                          int len, double* _alpha)
 {
@@ -2040,39 +2040,39 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
 
 }
-    
+
 void cv::scaleAdd( const InputArray& _src1, double alpha, const InputArray& _src2, OutputArray _dst )
 {
     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
     int depth = src1.depth(), cn = src1.channels();
-    
+
     CV_Assert( src1.type() == src2.type() );
     if( depth < CV_32F )
     {
         addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
         return;
     }
-    
+
     _dst.create(src1.dims, src1.size, src1.type());
     Mat dst = _dst.getMat();
-    
+
     float falpha = (float)alpha;
     void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;
-    
+
     ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f; 
-    
+
     if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() )
     {
         size_t len = src1.total()*cn;
         func(src1.data, src2.data, dst.data, (int)len, palpha);
         return;
     }
-    
+
     const Mat* arrays[] = {&src1, &src2, &dst, 0};
     uchar* ptrs[3];
     NAryMatIterator it(arrays, ptrs);
     size_t i, len = it.size*cn;
-    
+
     for( i = 0; i < it.nplanes; i++, ++it )
         func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
 }
@@ -2243,7 +2243,7 @@ double cv::Mahalonobis( const InputArray& _v1, const InputArray& _v2, const Inpu
 {
     return Mahalanobis(_v1, _v2, _icovar);
 }
-    
+
 /****************************************************************************************\
 *                                        MulTransposed                                   *
 \****************************************************************************************/
@@ -2445,7 +2445,7 @@ MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scal
 typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);
 
 }
-    
+
 void cv::mulTransposed( const InputArray& _src, OutputArray _dst, bool ata,
                         const InputArray& _delta, double scale, int dtype )
 {
@@ -2578,7 +2578,7 @@ dotProd_(const T* src1, const T* src2, int len)
             (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
     for( ; i < len; i++ )
         result += (double)src1[i]*src2[i];
-    
+
     return result;
 }
 
@@ -2590,9 +2590,10 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
     ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])),
                           src2, (int)(len*sizeof(src2[0])),
                           ippiSize(len, 1), &r);
+    return r;
 #else
     int i = 0;
-    
+
 #if CV_SSE2
     if( USE_SSE2 )
     {
@@ -2616,7 +2617,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
                 s = _mm_add_epi32(s, s0);
                 s = _mm_add_epi32(s, s2);
             }
-            
+
             for( ; j < blockSize; j += 4 )
             {
                 __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z);
@@ -2627,7 +2628,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
             CV_DECL_ALIGNED(16) int buf[4];
             _mm_store_si128((__m128i*)buf, s);
             r += buf[0] + buf[1] + buf[2] + buf[3];
-            
+
             src1 += blockSize;
             src2 += blockSize;
             i += blockSize;
@@ -2692,7 +2693,7 @@ static double dotProd_64f(const double* src1, const double* src2, int len)
 
 
 typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);
-    
+
 static DotProdFunc dotProdTab[] =
 {
     (DotProdFunc)dotProd_8u, (DotProdFunc)dotProd_8s, (DotProdFunc)dotProd_16u,
@@ -2713,16 +2714,16 @@ double Mat::dot(const InputArray& _mat) const
         if( len == (size_t)(int)len )
             return func(data, mat.data, len);
     }
-    
+
     const Mat* arrays[] = {this, &mat, 0};
     uchar* ptrs[2];
     NAryMatIterator it(arrays, ptrs);
     int len = (int)(it.size*cn);
     double r = 0;
-    
+
     for( size_t i = 0; i < it.nplanes; i++, ++it )
         r += func( ptrs[0], ptrs[1], len );
-    
+
     return r;
 }
 
@@ -3027,12 +3028,12 @@ cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigen
     evects = pca.eigenvectors;
     int ecount0 = evals0.cols + evals0.rows - 1;
     int ecount = evals.cols + evals.rows - 1;
-    
+
     CV_Assert( (evals0.cols == 1 || evals0.rows == 1) &&
                 ecount0 <= ecount &&
                 evects0.cols == evects.cols &&
                 evects0.rows == ecount0 );
-    
+
     cv::Mat temp = evals0;
     if( evals.rows == 1 )
         evals.colRange(0, ecount0).convertTo(temp, evals0.type());
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 09711b69a1..24c81ace60 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -87,7 +87,7 @@ extern const uchar g_Saturate8u[];
 void deleteThreadAllocData();
 void deleteThreadRNGData();
 #endif
-    
+
 template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
 {
     typedef T1 type1;
@@ -176,24 +176,24 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
                            void*);
 
 BinaryFunc getConvertFunc(int sdepth, int ddepth);
-BinaryFunc getConvertScaleFunc(int sdepth, int ddepth);  
+BinaryFunc getConvertScaleFunc(int sdepth, int ddepth);
 BinaryFunc getCopyMaskFunc(size_t esz);
 
 enum { BLOCK_SIZE = 1024 };
 
 #ifdef HAVE_IPP
-static inline IppiSize ippiSize(int width, int height) { IppiSize sz={width, height}; return sz; }
-static inline IppiSize ippiSize(Size _sz) { reIppiSize sz={_sz.width, _sz.height}; return sz; }
+static inline IppiSize ippiSize(int width, int height) { IppiSize sz = { width, height}; return sz; }
+static inline IppiSize ippiSize(Size _sz)              { IppiSize sz = { _sz.width, _sz.height}; return sz; }
 #endif
-    
+
 #if defined HAVE_IPP && (IPP_VERSION_MAJOR >= 7)
 #define ARITHM_USE_IPP 1
 #define IF_IPP(then_call, else_call) then_call
 #else
 #define ARITHM_USE_IPP 0
 #define IF_IPP(then_call, else_call) else_call
-#endif    
-    
+#endif
+
 }
 
 #endif /*_CXCORE_INTERNAL_H_*/
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index e4ef11e91c..474eab7ed3 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -170,9 +170,10 @@ struct IPPInitializer
 IPPInitializer ippInitializer;
 #else
 volatile bool useOptimizedFlag = false;
-volatile bool USE_SSE2 = false;
 #endif
 
+volatile bool USE_SSE2 = false;
+
 void setUseOptimized( bool flag )
 {
     useOptimizedFlag = flag;