From 2057f2c4521a62cf214196cf79550edb823dfe81 Mon Sep 17 00:00:00 2001 From: Vladimir Dudnik Date: Mon, 18 Apr 2011 21:24:57 +0000 Subject: [PATCH] fixed build issues related to changes in IPP calls. --- modules/core/src/arithm.cpp | 552 +++++++++++++++++++++-------------- modules/core/src/matmul.cpp | 137 ++++----- modules/core/src/precomp.hpp | 14 +- modules/core/src/system.cpp | 3 +- 4 files changed, 404 insertions(+), 302 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index d0fc0f44a4..dc9ad1c640 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -56,7 +56,7 @@ struct IPPArithmInitializer { IPPArithmInitializer(void) { - IppStatus status = ippStaticInit(); + ippStaticInit(); } }; @@ -64,19 +64,19 @@ IPPArithmInitializer ippArithmInitializer; #endif struct NOP {}; - + template void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz) { Op8 op8; Op op; - + for( ; sz.height--; src1 += step1/sizeof(src1[0]), src2 += step2/sizeof(src2[0]), dst += step/sizeof(dst[0]) ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 ) { @@ -97,7 +97,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s } } #endif - + for( ; x <= sz.width - 4; x += 4 ) { T v0 = op(src1[x], src2[x]); @@ -107,7 +107,7 @@ void vBinOp8(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, s v1 = op(src1[x+3], src2[x+3]); dst[x+2] = v0; dst[x+3] = v1; } - + for( ; x < sz.width; x++ ) dst[x] = op(src1[x], src2[x]); } @@ -119,13 +119,13 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2, { Op16 op16; Op op; - + for( ; sz.height--; src1 += step1/sizeof(src1[0]), src2 += step2/sizeof(src2[0]), dst += step/sizeof(dst[0]) ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 ) { @@ -147,7 +147,7 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2, } else #endif - + for( ; x <= sz.width - 4; x += 4 ) { T v0 = op(src1[x], src2[x]); @@ -157,26 +157,26 @@ void vBinOp16(const T* src1, size_t step1, const T* src2, size_t step2, v1 = op(src1[x+3], src2[x+3]); dst[x+2] = v0; dst[x+3] = v1; } - + for( ; x < sz.width; x++ ) dst[x] = op(src1[x], src2[x]); } } - + template void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz) { Op32 op32; Op op; - + for( ; sz.height--; src1 += step1/sizeof(src1[0]), src2 += step2/sizeof(src2[0]), dst += step/sizeof(dst[0]) ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 ) { @@ -202,7 +202,7 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2, } } #endif - + for( ; x <= sz.width - 4; x += 4 ) { int v0 = op(src1[x], src2[x]); @@ -212,26 +212,26 @@ void vBinOp32s(const int* src1, size_t step1, const int* src2, size_t step2, v1 = op(src1[x+3], src2[x+3]); dst[x+2] = v0; dst[x+3] = v1; } - + for( ; x < sz.width; x++ ) dst[x] = op(src1[x], src2[x]); } } - + template void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz) { Op32 op32; Op op; - + for( ; sz.height--; src1 += step1/sizeof(src1[0]), src2 += step2/sizeof(src2[0]), dst += step/sizeof(dst[0]) ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 ) { @@ -266,7 +266,7 @@ void vBinOp32f(const float* src1, size_t step1, const float* src2, size_t step2, v1 = op(src1[x+3], src2[x+3]); dst[x+2] = v0; dst[x+3] = v1; } - + for( ; x < sz.width; x++ ) dst[x] = op(src1[x], src2[x]); } @@ -278,13 +278,13 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step { Op64 op64; Op op; - + for( ; sz.height--; src1 += step1/sizeof(src1[0]), src2 += step2/sizeof(src2[0]), dst += step/sizeof(dst[0]) ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 && (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) for( ; x <= sz.width - 4; x += 4 ) @@ -307,14 +307,14 @@ void vBinOp64f(const double* src1, size_t step1, const double* src2, size_t step v1 = op(src1[x+3], src2[x+3]); dst[x+2] = v0; dst[x+3] = v1; } - + for( ; x < sz.width; x++ ) dst[x] = op(src1[x], src2[x]); } } - + #if CV_SSE2 - + struct _VAdd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_adds_epu8(a,b); }}; struct _VSub8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_subs_epu8(a,b); }}; struct _VMin8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }}; @@ -410,7 +410,7 @@ struct _VAbsDiff32s __m128i m = _mm_cmpgt_epi32(b, a); return _mm_sub_epi32(_mm_xor_si128(d, m), m); } -}; +}; struct _VAdd32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_add_ps(a,b); }}; struct _VSub32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_sub_ps(a,b); }}; @@ -429,7 +429,7 @@ struct _VAdd64f { __m128d operator()(const __m128d& a, const __m128d& b) const { struct _VSub64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_sub_pd(a,b); }}; struct _VMin64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_min_pd(a,b); }}; struct _VMax64f { __m128d operator()(const __m128d& a, const __m128d& b) const { return _mm_max_pd(a,b); }}; - + static int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; struct _VAbsDiff64f { @@ -437,13 +437,13 @@ struct _VAbsDiff64f { return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); } -}; - +}; + struct _VAnd8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_and_si128(a,b); }}; struct _VOr8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_or_si128(a,b); }}; struct _VXor8u { __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_xor_si128(a,b); }}; struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { return _mm_andnot_si128(_mm_setzero_si128(),a); }}; - + #endif #if CV_SSE2 @@ -451,12 +451,12 @@ struct _VNot8u { __m128i operator()(const __m128i& a, const __m128i&) const { re #else #define IF_SIMD(op) NOP #endif - + template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a + b); } template<> inline uchar OpSub::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a - b); } - + template struct OpAbsDiff { typedef T type1; @@ -470,7 +470,7 @@ template<> inline short OpAbsDiff::operator ()(short a, short b) const template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const { return saturate_cast(std::abs(a - b)); } - + template struct OpAbsDiffS { typedef T type1; @@ -510,19 +510,19 @@ template struct OpNot typedef T rtype; T operator()( T a, T ) const { return ~a; } }; - + static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) { if( sz.height == 1 ) step1 = step2 = step = sz.width*elemSize; } - + static void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0), + ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0), (vBinOp8, IF_SIMD(_VAdd8u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -538,7 +538,7 @@ static void add16u( const ushort* src1, size_t step1, ushort* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0), + ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0), (vBinOp16, IF_SIMD(_VAdd16u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -547,7 +547,7 @@ static void add16s( const short* src1, size_t step1, short* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0), + ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz, 0), (vBinOp16, IF_SIMD(_VAdd16s)>(src1, step1, src2, step2, dst, step, sz))); } @@ -563,7 +563,7 @@ static void add32f( const float* src1, size_t step1, float* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAdd_32f_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz, 0), + ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp32f, IF_SIMD(_VAdd32f)>(src1, step1, src2, step2, dst, step, sz))); } @@ -579,7 +579,7 @@ static void sub8u( const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0), + ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0), (vBinOp8, IF_SIMD(_VSub8u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -595,7 +595,7 @@ static void sub16u( const ushort* src1, size_t step1, ushort* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0), + ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0), (vBinOp16, IF_SIMD(_VSub16u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -604,7 +604,7 @@ static void sub16s( const short* src1, size_t step1, short* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0), + ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz, 0), (vBinOp16, IF_SIMD(_VSub16s)>(src1, step1, src2, step2, dst, step, sz))); } @@ -620,7 +620,7 @@ static void sub32f( const float* src1, size_t step1, float* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiSub_32f_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (IppiSize&)sz, 0), + ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, (IppiSize&)sz), (vBinOp32f, IF_SIMD(_VSub32f)>(src1, step1, src2, step2, dst, step, sz))); } @@ -629,18 +629,36 @@ static void sub64f( const double* src1, size_t step1, double* dst, size_t step, Size sz, void* ) { vBinOp64f, IF_SIMD(_VSub64f)>(src1, step1, src2, step2, dst, step, sz); -} +} template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } - + static void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp8, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + uchar* s1 = (uchar*)src1; + uchar* s2 = (uchar*)src2; + uchar* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMaxEvery_8u(s1, s2, d, sz.width); + s1 += step1; + s2 += step2; + d += step; + } + } +#else + vBinOp8, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz); +#endif + +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMaxEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp8, IF_SIMD(_VMax8u)>(src1, step1, src2, step2, dst, step, sz))); } static void max8s( const schar* src1, size_t step1, @@ -654,18 +672,34 @@ static void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp16, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + ushort* s1 = (ushort*)src1; + ushort* s2 = (ushort*)src2; + ushort* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMaxEvery_16u(s1, s2, d, sz.width); + s1 = (ushort*)((uchar*)s1 + step1); + s2 = (ushort*)((uchar*)s2 + step2); + d = (ushort*)((uchar*)d + step); + } + } +#else + vBinOp16, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz); +#endif + +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMaxEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp16, IF_SIMD(_VMax16u)>(src1, step1, src2, step2, dst, step, sz))); } static void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMaxEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp16, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz))); + vBinOp16, IF_SIMD(_VMax16s)>(src1, step1, src2, step2, dst, step, sz); } static void max32s( const int* src1, size_t step1, @@ -679,9 +713,26 @@ static void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp32f, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + float* s1 = (float*)src1; + float* s2 = (float*)src2; + float* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMaxEvery_32f(s1, s2, d, sz.width); + s1 = (float*)((uchar*)s1 + step1); + s2 = (float*)((uchar*)s2 + step2); + d = (float*)((uchar*)d + step); + } + } +#else + vBinOp32f, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz); +#endif +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMaxEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp32f, IF_SIMD(_VMax32f)>(src1, step1, src2, step2, dst, step, sz))); } static void max64f( const double* src1, size_t step1, @@ -695,9 +746,27 @@ static void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp8, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + uchar* s1 = (uchar*)src1; + uchar* s2 = (uchar*)src2; + uchar* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMinEvery_8u(s1, s2, d, sz.width); + s1 += step1; + s2 += step2; + d += step; + } + } +#else + vBinOp8, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz); +#endif + +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMinEvery_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp8, IF_SIMD(_VMin8u)>(src1, step1, src2, step2, dst, step, sz))); } static void min8s( const schar* src1, size_t step1, @@ -711,18 +780,34 @@ static void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp16, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + ushort* s1 = (ushort*)src1; + ushort* s2 = (ushort*)src2; + ushort* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMinEvery_16u(s1, s2, d, sz.width); + s1 = (ushort*)((uchar*)s1 + step1); + s2 = (ushort*)((uchar*)s2 + step2); + d = (ushort*)((uchar*)d + step); + } + } +#else + vBinOp16, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz); +#endif + +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMinEvery_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp16, IF_SIMD(_VMin16u)>(src1, step1, src2, step2, dst, step, sz))); } static void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMinEvery_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp16, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz))); + vBinOp16, IF_SIMD(_VMin16s)>(src1, step1, src2, step2, dst, step, sz); } static void min32s( const int* src1, size_t step1, @@ -736,9 +821,26 @@ static void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp32f, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz))); +#if (ARITHM_USE_IPP == 1) + { + float* s1 = (float*)src1; + float* s2 = (float*)src2; + float* d = dst; + fixSteps(sz, sizeof(dst[0]), step1, step2, step); + for(int i = 0; i < sz.height; i++) + { + ippsMinEvery_32f(s1, s2, d, sz.width); + s1 = (float*)((uchar*)s1 + step1); + s2 = (float*)((uchar*)s2 + step2); + d = (float*)((uchar*)d + step); + } + } +#else + vBinOp32f, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz); +#endif +// IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); +// ippiMinEvery_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), +// (vBinOp32f, IF_SIMD(_VMin32f)>(src1, step1, src2, step2, dst, step, sz))); } static void min64f( const double* src1, size_t step1, @@ -746,14 +848,14 @@ static void min64f( const double* src1, size_t step1, double* dst, size_t step, Size sz, void* ) { vBinOp64f, IF_SIMD(_VMin64f)>(src1, step1, src2, step2, dst, step, sz); -} +} static void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp8, IF_SIMD(_VAbsDiff8u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -769,7 +871,7 @@ static void absdiff16u( const ushort* src1, size_t step1, ushort* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp16, IF_SIMD(_VAbsDiff16u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -777,9 +879,7 @@ static void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { - IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAbsDiff_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), - (vBinOp16, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz))); + vBinOp16, IF_SIMD(_VAbsDiff16s)>(src1, step1, src2, step2, dst, step, sz); } static void absdiff32s( const int* src1, size_t step1, @@ -794,7 +894,7 @@ static void absdiff32f( const float* src1, size_t step1, float* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp32f, IF_SIMD(_VAbsDiff32f)>(src1, step1, src2, step2, dst, step, sz))); } @@ -803,15 +903,15 @@ static void absdiff64f( const double* src1, size_t step1, double* dst, size_t step, Size sz, void* ) { vBinOp64f, IF_SIMD(_VAbsDiff64f)>(src1, step1, src2, step2, dst, step, sz); -} - +} + static void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp8, IF_SIMD(_VAnd8u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -820,7 +920,7 @@ static void or8u( const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp8, IF_SIMD(_VOr8u)>(src1, step1, src2, step2, dst, step, sz))); } @@ -829,23 +929,23 @@ static void xor8u( const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (IppiSize&)sz), + ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, (IppiSize&)sz), (vBinOp8, IF_SIMD(_VXor8u)>(src1, step1, src2, step2, dst, step, sz))); -} +} static void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { IF_IPP(fixSteps(sz, sizeof(dst[0]), step1, step2, step); - ippiNot_8u_C1R(src1, (int)step1, dst, (IppiSize&)sz), + ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, (IppiSize&)sz), (vBinOp8, IF_SIMD(_VNot8u)>(src1, step1, src2, step2, dst, step, sz))); } - + /****************************************************************************************\ * logical operations * \****************************************************************************************/ - + static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind) { if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() ) @@ -856,7 +956,7 @@ static inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind) return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) || (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4); } - + static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) { int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); @@ -872,9 +972,9 @@ static void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, si } for( size_t i = esz; i < blocksize*esz; i++ ) scbuf[i] = scbuf[i - esz]; - + } - + void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _dst, const InputArray& _mask, const BinaryFunc* tab, bool bitwise) { @@ -883,7 +983,7 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d bool haveMask = !_mask.empty(), haveScalar = false; BinaryFunc func; int c; - + if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 && src1.size() == src2.size() && src1.type() == src2.type() && !haveMask ) { @@ -899,12 +999,12 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d func = tab[src1.depth()]; c = src1.channels(); } - + Size sz = getContinuousSize(src1, src2, dst, c); func(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, 0); return; } - + if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 || src1.size != src2.size || src1.type() != src2.type() ) { @@ -917,13 +1017,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; } - + size_t esz = src1.elemSize(); size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; int cn = src1.channels(); BinaryFunc copymask = 0; Mat mask; - + if( haveMask ) { mask = _mask.getMat(); @@ -931,13 +1031,13 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d CV_Assert( mask.size == src1.size ); copymask = getCopyMaskFunc(esz); } - + AutoBuffer _buf; uchar *scbuf = 0, *maskbuf = 0; - + _dst.create(src1.dims, src1.size, src1.type()); Mat dst = _dst.getMat(); - + if( bitwise ) { func = *tab; @@ -948,35 +1048,35 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d func = tab[src1.depth()]; c = cn; } - + if( !haveScalar ) { const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; uchar* ptrs[4]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; - + if( haveMask ) { blocksize = std::min(blocksize, blocksize0); _buf.allocate(blocksize*esz); maskbuf = _buf; } - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)std::min(total - j, blocksize); - - func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 ); + + func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); ptrs[3] += bsz; } - + bsz *= (int)esz; ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; } @@ -986,41 +1086,41 @@ void binary_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d { const Mat* arrays[] = { &src1, &dst, &mask, 0 }; uchar* ptrs[3]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - + _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32); scbuf = _buf; maskbuf = alignPtr(scbuf + blocksize*esz, 16); - + convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)std::min(total - j, blocksize); - + func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); ptrs[2] += bsz; } - + bsz *= (int)esz; ptrs[0] += bsz; ptrs[1] += bsz; } } } } - + static BinaryFunc maxTab[] = { (BinaryFunc)max8u, (BinaryFunc)max8s, (BinaryFunc)max16u, (BinaryFunc)max16s, (BinaryFunc)max32s, (BinaryFunc)max32f, (BinaryFunc)max64f, 0 -}; +}; static BinaryFunc minTab[] = { @@ -1029,7 +1129,7 @@ static BinaryFunc minTab[] = }; } - + void cv::bitwise_and(const InputArray& a, const InputArray& b, OutputArray c, const InputArray& mask) { BinaryFunc f = and8u; @@ -1068,26 +1168,26 @@ void cv::max(const Mat& src1, const Mat& src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, InputArray(), maxTab, false ); -} - +} + void cv::min(const Mat& src1, const Mat& src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, InputArray(), minTab, false ); } - + void cv::max(const Mat& src1, double src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, InputArray(), maxTab, false ); -} +} void cv::min(const Mat& src1, double src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, InputArray(), minTab, false ); } - + /****************************************************************************************\ * add/subtract * \****************************************************************************************/ @@ -1101,7 +1201,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d int kind1 = _src1.kind(), kind2 = _src2.kind(); Mat src1 = _src1.getMat(), src2 = _src2.getMat(); bool haveMask = !_mask.empty(); - + if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() && !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) || @@ -1113,9 +1213,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata); return; } - + bool haveScalar = false, swapped12 = false; - + if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 || src1.size != src2.size || src1.channels() != src2.channels() ) { @@ -1131,10 +1231,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; } - + int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth(), wtype; BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0; - + if( dtype < 0 ) { if( _dst.fixedType() ) @@ -1149,7 +1249,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d } } dtype = CV_MAT_DEPTH(dtype); - + if( depth1 == depth2 && dtype == depth1 ) wtype = dtype; else if( !muldiv ) @@ -1157,7 +1257,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); wtype = std::max(wtype, dtype); - + // when the result of addition should be converted to an integer type, // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, // instead of converting the other input to floating-point and then converting the operation result back to integers. @@ -1169,20 +1269,20 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d wtype = std::max(depth1, std::max(depth2, CV_32F)); wtype = std::max(wtype, dtype); } - + cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype); cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype); cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); - + dtype = CV_MAKETYPE(dtype, cn); wtype = CV_MAKETYPE(wtype, cn); - + size_t esz1 = src1.elemSize(), esz2 = src2.elemSize(); size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; BinaryFunc copymask = 0; Mat mask; - + if( haveMask ) { mask = _mask.getMat(); @@ -1190,23 +1290,23 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d CV_Assert( mask.size == src1.size ); copymask = getCopyMaskFunc(dsz); } - + AutoBuffer _buf; uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0); - + _dst.create(src1.dims, src1.size, src1.type()); Mat dst = _dst.getMat(); BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; - + if( !haveScalar ) { const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; uchar* ptrs[4]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; - + if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) blocksize = std::min(blocksize, blocksize0); @@ -1221,7 +1321,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) maskbuf = buf; - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) @@ -1242,7 +1342,7 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d cvtsrc2( sptr2, 0, 0, 0, buf2, 0, bszn, 0 ); sptr2 = buf2; } - + if( !haveMask && !cvtdst ) func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata ); else @@ -1270,10 +1370,10 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d { const Mat* arrays[] = { &src1, &dst, &mask, 0 }; uchar* ptrs[3]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - + _buf.allocate(bufesz*blocksize + 64); buf = _buf; if( cvtsrc1 ) @@ -1284,9 +1384,9 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) maskbuf = buf; - + convertAndUnrollScalar( src2, wtype, buf2, blocksize); - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) @@ -1296,16 +1396,16 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d const uchar *sptr1 = ptrs[0]; const uchar* sptr2 = buf2; uchar* dptr = ptrs[1]; - + if( cvtsrc1 ) { cvtsrc1( sptr1, 0, 0, 0, buf1, 0, bszn, 0 ); sptr1 = buf1; } - + if( swapped12 ) std::swap(sptr1, sptr2); - + if( !haveMask && !cvtdst ) func( sptr1, 0, sptr2, 0, dptr, 0, bszn, usrdata ); else @@ -1330,13 +1430,13 @@ void arithm_op(const InputArray& _src1, const InputArray& _src2, OutputArray& _d } } } - + static BinaryFunc addTab[] = { (BinaryFunc)add8u, (BinaryFunc)add8s, (BinaryFunc)add16u, (BinaryFunc)add16s, (BinaryFunc)add32s, (BinaryFunc)add32f, (BinaryFunc)add64f, 0 }; - + static BinaryFunc subTab[] = { (BinaryFunc)sub8u, (BinaryFunc)sub8s, (BinaryFunc)sub16u, (BinaryFunc)sub16s, @@ -1348,10 +1448,10 @@ static BinaryFunc absdiffTab[] = (BinaryFunc)absdiff8u, (BinaryFunc)absdiff8s, (BinaryFunc)absdiff16u, (BinaryFunc)absdiff16s, (BinaryFunc)absdiff32s, (BinaryFunc)absdiff32f, (BinaryFunc)absdiff64f, 0 -}; +}; } - + void cv::add( const InputArray& src1, const InputArray& src2, OutputArray dst, const InputArray& mask, int dtype ) { @@ -1367,7 +1467,7 @@ void cv::subtract( const InputArray& src1, const InputArray& src2, OutputArray d void cv::absdiff( const InputArray& src1, const InputArray& src2, OutputArray dst ) { arithm_op(src1, src2, dst, InputArray(), -1, absdiffTab); -} +} /****************************************************************************************\ * multiply/divide * @@ -1437,7 +1537,7 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, step1 /= sizeof(src1[0]); step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); - + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { int i = 0; @@ -1450,12 +1550,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, double d = scale/(a * b); b *= d; a *= d; - + T z0 = saturate_cast(src2[i+1] * ((double)src1[i] * b)); T z1 = saturate_cast(src2[i] * ((double)src1[i+1] * b)); T z2 = saturate_cast(src2[i+3] * ((double)src1[i+2] * a)); T z3 = saturate_cast(src2[i+2] * ((double)src1[i+3] * a)); - + dst[i] = z0; dst[i+1] = z1; dst[i+2] = z2; dst[i+3] = z3; } @@ -1465,12 +1565,12 @@ div_( const T* src1, size_t step1, const T* src2, size_t step2, T z1 = src2[i+1] != 0 ? saturate_cast(src1[i+1]*scale/src2[i+1]) : 0; T z2 = src2[i+2] != 0 ? saturate_cast(src1[i+2]*scale/src2[i+2]) : 0; T z3 = src2[i+3] != 0 ? saturate_cast(src1[i+3]*scale/src2[i+3]) : 0; - + dst[i] = z0; dst[i+1] = z1; dst[i+2] = z2; dst[i+3] = z3; } } - + for( ; i < size.width; i++ ) dst[i] = src2[i] != 0 ? saturate_cast(src1[i]*scale/src2[i]) : 0; } @@ -1482,7 +1582,7 @@ recip_( const T*, size_t, const T* src2, size_t step2, { step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); - + for( ; size.height--; src2 += step2, dst += step ) { int i = 0; @@ -1495,12 +1595,12 @@ recip_( const T*, size_t, const T* src2, size_t step2, double d = scale/(a * b); b *= d; a *= d; - + T z0 = saturate_cast(src2[i+1] * b); T z1 = saturate_cast(src2[i] * b); T z2 = saturate_cast(src2[i+3] * a); T z3 = saturate_cast(src2[i+2] * a); - + dst[i] = z0; dst[i+1] = z1; dst[i+2] = z2; dst[i+3] = z3; } @@ -1515,13 +1615,13 @@ recip_( const T*, size_t, const T* src2, size_t step2, dst[i+2] = z2; dst[i+3] = z3; } } - + for( ; i < size.width; i++ ) dst[i] = src2[i] != 0 ? saturate_cast(scale/src2[i]) : 0; } } - - + + static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { @@ -1551,7 +1651,7 @@ static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2 { mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } - + static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { @@ -1563,7 +1663,7 @@ static void mul64f( const double* src1, size_t step1, const double* src2, size_t { mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } - + static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { @@ -1650,8 +1750,8 @@ static void recip64f( const double* src1, size_t step1, const double* src2, size { recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } - - + + static BinaryFunc mulTab[] = { (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, @@ -1673,9 +1773,9 @@ static BinaryFunc recipTab[] = (BinaryFunc)recip64f, 0 }; - + } - + void cv::multiply(const InputArray& src1, const InputArray& src2, OutputArray dst, double scale, int dtype) { @@ -1692,8 +1792,8 @@ void cv::divide(double scale, const InputArray& src2, OutputArray dst, int dtype) { arithm_op(src2, src2, dst, InputArray(), dtype, recipTab, true, &scale); -} - +} + /****************************************************************************************\ * addWeighted * \****************************************************************************************/ @@ -1739,34 +1839,34 @@ addWeighted8u( const uchar* src1, size_t step1, { const double* scalars = (const double*)_scalars; float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; - + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { int x = 0; - + #if CV_SSE2 if( USE_SSE2 ) { __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); __m128i z = _mm_setzero_si128(); - + for( ; x <= size.width - 8; x += 8 ) { __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); - + __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); - + u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); - + u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); u = _mm_packus_epi16(u, u); - + _mm_storel_epi64((__m128i*)(dst + x), u); } } @@ -1837,9 +1937,9 @@ static BinaryFunc addWeightedTab[] = (BinaryFunc)addWeighted16s, (BinaryFunc)addWeighted32s, (BinaryFunc)addWeighted32f, (BinaryFunc)addWeighted64f, 0 }; - + } - + void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& src2, double beta, double gamma, OutputArray dst, int dtype ) { @@ -1847,7 +1947,7 @@ void cv::addWeighted( const InputArray& src1, double alpha, const InputArray& sr arithm_op(src1, src2, dst, InputArray(), dtype, addWeightedTab, true, scalars); } - + /****************************************************************************************\ * compare * \****************************************************************************************/ @@ -1867,7 +1967,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, std::swap(step1, step2); code = code == CMP_GE ? CMP_LE : CMP_GT; } - + if( code == CMP_GT || code == CMP_LE ) { int m = code == CMP_GT ? 0 : 255; @@ -1884,7 +1984,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, t1 = -(src1[x+3] > src2[x+3]) ^ m; dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; } - + for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); } @@ -1905,14 +2005,14 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, t1 = -(src1[x+3] == src2[x+3]) ^ m; dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; } - + for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } } - - + + static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size size, void* _cmpop) { @@ -1953,8 +2053,8 @@ static void cmp64f(const double* src1, size_t step1, const double* src2, size_t uchar* dst, size_t step, Size size, void* _cmpop) { cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} - +} + static BinaryFunc cmpTab[] = { (BinaryFunc)cmp8u, (BinaryFunc)cmp8s, (BinaryFunc)cmp16u, @@ -1962,7 +2062,7 @@ static BinaryFunc cmpTab[] = (BinaryFunc)cmp64f, 0 }; - + static double getMinVal(int depth) { static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0}; @@ -1973,18 +2073,18 @@ static double getMaxVal(int depth) { static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0}; return tab[depth]; -} - } - + +} + void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _dst, int op) { CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ || op == CMP_NE || op == CMP_GE || op == CMP_GT ); - + int kind1 = _src1.kind(), kind2 = _src2.kind(); Mat src1 = _src1.getMat(), src2 = _src2.getMat(); - + if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() ) { _dst.create(src1.size(), CV_8UC1); @@ -1993,9 +2093,9 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ cmpTab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, &op); return; } - + bool haveScalar = false; - + if( (kind1 == InputArray::MATX) + (kind2 == InputArray::MATX) == 1 || src1.size != src2.size || src1.type() != src2.type() ) { @@ -2012,26 +2112,26 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; } - + int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth(); if( cn != 1 ) CV_Error( CV_StsUnsupportedFormat, "compare() can only process single-channel arrays" ); - + size_t esz = src1.elemSize(); size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; - + _dst.create(src1.dims, src1.size, CV_8U); Mat dst = _dst.getMat(); BinaryFunc func = cmpTab[depth1]; - + if( !haveScalar ) { const Mat* arrays[] = { &src1, &src2, &dst, 0 }; uchar* ptrs[3]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size; - + for( size_t i = 0; i < it.nplanes; i++, ++it ) func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op ); } @@ -2039,10 +2139,10 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ { const Mat* arrays[] = { &src1, &dst, 0 }; uchar* ptrs[2]; - + NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - + AutoBuffer _buf(blocksize*esz); uchar *buf = _buf; @@ -2057,13 +2157,13 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0); return; } - + if( fval > getMaxVal(depth1) ) { dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0); return; } - + int ival = cvRound(fval); if( fval != ival ) { @@ -2079,7 +2179,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ } convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize); } - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) @@ -2092,7 +2192,7 @@ void cv::compare(const InputArray& _src1, const InputArray& _src2, OutputArray _ } } } - + /****************************************************************************************\ * inRange * \****************************************************************************************/ @@ -2108,7 +2208,7 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2, step1 /= sizeof(src1[0]); step2 /= sizeof(src2[0]); step3 /= sizeof(src3[0]); - + for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step ) { int x = 0; @@ -2122,13 +2222,13 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2, t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3]; dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1; } - + for( ; x < size.width; x++ ) dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]); } } - + static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, const uchar* src3, size_t step3, uchar* dst, size_t step, Size size) { @@ -2169,7 +2269,7 @@ static void inRange64f(const double* src1, size_t step1, const double* src2, siz const double* src3, size_t step3, uchar* dst, size_t step, Size size) { inRange_(src1, step1, src2, step2, src3, step3, dst, step, size); -} +} static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn) { @@ -2187,14 +2287,14 @@ static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn) else for( i = j = 0; i < len; i++, j += cn ) dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3]; - + for( ; k < cn; k += 4 ) { for( i = 0, j = k; i < len; i++, j += cn ) dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3]; } } - + typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2, const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz ); @@ -2204,7 +2304,7 @@ static InRangeFunc inRangeTab[] = (InRangeFunc)inRange16s, (InRangeFunc)inRange32s, (InRangeFunc)inRange32f, (InRangeFunc)inRange64f, 0 }; - + } void cv::inRange(const InputArray& _src, const InputArray& _lowerb, @@ -2212,9 +2312,9 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb, { int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind(); Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat(); - + bool lbScalar = false, ubScalar = false; - + if( (lkind == InputArray::MATX && skind != InputArray::MATX) || src.size != lb.size || src.type() != lb.type() ) { @@ -2223,7 +2323,7 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb, "The lower bounary is neither an array of the same size and same type as src, nor a scalar"); lbScalar = true; } - + if( (ukind == InputArray::MATX && skind != InputArray::MATX) || src.size != ub.size || src.type() != ub.type() ) { @@ -2232,47 +2332,47 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb, "The upper bounary is neither an array of the same size and same type as src, nor a scalar"); ubScalar = true; } - + CV_Assert( ((int)lbScalar ^ (int)ubScalar) == 0 ); - + int cn = src.channels(), depth = src.depth(); - + size_t esz = src.elemSize(); size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; - + _dst.create(src.dims, src.size, CV_8U); Mat dst = _dst.getMat(); InRangeFunc func = inRangeTab[depth]; - + const Mat* arrays_sc[] = { &src, &dst, 0 }; const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 }; uchar* ptrs[4]; - + NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - + AutoBuffer _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128); uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0; buf = alignPtr(buf + blocksize*cn, 16); - + if( lbScalar && ubScalar ) { lbuf = buf; ubuf = buf = alignPtr(buf + blocksize*esz, 16); - + CV_Assert( lb.type() == ub.type() ); int scdepth = lb.depth(); - + if( scdepth != depth && depth < CV_32S ) { int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16); int* iubuf = ilbuf + cn; - + BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S); sccvtfunc(lb.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0); sccvtfunc(ub.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0); int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth)); - + for( int k = 0; k < cn; k++ ) { if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval ) @@ -2281,11 +2381,11 @@ void cv::inRange(const InputArray& _src, const InputArray& _lowerb, lb = Mat(cn, 1, CV_32S, ilbuf); ub = Mat(cn, 1, CV_32S, iubuf); } - + convertAndUnrollScalar( lb, src.type(), lbuf, blocksize ); convertAndUnrollScalar( ub, src.type(), ubuf, blocksize ); } - + for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index 9d0d957dc4..1af4950249 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -646,8 +646,8 @@ static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step, { GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags); } - - + + static void GEMMStore_32f( const float* c_data, size_t c_step, const double* d_buf, size_t d_buf_step, float* d_data, size_t d_step, Size d_size, @@ -664,7 +664,7 @@ static void GEMMStore_64f( const double* c_data, size_t c_step, { GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags); } - + static void GEMMStore_32fc( const Complexf* c_data, size_t c_step, const Complexd* d_buf, size_t d_buf_step, @@ -1130,7 +1130,7 @@ void cv::gemm( const InputArray& matA, const InputArray& matB, double alpha, int dm0, dn0, dk0; size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1; int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0); - + if( !is_a_t ) a_step0 = A.step, a_step1 = elem_size; else @@ -1273,7 +1273,7 @@ template static void transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn ) { int x; - + if( scn == 2 && dcn == 2 ) { for( x = 0; x < len*2; x += 2 ) @@ -1352,7 +1352,7 @@ load4x4Matrix( const float* m, __m128& m0, __m128& m1, __m128& m2, __m128& m3, _ } #endif - + static void transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn ) { @@ -1379,7 +1379,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in __m128i m2 = _mm_setr_epi16(0, m20, m21, m22, m20, m21, m22, 0); __m128i m3 = _mm_setr_epi32(m03, m13, m23, 0); int x = 0; - + for( ; x <= (len - 8)*3; x += 8*3 ) { __m128i z = _mm_setzero_si128(), t0, t1, t2, r0, r1; @@ -1470,14 +1470,14 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in return; } #endif - + transform_(src, dst, m, len, scn, dcn); } static void transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn ) { -#if CV_SSE2 +#if CV_SSE2 if( USE_SSE2 && scn == 3 && dcn == 3 ) { __m128 m0, m1, m2, m3; @@ -1536,11 +1536,11 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, return; } #endif - + transform_(src, dst, m, len, scn, dcn); } - - + + static void transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn ) { @@ -1574,12 +1574,12 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i } return; } - + if( scn == 4 && dcn == 4 ) { __m128 m0, m1, m2, m3, m4; load4x4Matrix(m, m0, m1, m2, m3, m4); - + for( ; x < len*4; x += 4 ) { __m128 x0 = _mm_loadu_ps(src + x); @@ -1616,18 +1616,18 @@ transform_32s(const int* src, int* dst, const double* m, int len, int scn, int d { transform_(src, dst, m, len, scn, dcn); } - + static void transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) { transform_(src, dst, m, len, scn, dcn); -} - +} + template static void diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int ) { int x; - + if( cn == 2 ) { for( x = 0; x < len*2; x += 2 ) @@ -1674,8 +1674,8 @@ static void diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn) { diagtransform_(src, dst, m, len, scn, dcn); -} - +} + static void diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn) { @@ -1686,8 +1686,8 @@ static void diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn) { diagtransform_(src, dst, m, len, scn, dcn); -} - +} + static void diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn) { @@ -1704,17 +1704,17 @@ static void diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn) { diagtransform_(src, dst, m, len, scn, dcn); -} - +} + static void diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn) { diagtransform_(src, dst, m, len, scn, dcn); -} - - +} + + typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int ); - + static TransformFunc transformTab[] = { (TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u, @@ -1728,23 +1728,23 @@ static TransformFunc diagTransformTab[] = (TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f, (TransformFunc)diagtransform_64f, 0 }; - + } - + void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx ) { Mat src = _src.getMat(), m = _mtx.getMat(); int depth = src.depth(), scn = src.channels(), dcn = m.rows; CV_Assert( scn == m.cols || scn + 1 == m.cols ); bool isDiag = false; - + _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); Mat dst = _dst.getMat(); int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F; AutoBuffer _mbuf; double* mbuf = _mbuf; - + if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 ) { _mbuf.allocate(dcn*(scn+1)); @@ -1791,12 +1791,12 @@ void cv::transform( const InputArray& _src, OutputArray _dst, const InputArray& TransformFunc func = isDiag ? diagTransformTab[depth] : transformTab[depth]; CV_Assert( func != 0 ); - + const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; NAryMatIterator it(arrays, ptrs); size_t i, total = it.size; - + for( i = 0; i < it.nplanes; i++, ++it ) func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); } @@ -1813,7 +1813,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, { const double eps = FLT_EPSILON; int i; - + if( scn == 2 && dcn == 2 ) { for( i = 0; i < len*2; i += 2 ) @@ -1837,7 +1837,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, { T x = src[i], y = src[i + 1], z = src[i + 2]; double w = x*m[12] + y*m[13] + z*m[14] + m[15]; - + if( fabs(w) > eps ) { w = 1./w; @@ -1855,7 +1855,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, { T x = src[0], y = src[1], z = src[2]; double w = x*m[8] + y*m[9] + z*m[10] + m[11]; - + if( fabs(w) > eps ) { w = 1./w; @@ -1893,7 +1893,7 @@ perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, } } - + static void perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn) { @@ -1905,22 +1905,22 @@ perspectiveTransform_64f(const double* src, double* dst, const double* m, int le { perspectiveTransform_(src, dst, m, len, scn, dcn); } - + } - + void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const InputArray& _mtx ) { Mat src = _src.getMat(), m = _mtx.getMat(); int depth = src.depth(), scn = src.channels(), dcn = m.rows-1; CV_Assert( scn + 1 == m.cols && (depth == CV_32F || depth == CV_64F)); - + _dst.create( src.size(), CV_MAKETYPE(depth, dcn) ); Mat dst = _dst.getMat(); - + const int mtype = CV_64F; AutoBuffer _mbuf; double* mbuf = _mbuf; - + if( !m.isContinuous() || m.type() != mtype ) { _mbuf.allocate((dcn+1)*(scn+1)); @@ -1930,20 +1930,20 @@ void cv::perspectiveTransform( const InputArray& _src, OutputArray _dst, const I } else mbuf = (double*)m.data; - + TransformFunc func = depth == CV_32F ? (TransformFunc)perspectiveTransform_32f : (TransformFunc)perspectiveTransform_64f; CV_Assert( func != 0 ); - + const Mat* arrays[] = {&src, &dst, 0}; uchar* ptrs[2]; NAryMatIterator it(arrays, ptrs); size_t i, total = it.size; - + for( i = 0; i < it.nplanes; i++, ++it ) func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn ); -} +} /****************************************************************************************\ * ScaleAdd * @@ -2000,7 +2000,7 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst, dst[i] = src1[i]*alpha + src2[i]; } - + static void scaleAdd_64f(const double* src1, const double* src2, double* dst, int len, double* _alpha) { @@ -2040,39 +2040,39 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst, typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha); } - + void cv::scaleAdd( const InputArray& _src1, double alpha, const InputArray& _src2, OutputArray _dst ) { Mat src1 = _src1.getMat(), src2 = _src2.getMat(); int depth = src1.depth(), cn = src1.channels(); - + CV_Assert( src1.type() == src2.type() ); if( depth < CV_32F ) { addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth); return; } - + _dst.create(src1.dims, src1.size, src1.type()); Mat dst = _dst.getMat(); - + float falpha = (float)alpha; void* palpha = depth == CV_32F ? (void*)&falpha : (void*)α - + ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f; - + if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() ) { size_t len = src1.total()*cn; func(src1.data, src2.data, dst.data, (int)len, palpha); return; } - + const Mat* arrays[] = {&src1, &src2, &dst, 0}; uchar* ptrs[3]; NAryMatIterator it(arrays, ptrs); size_t i, len = it.size*cn; - + for( i = 0; i < it.nplanes; i++, ++it ) func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha ); } @@ -2243,7 +2243,7 @@ double cv::Mahalonobis( const InputArray& _v1, const InputArray& _v2, const Inpu { return Mahalanobis(_v1, _v2, _icovar); } - + /****************************************************************************************\ * MulTransposed * \****************************************************************************************/ @@ -2445,7 +2445,7 @@ MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scal typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale); } - + void cv::mulTransposed( const InputArray& _src, OutputArray _dst, bool ata, const InputArray& _delta, double scale, int dtype ) { @@ -2578,7 +2578,7 @@ dotProd_(const T* src1, const T* src2, int len) (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; for( ; i < len; i++ ) result += (double)src1[i]*src2[i]; - + return result; } @@ -2590,9 +2590,10 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) ippiDotProd_8u64f_C1R(src1, (int)(len*sizeof(src1[0])), src2, (int)(len*sizeof(src2[0])), ippiSize(len, 1), &r); + return r; #else int i = 0; - + #if CV_SSE2 if( USE_SSE2 ) { @@ -2616,7 +2617,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) s = _mm_add_epi32(s, s0); s = _mm_add_epi32(s, s2); } - + for( ; j < blockSize; j += 4 ) { __m128i s0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src1 + j)), z); @@ -2627,7 +2628,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) CV_DECL_ALIGNED(16) int buf[4]; _mm_store_si128((__m128i*)buf, s); r += buf[0] + buf[1] + buf[2] + buf[3]; - + src1 += blockSize; src2 += blockSize; i += blockSize; @@ -2692,7 +2693,7 @@ static double dotProd_64f(const double* src1, const double* src2, int len) typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len); - + static DotProdFunc dotProdTab[] = { (DotProdFunc)dotProd_8u, (DotProdFunc)dotProd_8s, (DotProdFunc)dotProd_16u, @@ -2713,16 +2714,16 @@ double Mat::dot(const InputArray& _mat) const if( len == (size_t)(int)len ) return func(data, mat.data, len); } - + const Mat* arrays[] = {this, &mat, 0}; uchar* ptrs[2]; NAryMatIterator it(arrays, ptrs); int len = (int)(it.size*cn); double r = 0; - + for( size_t i = 0; i < it.nplanes; i++, ++it ) r += func( ptrs[0], ptrs[1], len ); - + return r; } @@ -3027,12 +3028,12 @@ cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigen evects = pca.eigenvectors; int ecount0 = evals0.cols + evals0.rows - 1; int ecount = evals.cols + evals.rows - 1; - + CV_Assert( (evals0.cols == 1 || evals0.rows == 1) && ecount0 <= ecount && evects0.cols == evects.cols && evects0.rows == ecount0 ); - + cv::Mat temp = evals0; if( evals.rows == 1 ) evals.colRange(0, ecount0).convertTo(temp, evals0.type()); diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 09711b69a1..24c81ace60 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -87,7 +87,7 @@ extern const uchar g_Saturate8u[]; void deleteThreadAllocData(); void deleteThreadRNGData(); #endif - + template struct OpAdd { typedef T1 type1; @@ -176,24 +176,24 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1, void*); BinaryFunc getConvertFunc(int sdepth, int ddepth); -BinaryFunc getConvertScaleFunc(int sdepth, int ddepth); +BinaryFunc getConvertScaleFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz); enum { BLOCK_SIZE = 1024 }; #ifdef HAVE_IPP -static inline IppiSize ippiSize(int width, int height) { IppiSize sz={width, height}; return sz; } -static inline IppiSize ippiSize(Size _sz) { reIppiSize sz={_sz.width, _sz.height}; return sz; } +static inline IppiSize ippiSize(int width, int height) { IppiSize sz = { width, height}; return sz; } +static inline IppiSize ippiSize(Size _sz) { IppiSize sz = { _sz.width, _sz.height}; return sz; } #endif - + #if defined HAVE_IPP && (IPP_VERSION_MAJOR >= 7) #define ARITHM_USE_IPP 1 #define IF_IPP(then_call, else_call) then_call #else #define ARITHM_USE_IPP 0 #define IF_IPP(then_call, else_call) else_call -#endif - +#endif + } #endif /*_CXCORE_INTERNAL_H_*/ diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index e4ef11e91c..474eab7ed3 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -170,9 +170,10 @@ struct IPPInitializer IPPInitializer ippInitializer; #else volatile bool useOptimizedFlag = false; -volatile bool USE_SSE2 = false; #endif +volatile bool USE_SSE2 = false; + void setUseOptimized( bool flag ) { useOptimizedFlag = flag;