From a82e70cd4081f14009737d8d0f79246d8c3c3bef Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Fri, 13 Apr 2018 21:19:16 +0900 Subject: [PATCH] remove raw SSE2/NEON implementation from imgwarp.cpp * use universal intrinsics instead of raw intrinsics * add 2-channel de-interleave on the x86 platform * add a v_int32x4 version of v_muladd * add an accumulating version of v_dotprod, based on the commit from seiko2plus on bf1852d * remove some verification checks in the performance tests * avoid out-of-bounds access while keeping the performance --- .../include/opencv2/core/hal/intrin_cpp.hpp | 25 +- .../include/opencv2/core/hal/intrin_neon.hpp | 25 +- .../include/opencv2/core/hal/intrin_sse.hpp | 35 +- .../include/opencv2/core/hal/intrin_vsx.hpp | 6 + modules/core/test/test_intrin_utils.hpp | 14 +- modules/imgproc/perf/opencl/perf_imgwarp.cpp | 2 +- modules/imgproc/perf/perf_warp.cpp | 6 +- modules/imgproc/src/imgwarp.cpp | 960 ++++++++---------- 8 files changed, 539 insertions(+), 534 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index e7ea899b7e..5518eace9b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Multiply and add Returns \f$ a*b + c \f$ -For floating point types only. */ +For floating point types and signed 32-bit int only. */ template<typename _Tp, int n> inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) @@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n return c; } +/** @brief Dot product of elements + +Same as cv::v_dotprod, but adds a third element to the sum of adjacent pairs. +Scheme: +@code + {A1 A2 ...} // 16-bit +x {B1 B2 ...} // 16-bit +------------- + {A1B1+A2B2+C1 ...} // 32-bit + +@endcode +Implemented only for 16-bit signed source type (v_int16x8). +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> + v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg<w_type, n/2> s; + for( int i = 0; i < (n/2); i++ ) + s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i]; + return s; +} + /** @brief Multiply and expand Multiply values of two registers and store the results in two registers with a wider pack type.
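For reference, a minimal standalone sketch (not part of the patch) of what the accumulating v_dotprod introduced above computes, assuming a build where this change is applied; the array contents are arbitrary illustration values:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    // Adjacent 16-bit pairs are multiplied and summed, then the 32-bit
    // accumulator is added: s[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1] + c[i].
    short a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    short b[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
    int   c[4] = { 100, 100, 100, 100 };

    cv::v_int16x8 va = cv::v_load(a), vb = cv::v_load(b);
    cv::v_int32x4 vs = cv::v_dotprod(va, vb, cv::v_load(c));

    int s[4];
    cv::v_store(s, vs);
    printf("%d %d %d %d\n", s[0], s[1], s[2], s[3]); // prints: 103 107 122 130
    return 0;
}

Per the platform-specific overloads added below, this fuses into _mm_add_epi32(_mm_madd_epi16(a, b), c) on SSE2, a vaddq_s32 after the pairwise NEON products, and a single vec_msum on VSX.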
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index c3c49c902b..033cf0f2dc 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + v_int32x4 s = v_dotprod(a, b); + return v_int32x4(vaddq_s32(s.val , c.val)); +} + #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ @@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_ return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(vmlaq_s32(c.val, a.val, b.val)); +} + #if CV_SIMD128_64F inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) { @@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32) OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64) #endif +#if CV_SIMD128_64F +inline v_int32x4 v_round(const v_float32x4& a) +{ + float32x4_t a_ = a.val; + int32x4_t result; + __asm__ ("fcvtns %0.4s, %1.4s" + : "=w"(result) + : "w"(a_) + : /* No clobbers */); + return v_int32x4(result); +} +#else inline v_int32x4 v_round(const v_float32x4& a) { static const int32x4_t v_sign = vdupq_n_s32(1 << 31), @@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a) int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); } - +#endif inline v_int32x4 v_floor(const v_float32x4& a) { int32x4_t a1 = vcvtq_s32_f32(a.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 0e740f6418..64bea04e1e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) return v_int32x4(_mm_madd_epi16(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val)); +} + #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ @@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) __m128i m = _mm_cmpgt_epi32(b.val, a.val); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return a * b + c; +} #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ @@ -1599,7 +1608,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& c = v_reinterpret_as_f64(t2); } -// 2-channel, float only +// 2-channel inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) { const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); @@ -1611,7 +1620,29 @@ inline void 
v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3 } -inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b ) +inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 + + __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 + __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7 + __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 + __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + + a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7 +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) +{ + v_int16x8 sa, sb; + v_load_deinterleave((const short*)ptr, sa, sb); + a = v_reinterpret_as_u16(sa); + b = v_reinterpret_as_u16(sb); +} + +inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b) { __m128i t0, t1; t0 = _mm_unpacklo_epi16(a.val, b.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 9f050f7c21..b9e73ca1f8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -821,6 +821,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return a * b + c; } + // TODO: exp, log, sin, cos /** Absolute values **/ @@ -904,6 +907,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(vec_msum(a.val, b.val, c.val)); } + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index cd9373ad6c..43d8aaff4d 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -521,15 +521,25 @@ template<typename R> struct TheTest TheTest & test_dot_prod() { typedef typename V_RegTrait128<LaneType>::w_reg Rx2; + typedef typename Rx2::lane_type w_type; + Data<R> dataA, dataB(2); R a = dataA, b = dataB; - Data<Rx2> res = v_dotprod(a, b); + Data<Rx2> dataC; + dataC += std::numeric_limits<w_type>::is_signed ?
+ std::numeric_limits<w_type>::min() : + std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1); + Rx2 c = dataC; + + Data<Rx2> resD = v_dotprod(a, b), + resE = v_dotprod(a, b, c); const int n = R::nlanes / 2; for (int i = 0; i < n; ++i) { - EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]); + EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]); + EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]); } return *this; } diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp index 7a90e33380..44fb84d1cc 100644 --- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp +++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp @@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap, OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode); - SANITY_CHECK(dst, eps); + SANITY_CHECK_NOTHING(); } } } // namespace opencv_test::ocl diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp index d0e09a5bab..728a7bba70 100644 --- a/modules/imgproc/perf/perf_warp.cpp +++ b/modules/imgproc/perf/perf_warp.cpp @@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear, PERF_TEST_P( TestRemap, remap, Combine( - Values( TYPICAL_MAT_TYPES ), - Values( szVGA, sz720p, sz1080p ), + Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ), + Values( szVGA, sz1080p ), InterType::all(), BorderMode::all(), RemapMode::all() @@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap, remap(source, destination, map_x, map_y, interpolationType, borderMode); } - SANITY_CHECK(destination, 1); + SANITY_CHECK_NOTHING(); } void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode ) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 9a530c4a6c..ba1858b898 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -50,7 +50,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" - +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" #include "imgwarp.hpp" @@ -130,7 +130,7 @@ static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; -#if CV_SSE2 || CV_NEON +#if CV_SIMD128 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); #endif @@ -266,7 +266,7 @@ static const void* initInterTab2D( int method, bool fixpt ) } tab -= INTER_TAB_SIZE2*ksize*ksize; itab -= INTER_TAB_SIZE2*ksize*ksize; -#if CV_SSE2 || CV_NEON +#if CV_SIMD128 if( method == INTER_LINEAR ) { for( i = 0; i < INTER_TAB_SIZE2; i++ ) @@ -432,7 +432,7 @@ struct RemapNoVec const void*, int ) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD128 struct RemapVec_8u { int operator()( const Mat& _src, void* _dst, const short* XY, const ushort* FXY, const void* _wtab, int width ) const { int cn = _src.channels(), x = 0, sstep = (int)_src.step; - if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) || + if( (cn != 1 && cn != 3 && cn != 4) || !hasSIMD128() || sstep > 0x8000 ) return 0; const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); const short* wtab = cn == 1 ?
(const short*)_wtab : &BilinearTab_iC4[0][0][0]; uchar* D = (uchar*)_dst; - __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2); - __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16)); - __m128i z = _mm_setzero_si128(); + v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2); + v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16))); int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; + const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes; +#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \ + { \ + const uchar* const p = ((const uchar*)ptr) + (offset); \ + if (p <= src_limit_8bytes) \ + { \ + v_uint8x16 rrggbb, dummy; \ + v_uint16x8 rrggbb8, dummy8; \ + v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ + v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + 3), 0, 0, 0)); \ + v_zip(rgb0, rgb1, rrggbb, dummy); \ + v_expand(rrggbb, rrggbb8, dummy8); \ + result = v_reinterpret_as_s16(rrggbb8); \ + } \ + else \ + { \ + result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \ + (short)p[1], (short)p[4], /* g0g1 */ \ + (short)p[2], (short)p[5], /* b0b1 */ 0, 0); \ + } \ + } +#define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \ + { \ + const uchar* const p = ((const uchar*)ptr) + (offset); \ + CV_DbgAssert(p <= src_limit_8bytes); \ + v_uint8x16 rrggbbaa, dummy; \ + v_uint16x8 rrggbbaa8, dummy8; \ + v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ + v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ + v_zip(rgba0, rgba1, rrggbbaa, dummy); \ + v_expand(rrggbbaa, rrggbbaa8, dummy8); \ + result = v_reinterpret_as_s16(rrggbbaa8); \ + } +#define CV_PICK_AND_PACK4(base,offset) \ + v_uint16x8(*(ushort*)(base + offset[0]), *(ushort*)(base + offset[1]), \ + *(ushort*)(base + offset[2]), *(ushort*)(base + offset[3]), \ + 0, 0, 0, 0) if( cn == 1 ) { for( ; x <= width - 8; x += 8 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8)); - __m128i v0, v1, v2, v3, a0, a1, b0, b1; - unsigned i0, i1; + v_int16x8 _xy0 = v_load(XY + x*2); + v_int16x8 _xy1 = v_load(XY + x*2 + 8); + v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2; - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - xy1 = _mm_madd_epi16( xy1, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - _mm_store_si128( (__m128i*)iofs1, xy1 ); + v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); + v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs ); + v_store( iofs0, xy0 ); + v_store( iofs1, xy1 ); - i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16); - i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16); - v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16); - i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16); - v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - v0 = _mm_unpacklo_epi8(v0, z); - v1 = _mm_unpacklo_epi8(v1, z); + v_uint16x8 stub, dummy; + v_uint16x8 vec16; + vec16 = CV_PICK_AND_PACK4(S0, iofs0); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v0 = v_reinterpret_as_s32(stub); + vec16 = CV_PICK_AND_PACK4(S1, iofs0); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v1 = v_reinterpret_as_s32(stub); - a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4))); - a1 = 
_mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4))); - b0 = _mm_unpacklo_epi64(a0, a1); - b1 = _mm_unpackhi_epi64(a0, a1); - v0 = _mm_madd_epi16(v0, b0); - v1 = _mm_madd_epi16(v1, b1); - v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); + v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1); + v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1); + v_recombine(a0, b0, a2, b2); + v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta); + v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1); - i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16); - i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16); - v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16); - i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16); - v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - v2 = _mm_unpacklo_epi8(v2, z); - v3 = _mm_unpacklo_epi8(v3, z); + vec16 = CV_PICK_AND_PACK4(S0, iofs1); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v2 = v_reinterpret_as_s32(stub); + vec16 = CV_PICK_AND_PACK4(S1, iofs1); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v3 = v_reinterpret_as_s32(stub); - a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4))); - a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4))); - b0 = _mm_unpacklo_epi64(a0, a1); - b1 = _mm_unpackhi_epi64(a0, a1); - v2 = _mm_madd_epi16(v2, b0); - v3 = _mm_madd_epi16(v3, b1); - v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); + v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1); + v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1); + v_recombine(c0, d0, c2, d2); + v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta); + v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3); - v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); - v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); - v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); - _mm_storel_epi64( (__m128i*)(D + x), v0 ); + v0 = v0 >> INTER_REMAP_COEF_BITS; + v2 = v2 >> INTER_REMAP_COEF_BITS; + v_pack_u_store(D + x, v_pack(v0, v2)); } } else if( cn == 3 ) { for( ; x <= width - 5; x += 4, D += 12 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i u0, v0, u1, v1; + v_int16x8 u0, v0, u1, v1; + v_int16x8 _xy0 = v_load(XY + x * 2); - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - const __m128i *w0, *w1; - w0 = (const __m128i*)(wtab + FXY[x]*16); - w1 = (const __m128i*)(wtab + FXY[x+1]*16); + v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs); + v_store(iofs0, xy0); - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = 
_mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_slli_si128(u0, 4); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1)); + int offset0 = FXY[x] * 16; + int offset1 = FXY[x + 1] * 16; + int offset2 = FXY[x + 2] * 16; + int offset3 = FXY[x + 3] * 16; + v_int16x8 w00 = v_load(wtab + offset0); + v_int16x8 w01 = v_load(wtab + offset0 + 8); + v_int16x8 w10 = v_load(wtab + offset1); + v_int16x8 w11 = v_load(wtab + offset1 + 8); - w0 = (const __m128i*)(wtab + FXY[x+2]*16); - w1 = (const __m128i*)(wtab + FXY[x+3]*16); + CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0); + CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0); + CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1); + CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1); - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_slli_si128(u0, 4); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1)); + v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + + result0 = v_rotate_left<1>(result0); + v_int16x8 result8 = v_pack(result0, result1); + v_uint8x16 result16 = v_pack_u(result8, result8); + v_store_low(D, v_rotate_right<1>(result16)); + + + w00 = v_load(wtab + offset2); + w01 = v_load(wtab + offset2 + 8); + w10 = v_load(wtab + offset3); + w11 = v_load(wtab + offset3 + 8); + CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0); + CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0); + CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1); + CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1); + + result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + + result0 = v_rotate_left<1>(result0); + result8 = v_pack(result0, result1); + result16 = v_pack_u(result8, result8); + v_store_low(D + 6, v_rotate_right<1>(result16)); } } else if( cn == 4 ) { for( ; x <= width - 4; x += 4, D += 16 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i u0, v0, u1, v1; + v_int16x8 _xy0 = v_load(XY + x * 2); + v_int16x8 u0, v0, u1, v1; - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - const __m128i *w0, *w1; - w0 = (const __m128i*)(wtab + FXY[x]*16); 
- w1 = (const __m128i*)(wtab + FXY[x+1]*16); + v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); + v_store(iofs0, xy0); + int offset0 = FXY[x] * 16; + int offset1 = FXY[x + 1] * 16; + int offset2 = FXY[x + 2] * 16; + int offset3 = FXY[x + 3] * 16; - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)D, u0); + v_int16x8 w00 = v_load(wtab + offset0); + v_int16x8 w01 = v_load(wtab + offset0 + 8); + v_int16x8 w10 = v_load(wtab + offset1); + v_int16x8 w11 = v_load(wtab + offset1 + 8); + CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0); + CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0); + CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1); + CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1); - w0 = (const __m128i*)(wtab + FXY[x+2]*16); - w1 = (const __m128i*)(wtab + FXY[x+3]*16); + v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int16x8 result8 = v_pack(result0, result1); + v_pack_u_store(D, result8); - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)(D + 8), u0); + w00 = v_load(wtab + offset2); + w01 = v_load(wtab + offset2 + 8); + w10 = v_load(wtab + offset3); + w11 = v_load(wtab + offset3 + 8); + CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0); + CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0); + CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1); + CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1); + + result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result8 = v_pack(result0, result1); + v_pack_u_store(D + 8, result8); } } @@ -660,7 +662,7 @@ static void 
remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy, unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0); CV_Assert( ssize.area() > 0 ); -#if CV_SSE2 +#if CV_SIMD128 if( _src.type() == CV_8UC3 ) width1 = std::max(ssize.width-2, 0); #endif @@ -1091,9 +1093,9 @@ public: int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); int bcols0 = std::min(buf_size/brows0, dst->cols); brows0 = std::min(buf_size/bcols0, dst->rows); - #if CV_SSE2 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); +#endif Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; if( !nnfunc ) @@ -1139,29 +1141,24 @@ public: const float* sY = m2->ptr<float>(y+y1) + x; x1 = 0; - #if CV_SSE2 + #if CV_SIMD128 if( useSIMD ) { - for( ; x1 <= bcols - 8; x1 += 8 ) + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - __m128 fx0 = _mm_loadu_ps(sX + x1); - __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); - __m128 fy0 = _mm_loadu_ps(sY + x1); - __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); - __m128i ix0 = _mm_cvtps_epi32(fx0); - __m128i ix1 = _mm_cvtps_epi32(fx1); - __m128i iy0 = _mm_cvtps_epi32(fy0); - __m128i iy1 = _mm_cvtps_epi32(fy1); - ix0 = _mm_packs_epi32(ix0, ix1); - iy0 = _mm_packs_epi32(iy0, iy1); - ix1 = _mm_unpacklo_epi16(ix0, iy0); - iy1 = _mm_unpackhi_epi16(ix0, iy0); - _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); - _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); + v_int32x4 ix0 = v_round(v_load(sX + x1)); + v_int32x4 iy0 = v_round(v_load(sY + x1)); + v_int32x4 ix1 = v_round(v_load(sX + x1 + span)); + v_int32x4 iy1 = v_round(v_load(sY + x1 + span)); + + v_int16x8 dx, dy; + dx = v_pack(ix0, ix1); + dy = v_pack(iy0, iy1); + v_store_interleave(XY + x1 * 2, dx, dy); } } - #endif - + #endif for( ; x1 < bcols; x1++ ) { XY[x1*2] = saturate_cast<short>(sX[x1]); @@ -1186,16 +1183,15 @@ public: const ushort* sA = m2->ptr<ushort>(y+y1) + x; x1 = 0; - #if CV_NEON - uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); - for ( ; x1 <= bcols - 8; x1 += 8) - vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); - #elif CV_SSE2 - __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1); - for ( ; x1 <= bcols - 8; x1 += 8) - _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale)); - #endif - + #if CV_SIMD128 + if (useSIMD) + { + v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1); + int span = v_uint16x8::nlanes; + for( ; x1 <= bcols - span; x1 += span ) + v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale); + } + #endif for( ; x1 < bcols; x1++ ) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } @@ -1205,60 +1201,29 @@ public: const float* sY = m2->ptr<float>(y+y1) + x; x1 = 0; - #if CV_SSE2 + #if CV_SIMD128 if( useSIMD ) { - __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE); - __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - for( ; x1 <= bcols - 8; x1 += 8 ) + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1); + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - __m128 fx0 = _mm_loadu_ps(sX + x1); - __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); - __m128 fy0 = _mm_loadu_ps(sY + x1); - __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); - __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale)); - __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale)); - __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale)); - __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale)); - __m128i mx0 = _mm_and_si128(ix0, mask); -
__m128i mx1 = _mm_and_si128(ix1, mask); - __m128i my0 = _mm_and_si128(iy0, mask); - __m128i my1 = _mm_and_si128(iy1, mask); - mx0 = _mm_packs_epi32(mx0, mx1); - my0 = _mm_packs_epi32(my0, my1); - my0 = _mm_slli_epi16(my0, INTER_BITS); - mx0 = _mm_or_si128(mx0, my0); - _mm_storeu_si128((__m128i*)(A + x1), mx0); - ix0 = _mm_srai_epi32(ix0, INTER_BITS); - ix1 = _mm_srai_epi32(ix1, INTER_BITS); - iy0 = _mm_srai_epi32(iy0, INTER_BITS); - iy1 = _mm_srai_epi32(iy1, INTER_BITS); - ix0 = _mm_packs_epi32(ix0, ix1); - iy0 = _mm_packs_epi32(iy0, iy1); - ix1 = _mm_unpacklo_epi16(ix0, iy0); - iy1 = _mm_unpackhi_epi16(ix0, iy0); - _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); - _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); + v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1)); + v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1)); + v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span)); + v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span)); + v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2)); + v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2)); + v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8); + v_store(A + x1, v_v); + + v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1)); + v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1)); + v_store_interleave(XY + (x1 << 1), v_d0, v_d1); } } - #elif CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); - - for( ; x1 <= bcols - 4; x1 += 4 ) - { - int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), - v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); - int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, - vandq_s32(v_sy, v_scale2)); - vst1_u16(A + x1, vqmovun_s32(v_v)); - - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); - } - #endif - + #endif for( ; x1 < bcols; x1++ ) { int sx = cvRound(sX[x1]*INTER_TAB_SIZE); @@ -1274,26 +1239,33 @@ public: const float* sXY = m1->ptr<float>(y+y1) + x*2; x1 = 0; - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); - int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); - - for( ; x1 <= bcols - 4; x1 += 4 ) + #if CV_SIMD128 + if( useSIMD ) { - float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); - int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); - int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); - int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, - vandq_s32(v_sy, v_scale2)); - vst1_u16(A + x1, vqmovun_s32(v_v)); - - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) + { + v_float32x4 v_fx, v_fy; + v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy); + v_int32x4 v_sx0 = v_round(v_fx * v_scale); + v_int32x4 v_sy0 = v_round(v_fy * v_scale); + v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy); + v_int32x4 v_sx1 = v_round(v_fx * v_scale); + v_int32x4 v_sy1 = v_round(v_fy * v_scale); +
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2)); + v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2)); + v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1)); + v_store(A + x1, v_v8); + v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1)); + v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1)); + v_store_interleave(XY + (x1 << 1), v_dx, v_dy); + } } - #endif + #endif - for( x1 = 0; x1 < bcols; x1++ ) + for( ; x1 < bcols; x1++ ) { int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); @@ -1915,8 +1887,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, size.height = 1; } -#if CV_SSE2 - bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); #endif #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif @@ -1941,67 +1913,75 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { if( nninterpolate ) { - #if CV_NEON - for( ; x <= size.width - 8; x += 8 ) - { - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4)))); - - vst2q_s16(dst1 + (x << 1), v_dst); - } - #elif CV_TRY_SSE4_1 + #if CV_TRY_SSE4_1 if (useSSE4_1) opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width); else #endif - for( ; x < size.width; x++ ) { - dst1[x*2] = saturate_cast<short>(src1f[x]); - dst1[x*2+1] = saturate_cast<short>(src2f[x]); + #if CV_SIMD128 + if( useSIMD ) + { + int span = v_int16x8::nlanes; + for( ; x <= size.width - span; x += span ) + { + v_int16x8 v_dst[2]; + #define CV_PACK_MAP(X) v_pack(v_round(v_load(X)), v_round(v_load((X)+4))) + v_dst[0] = CV_PACK_MAP(src1f + x); + v_dst[1] = CV_PACK_MAP(src2f + x); + #undef CV_PACK_MAP + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + } + } + #endif + for( ; x < size.width; x++ ) + { + dst1[x*2] = saturate_cast<short>(src1f[x]); + dst1[x*2+1] = saturate_cast<short>(src2f[x]); + } } } else { - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - - for( ; x <= size.width - 8; x += 8 ) - { - int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale)); - int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale)); - int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale)); - int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale)); - - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); - - vst2q_s16(dst1 + (x << 1), v_dst); - - uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), - vandq_s32(v_ix0, v_mask))); - uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), - vandq_s32(v_ix1, v_mask))); - vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); - } - #elif CV_TRY_SSE4_1 + #if CV_TRY_SSE4_1 if (useSSE4_1) opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width); else #endif - for( ; x < size.width; x++ ) { - int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE); - int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE); -
dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); - dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); - dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + #if CV_SIMD128 + if( useSIMD ) + { + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_float32x4::nlanes; + for( ; x <= size.width - span * 2; x += span * 2 ) + { + v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x))); + v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span))); + v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x))); + v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span))); + + v_int16x8 v_dst[2]; + v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1)); + v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1)); + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + + v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)); + v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)); + v_store(dst2 + x, v_pack_u(v_dst0, v_dst1)); + } + } + #endif + for( ; x < size.width; x++ ) + { + int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE); + int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE); + dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + } } } } @@ -2009,16 +1989,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { if( nninterpolate ) { - #if CV_NEON - for( ; x <= (size.width << 1) - 8; x += 8 ) - vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); - #elif CV_SSE2 - for( ; x <= (size.width << 1) - 8; x += 8 ) - { - _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); - } + #if CV_SIMD128 + int span = v_float32x4::nlanes; + if( useSIMD ) + for( ; x <= (size.width << 1) - span * 2; x += span * 2 ) + v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)), + v_round(v_load(src1f + x + span)))); #endif for( ; x < size.width; x++ ) { @@ -2028,118 +2004,92 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, } else { - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - - for( ; x <= size.width - 8; x += 8 ) - { - float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8); - int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale)); - int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale)); - int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale)); - int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale)); - - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); - - vst2q_s16(dst1 + (x << 1), v_dst); - - uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), - vandq_s32(v_ix0, v_mask))); - uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), - vandq_s32(v_ix1, v_mask))); - vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); - } - #elif CV_TRY_SSE4_1 -
if (useSSE4_1) + #if CV_TRY_SSE4_1 + if( useSSE4_1 ) opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width); else #endif - for( ; x < size.width; x++ ) { - int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE); - int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE); - dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); - dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); - dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + #if CV_SIMD128 + if( useSIMD ) + { + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_uint16x8::nlanes; + for (; x <= size.width - span; x += span ) + { + v_float32x4 v_src0[2], v_src1[2]; + v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]); + v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]); + v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale); + v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale); + v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale); + v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale); + + v_int16x8 v_dst[2]; + v_dst[0] = v_pack(v_shr<INTER_BITS>(v_ix0), v_shr<INTER_BITS>(v_ix1)); + v_dst[1] = v_pack(v_shr<INTER_BITS>(v_iy0), v_shr<INTER_BITS>(v_iy1)); + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + + v_store(dst2 + x, v_pack_u( + v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)), + v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)))); + } + } + #endif + for( ; x < size.width; x++ ) + { + int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE); + int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE); + dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + } } } } else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 ) { - #if CV_NEON - uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1); - uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1); - float32x4_t v_scale = vdupq_n_f32(scale); - - for( ; x <= size.width - 8; x += 8) + #if CV_SIMD128 + if( useSIMD ) { - uint32x4_t v_fxy1, v_fxy2; - if (src2) + v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1); + v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1); + v_float32x4 v_scale = v_setall_f32(scale); + int span = v_float32x4::nlanes; + for( ; x <= size.width - span * 2; x += span * 2 ) { - uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2); - v_fxy1 = vmovl_u16(vget_low_u16(v_src2)); - v_fxy2 = vmovl_u16(vget_high_u16(v_src2)); + v_uint32x4 v_fxy1, v_fxy2; + if ( src2 ) + { + v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2; + v_expand(v_src2, v_fxy1, v_fxy2); + } + else + v_fxy1 = v_fxy2 = v_zero; + + v_int16x8 v_src[2]; + v_int32x4 v_src0[2], v_src1[2]; + v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); + v_expand(v_src[0], v_src0[0], v_src0[1]); + v_expand(v_src[1], v_src1[0], v_src1[1]); + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\ v_cvt_f32(v_reinterpret_as_s32(X))) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\ v_cvt_f32(v_reinterpret_as_s32(Y))) + v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); + v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); + v_store(dst1f + x, v_dst1); + v_store(dst2f + x, v_dst2); + + v_dst1 = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2); + v_dst2 = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2); +
v_store(dst1f + x + span, v_dst1); + v_store(dst2f + x + span, v_dst2); + #undef CV_COMPUTE_MAP_X + #undef CV_COMPUTE_MAP_Y } - else - v_fxy1 = v_fxy2 = v_zero; - - int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); - float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), - v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask))); - float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), - v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS))); - vst1q_f32(dst1f + x, v_dst1); - vst1q_f32(dst2f + x, v_dst2); - - v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), - v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask))); - v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), - v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS))); - vst1q_f32(dst1f + x + 4, v_dst1); - vst1q_f32(dst2f + x + 4, v_dst2); - } - #elif CV_SSE2 - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); - - for( ; x <= size.width - 16; x += 16) - { - __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); - __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); - __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); - - _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); - - __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - - v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); } #endif for( ; x < size.width; x++ ) @@ -2151,56 +2101,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, } else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 ) { - #if CV_NEON - int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1); - int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1); - float32x4_t v_scale = vdupq_n_f32(scale); - - for( ; x <= size.width - 8; x += 8) + #if CV_SIMD128 + if( useSIMD ) { - int32x4_t v_fxy1, v_fxy2; - if (src2) + v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1); + v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1); + v_float32x4 v_scale = v_setall_f32(scale); + int span = v_int16x8::nlanes; + for( ; x <= size.width - span; x += span ) { - int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2); - v_fxy1 = vmovl_s16(vget_low_s16(v_src2)); - v_fxy2 = vmovl_s16(vget_high_s16(v_src2)); - } - else - v_fxy1 = v_fxy2 = v_zero; + v_int32x4 v_fxy1, v_fxy2; + if (src2) + { + v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2; + v_expand(v_src2, v_fxy1, v_fxy2); + } + else + v_fxy1 = v_fxy2 = v_zero; - int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); - float32x4x2_t v_dst; - v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), - v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask))); - v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), - v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS))); - vst2q_f32(dst1f + (x << 1), v_dst); + v_int16x8 v_src[2]; + v_int32x4 v_src0[2], v_src1[2]; + v_float32x4 v_dst[2]; + v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); + v_expand(v_src[0], v_src0[0], v_src0[1]); + v_expand(v_src[1], v_src1[0], v_src1[1]); - v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), - v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask))); - v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), - v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); - vst2q_f32(dst1f + (x << 1) + 8, v_dst); - } - #elif CV_SSE2 - if (useSSE2) - { - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X)) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y)) + v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); + v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); + v_store_interleave(dst1f + (x << 1), 
v_dst[0], v_dst[1]); - for ( ; x <= size.width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); - __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); - - __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); - - v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + v_dst[0] = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2); + v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2); + v_store_interleave(dst1f + (x << 1) + span, v_dst[0], v_dst[1]); + #undef CV_COMPUTE_MAP_X + #undef CV_COMPUTE_MAP_Y } } #endif @@ -2242,8 +2178,8 @@ public: #if CV_TRY_AVX2 bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #endif - #if CV_SSE2 - bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #if CV_SIMD128 + bool useSIMD = hasSIMD128(); #endif #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; #endif @@ -2272,94 +2208,70 @@ public: if( interpolation == INTER_NEAREST ) { x1 = 0; - #if CV_NEON - int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0); - for( ; x1 <= bw - 8; x1 += 8 ) - { - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS))); - - vst2q_s16(xy + (x1 << 1), v_dst); - } - #elif CV_TRY_SSE4_1 - if (useSSE4_1) + #if CV_TRY_SSE4_1 + if( useSSE4_1 ) opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw); else #endif - for( ; x1 < bw; x1++ ) { - int X = (X0 + adelta[x+x1]) >> AB_BITS; - int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; - xy[x1*2] = saturate_cast<short>(X); - xy[x1*2+1] = saturate_cast<short>(Y); + #if CV_SIMD128 + if( useSIMD ) + { + v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0); + int span = v_uint16x8::nlanes; + for( ; x1 <= bw - span; x1 += span ) + { + v_int16x8 v_dst[2]; + #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\ v_shr<AB_BITS>(shift+v_load(ptr + offset + 4))) + v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0); + v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0); + #undef CV_CONVERT_MAP + v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]); } } + #endif + for( ; x1 < bw; x1++ ) { + int X = (X0 + adelta[x+x1]) >> AB_BITS; + int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; + xy[x1*2] = saturate_cast<short>(X); + xy[x1*2+1] = saturate_cast<short>(Y); + } } } else { short* alpha = A + y1*bw; x1 = 0; - #if CV_TRY_AVX2 + #if CV_TRY_AVX2 if ( useAVX2 ) x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); - #endif - #if CV_SSE2 - if( useSSE2 ) + #endif + #if CV_SIMD128 + if( useSIMD ) { - __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); - __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); - for( ; x1 <= bw - 8; x1 += 8 ) + v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + int span = v_float32x4::nlanes; + for( ; x1 <= bw - span * 2; x1 += span * 2 ) { - __m128i tx0,
tx1, ty0, ty1; - tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX); - ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY); - tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX); - ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY); + v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1)); + v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1)); + v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span)); + v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span)); - tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS); - ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS); - tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS); - ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS); + v_int16x8 v_xy[2]; + v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1)); + v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1)); + v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]); - __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask), - _mm_and_si128(tx1, fxy_mask)); - __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask), - _mm_and_si128(ty1, fxy_mask)); - tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS), - _mm_srai_epi32(tx1, INTER_BITS)); - ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS), - _mm_srai_epi32(ty1, INTER_BITS)); - fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS)); - - _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0)); - _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0)); - _mm_storeu_si128((__m128i*)(alpha + x1), fx_); + v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask); + v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask); + v_store(alpha + x1, v_pack(v_alpha0, v_alpha1)); } } - #elif CV_NEON - int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - for( ; x1 <= bw - 8; x1 += 8 ) - { - int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS); - int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS); - int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS); - int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS); - - int16x8x2_t v_xy; - v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS))); - v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS))); - - vst2q_s16(xy + (x1 << 1), v_xy); - - int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS), - vandq_s32(v_X0, v_mask))); - int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS), - vandq_s32(v_X1, v_mask))); - vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1)); - } - #endif + #endif for( ; x1 < bw; x1++ ) { int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
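For orientation, here is a minimal scalar sketch (not part of the patch) of the fixed-point split that the vectorized warp-affine block above performs; it assumes the constants' values in imgwarp.cpp (AB_BITS = 10, INTER_BITS = 5), and the sample coordinate and increment are arbitrary:

#include <cstdio>

int main()
{
    const int AB_BITS = 10, INTER_BITS = 5;          // values used in imgwarp.cpp
    const int INTER_TAB_SIZE = 1 << INTER_BITS;

    int X0 = 37 << AB_BITS;                          // hypothetical base coordinate
    int adelta = 1 << (AB_BITS - 1);                 // hypothetical per-pixel increment

    // Reduce precision from AB_BITS to INTER_BITS, as v_shr<AB_BITS - INTER_BITS> does.
    int X = (X0 + adelta) >> (AB_BITS - INTER_BITS);

    short ix = (short)(X >> INTER_BITS);             // integer pixel position
    int fx = X & (INTER_TAB_SIZE - 1);               // fractional index into the interpolation table
    printf("ix=%d fx=%d\n", ix, fx);                 // prints: ix=37 fx=16
    return 0;
}

The per-pixel alpha values pack the two fractional parts as (fy << INTER_BITS) | fx, which is what v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask) computes, eight pixels per packed store.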