Merge remote-tracking branch 'upstream/3.4' into merge-3.4

This commit is contained in:
Alexander Alekhin
2018-12-03 18:38:27 +03:00
19 changed files with 301 additions and 158 deletions
@@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
return v_float32x8(_mm256_hadd_ps(ab, cd));
}
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val));
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
__m256i half = _mm256_set1_epi8(0x7f);
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)));
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
v_uint32x8 l, h;
v_expand(v_add_wrap(a - b, b - a), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
{
v_uint32x8 l, h;
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
{
return v_reduce_sum(v_max(a, b) - v_min(a, b));
}
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
{
v_int32x8 m = a < b;
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
}
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
{
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
}
/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
inline v_uint32x8 v_popcount(const _Tpvec& a) \
@@ -686,10 +686,10 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
template<int n>
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
{
typedef typename V_TypeTraits<float>::int_type itype;
v_reg<float, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
typedef typename V_TypeTraits<float>::int_type itype;
v_reg<float, n> c;
for (int i = 0; i < n; i++)
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
return c;
}
template<int n>
@@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return r;
}
/** @brief Sum absolute differences of values
Scheme:
@code
{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
@endcode
For all types except 64-bit types.*/
template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
for (int i = 1; i < n; i++)
c += _absdiff(a.s[i], b.s[i]);
return c;
}
/** @brief Get negative values mask
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
@@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return v_float32x4(vaddq_f32(v0, v1));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
uint32x4_t t0 = vabdq_u32(a.val, b.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
float32x4_t t0 = vabdq_f32(a.val, b.val);
float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
@@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
__m128i half = _mm_set1_epi8(0x7f);
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half),
_mm_add_epi8(b.val, half)));
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
v_uint32x4 l, h;
v_expand(v_absdiff(a, b), l, h);
return v_reduce_sum(l + h);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
return v_reduce_sum(v_absdiff(a, b));
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
return v_reduce_sum(v_absdiff(a, b));
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
return v_reduce_sum(v_absdiff(a, b));
}
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
@@ -1930,13 +1965,11 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 ab b3
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
@@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
return v_float32x4(vec_mergeh(ac, bd));
}
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
const vec_uint4 zero4 = vec_uint4_z;
vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
const vec_int4 zero4 = vec_int4_z;
vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
vec_int4 sum4 = vec_sum4s(ad, zero4);
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
vec_ushort8 ad = vec_absd(a.val, b.val);
VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
return (unsigned)vec_extract(sum, 3);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
{
const vec_int4 zero4 = vec_int4_z;
vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
vec_int4 sum4 = vec_sum4s(ad, zero4);
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
}
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
{
const vec_uint4 ad = vec_absd(a.val, b.val);
const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
{
vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
}
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
}
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
@@ -567,7 +567,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
@@ -588,7 +588,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
@@ -615,7 +615,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);