Merge remote-tracking branch 'upstream/3.4' into merge-3.4
This commit is contained in:
@@ -1133,6 +1133,41 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
|
||||
return v_float32x8(_mm256_hadd_ps(ab, cd));
|
||||
}
|
||||
|
||||
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
|
||||
{
|
||||
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
|
||||
{
|
||||
__m256i half = _mm256_set1_epi8(0x7f);
|
||||
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_add_wrap(a - b, b - a), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
|
||||
{
|
||||
v_uint32x8 l, h;
|
||||
v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
|
||||
{
|
||||
return v_reduce_sum(v_max(a, b) - v_min(a, b));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
|
||||
{
|
||||
v_int32x8 m = a < b;
|
||||
return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m));
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
|
||||
{
|
||||
return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
|
||||
}
|
||||
|
||||
/** Popcount **/
|
||||
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
|
||||
inline v_uint32x8 v_popcount(const _Tpvec& a) \
|
||||
|
||||
@@ -686,10 +686,10 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
|
||||
template<int n>
|
||||
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
|
||||
{
|
||||
typedef typename V_TypeTraits<float>::int_type itype;
|
||||
v_reg<float, n> c;
|
||||
for (int i = 0; i < n; i++)
|
||||
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
|
||||
typedef typename V_TypeTraits<float>::int_type itype;
|
||||
v_reg<float, n> c;
|
||||
for (int i = 0; i < n; i++)
|
||||
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
|
||||
return c;
|
||||
}
|
||||
template<int n>
|
||||
@@ -1063,6 +1063,21 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
|
||||
return r;
|
||||
}
|
||||
|
||||
/** @brief Sum absolute differences of values
|
||||
|
||||
Scheme:
|
||||
@code
|
||||
{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
|
||||
@endcode
|
||||
For all types except 64-bit types.*/
|
||||
template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
||||
{
|
||||
typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
|
||||
for (int i = 1; i < n; i++)
|
||||
c += _absdiff(a.s[i], b.s[i]);
|
||||
return c;
|
||||
}
|
||||
|
||||
/** @brief Get negative values mask
|
||||
|
||||
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
|
||||
|
||||
@@ -999,6 +999,49 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
|
||||
return v_float32x4(vaddq_f32(v0, v1));
|
||||
}
|
||||
|
||||
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{
|
||||
uint32x4_t t0 = vabdq_u32(a.val, b.val);
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
|
||||
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
|
||||
return vget_lane_u32(vpadd_u32(t1, t1), 0);
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
float32x4_t t0 = vabdq_f32(a.val, b.val);
|
||||
float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
|
||||
return vget_lane_f32(vpadd_f32(t1, t1), 0);
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
||||
{ \
|
||||
|
||||
@@ -1477,6 +1477,41 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
|
||||
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
|
||||
|
||||
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
__m128i half = _mm_set1_epi8(0x7f);
|
||||
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half),
|
||||
_mm_add_epi8(b.val, half)));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
v_uint32x4 l, h;
|
||||
v_expand(v_absdiff(a, b), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
v_uint32x4 l, h;
|
||||
v_expand(v_absdiff(a, b), l, h);
|
||||
return v_reduce_sum(l + h);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{
|
||||
return v_reduce_sum(v_absdiff(a, b));
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
return v_reduce_sum(v_absdiff(a, b));
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
return v_reduce_sum(v_absdiff(a, b));
|
||||
}
|
||||
|
||||
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a) \
|
||||
{ \
|
||||
@@ -1930,13 +1965,11 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
|
||||
|
||||
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
|
||||
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
|
||||
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
|
||||
|
||||
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
|
||||
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
|
||||
a.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
|
||||
b.val = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 ab b3
|
||||
}
|
||||
|
||||
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
|
||||
|
||||
@@ -739,6 +739,50 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
|
||||
return v_float32x4(vec_mergeh(ac, bd));
|
||||
}
|
||||
|
||||
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
|
||||
{
|
||||
const vec_uint4 zero4 = vec_uint4_z;
|
||||
vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
|
||||
return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
|
||||
{
|
||||
const vec_int4 zero4 = vec_int4_z;
|
||||
vec_char16 ad = vec_abss(vec_subs(a.val, b.val));
|
||||
vec_int4 sum4 = vec_sum4s(ad, zero4);
|
||||
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
|
||||
{
|
||||
vec_ushort8 ad = vec_absd(a.val, b.val);
|
||||
VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
|
||||
return (unsigned)vec_extract(sum, 3);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
|
||||
{
|
||||
const vec_int4 zero4 = vec_int4_z;
|
||||
vec_short8 ad = vec_abss(vec_subs(a.val, b.val));
|
||||
vec_int4 sum4 = vec_sum4s(ad, zero4);
|
||||
return (unsigned)vec_extract(vec_sums(sum4, zero4), 3);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
|
||||
{
|
||||
const vec_uint4 ad = vec_absd(a.val, b.val);
|
||||
const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
|
||||
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
|
||||
}
|
||||
inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
|
||||
{
|
||||
vec_int4 ad = vec_abss(vec_sub(a.val, b.val));
|
||||
return (unsigned)vec_extract(vec_sums(ad, vec_int4_z), 3);
|
||||
}
|
||||
inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
|
||||
{
|
||||
const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
|
||||
const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
|
||||
return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
|
||||
}
|
||||
|
||||
/** Popcount **/
|
||||
template<typename _Tpvec>
|
||||
inline v_uint32x4 v_popcount(const _Tpvec& a)
|
||||
|
||||
@@ -567,7 +567,7 @@ inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m
|
||||
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
@@ -588,7 +588,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m12
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
__m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
@@ -615,7 +615,7 @@ inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
|
||||
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
|
||||
__m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
|
||||
{
|
||||
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
|
||||
enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
|
||||
|
||||
__m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
|
||||
__m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
|
||||
|
||||
Reference in New Issue
Block a user