diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 25cd0d2f68..c8dfbc039d 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) ocv_add_dispatched_file(convert SSE2 AVX2 VSX3) ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) -ocv_add_dispatched_file(matmul SSE2 AVX2) +ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX) ocv_add_dispatched_file(mean SSE2 AVX2) ocv_add_dispatched_file(merge SSE2 AVX2) ocv_add_dispatched_file(split SSE2 AVX2) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index c821dc3991..8f6c982c72 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1431,6 +1431,28 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a) inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x4 v_cvt_f64(const v_int64x4& v) +{ + // constants encoded as floating-point + __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52 + __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 + __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all); + + // Blend the 32 least significant bits of v with magic_i_lo + __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55); + // Extract the 32 most significant bits of v + __m256i v_hi = _mm256_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); + // Compute in double precision + __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); + return v_float64x4(result); +} + ////////////// Lookup table access //////////////////// inline v_int8x32 v256_lut(const schar* tab, const int* idx) @@ -1638,12 +1660,165 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) ////////// Matrix operations ///////// +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } - inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) { return v_dotprod(a, b) + c; } +// 32 >> 64 +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) +{ + __m256i even = _mm256_mul_epi32(a.val, b.val); + __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32)); + return v_int64x4(_mm256_add_epi64(even, odd)); +} +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i even_m = _mm256_set1_epi32(0xFF00FF00); + __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m); + __m256i odd_a = _mm256_srli_epi16(a.val, 8); + + __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m); + __m256i odd_b = _mm256_srli_epi16(b.val, 8); + + __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
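+ // Each 16-bit lane of even_a/odd_a holds a zero-extended byte, so the pairwise + // products summed by _mm256_madd_epi16 are at most 2*255*255 and cannot overflow + // the signed 32-bit result lanes; the signed madd is therefore safe for unsigned inputs.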
+ __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b); + return v_uint32x8(_mm256_add_epi32(prod0, prod1)); +} +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) +{ + __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8); + __m256i odd_a = _mm256_srai_epi16(a.val, 8); + + __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8); + __m256i odd_b = _mm256_srai_epi16(b.val, 8); + + __m256i prod0 = _mm256_madd_epi16(even_a, even_b); + __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b); + return v_int32x8(_mm256_add_epi32(prod0, prod1)); +} +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = _mm256_mullo_epi16(a.val, b.val); + __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val); + __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi); + __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi); + + __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA); + __m256i p13 = _mm256_srli_epi64(mul0, 32); + __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA); + __m256i p57 = _mm256_srli_epi64(mul1, 32); + + __m256i p15_ = _mm256_add_epi64(p02, p13); + __m256i p9d_ = _mm256_add_epi64(p46, p57); + + return v_uint64x4(_mm256_add_epi64( + _mm256_unpacklo_epi64(p15_, p9d_), + _mm256_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = _mm256_madd_epi16(a.val, b.val); + __m256i sign = _mm256_srai_epi32(prod, 31); + + __m256i lo = _mm256_unpacklo_epi32(prod, sign); + __m256i hi = _mm256_unpackhi_epi32(prod, sign); + + return v_int64x4(_mm256_add_epi64( + _mm256_unpacklo_epi64(lo, hi), + _mm256_unpackhi_epi64(lo, hi) + )); +} +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b) +{ return v_dotprod(a, b); } +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod(a, b); } +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const 
v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = _mm256_mullo_epi16(a.val, b.val); + __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val); + __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi); + __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi); + + __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA); + __m256i p13 = _mm256_srli_epi64(mul0, 32); + __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA); + __m256i p57 = _mm256_srli_epi64(mul1, 32); + + __m256i p15_ = _mm256_add_epi64(p02, p13); + __m256i p9d_ = _mm256_add_epi64(p46, p57); + + return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); +} +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = _mm256_madd_epi16(a.val, b.val); + __m256i sign = _mm256_srai_epi32(prod, 31); + __m256i lo = _mm256_unpacklo_epi32(prod, sign); + __m256i hi = _mm256_unpackhi_epi32(prod, sign); + return v_int64x4(_mm256_add_epi64(lo, hi)); +} +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b, c); } + #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \ v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index d4edf0cdd1..844c546e38 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -1473,6 +1473,32 @@ inline v_float64x8 v_cvt_f64(const v_float32x16& a) inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) { return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x8 v_cvt_f64(const v_int64x8& v) +{ +#if CV_AVX_512DQ + return v_float64x8(_mm512_cvtepi64_pd(v.val)); +#else + // constants encoded as floating-point + __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52 + __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63 + __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all); + + // Blend the 32 least significant bits of v with magic_i_lo + __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val); + // Extract the 32 most significant bits of v + __m512i v_hi = _mm512_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm512_xor_si512(v_hi, magic_i_hi32); + // Compute in double precision + __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo)); + return v_float64x8(result); +#endif +} + ////////////// Lookup table access //////////////////// inline v_int8x64 v512_lut(const schar* tab, const int* idx)
@@ -1672,12 +1698,152 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) ////////// Matrix operations ///////// +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } - inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) { return v_dotprod(a, b) + c; } +// 32 >> 64 +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) +{ + __m512i even = _mm512_mul_epi32(a.val, b.val); + __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512()); + __m512i odd_a = _mm512_srli_epi16(a.val, 8); + + __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512()); + __m512i odd_b = _mm512_srli_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_uint32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) +{ + __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8); + __m512i odd_a = _mm512_srai_epi16(a.val, 8); + + __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8); + __m512i odd_b = _mm512_srai_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_int32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + + return v_uint64x8(_mm512_add_epi64( + _mm512_unpacklo_epi64(p15_, p9d_), + _mm512_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) +{ + __m512i prod = _mm512_madd_epi16(a.val, b.val); + __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32); + __m512i odd = _mm512_srai_epi64(prod, 32); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, 
const v_int32x16& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod(a, b); } +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod(a, b); } +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); +} +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + + #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 75c39acf39..9b3dc84681 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -171,7 +171,8 @@ Different type conversions and casts: ### Matrix operations -In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4 +In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast, +@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4 ### Usability @@ -195,7 +196,10 @@ Regular integers: |mul_expand | x | x | x | x | x | | |compare | x | x | 
x | x | x | x | |shift | | | x | x | x | x | -|dotprod | | | | x | | | +|dotprod | | | | x | | x | +|dotprod_fast | | | | x | | x | +|dotprod_expand | x | x | x | x | | x | +|dotprod_expand_fast| x | x | x | x | | x | |logical | x | x | x | x | x | x | |min, max | x | x | x | x | x | x | |absdiff | x | x | x | x | x | x | @@ -222,6 +226,7 @@ Big integers: |logical | x | x | |extract | x | x | |rotate (lanes) | x | x | +|cvt_flt64 | | x | Floating point: @@ -853,17 +858,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, /** @brief Dot product of elements Multiply values in two registers and sum adjacent result pairs. + Scheme: @code {A1 A2 ...} // 16-bit x {B1 B2 ...} // 16-bit ------------- {A1B1+A2B2 ...} // 32-bit + @endcode -Implemented only for 16-bit signed source type (v_int16x8). */ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { typedef typename V_TypeTraits<_Tp>::w_type w_type; v_reg<w_type, n/2> c; @@ -881,12 +887,11 @@ Scheme: x {B1 B2 ...} // 16-bit ------------- {A1B1+A2B2+C1 ...} // 32-bit - @endcode -Implemented only for 16-bit signed source type (v_int16x8). */ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) { typedef typename V_TypeTraits<_Tp>::w_type w_type; v_reg<w_type, n/2> s; @@ -895,6 +900,95 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n return s; } +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod, but on some platforms the result pairs may be summed in an unspecified order. +Use this intrinsic when only the sum over all lanes matters; +on the affected platforms it should also yield better performance. + +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod(a, b); } + +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod_fast, but adds a third element to the sum of adjacent pairs. +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) +{ return v_dotprod(a, b, c); } + +/** @brief Dot product of elements and expand + +Multiply values in two registers and expand the sum of adjacent result pairs. + +Scheme: +@code + {A1 A2 A3 A4 ...} // 8-bit +x {B1 B2 B3 B4 ...} // 8-bit +------------- + {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit + +@endcode +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg<q_type, n/4> s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3]; + return s; +} + +/** @brief Dot product of elements and expand + +Same as cv::v_dotprod_expand, but adds a third element to the sum of adjacent pairs.
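+ +A minimal usage sketch with the unsigned 8-bit overload (v_setall_u8 and v_setzero_u32 are the usual initializer intrinsics from this header): +@code +v_uint8x16 a = v_setall_u8(1); +v_uint8x16 b = v_setall_u8(2); +v_uint32x4 s = v_dotprod_expand(a, b, v_setzero_u32()); // each lane: 1*2 + 1*2 + 1*2 + 1*2 + 0 = 8 +@endcode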
+Scheme: +@code + {A1 A2 A3 A4 ...} // 8-bit +x {B1 B2 B3 B4 ...} // 8-bit +------------- + {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit +@endcode +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg<q_type, n/4> s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i]; + return s; +} + +/** @brief Fast Dot product of elements and expand + +Multiply values in two registers and expand the sum of adjacent result pairs. + +Same as cv::v_dotprod_expand, but on some platforms the result pairs may be summed in an unspecified order. +Use this intrinsic when only the sum over all lanes matters; +on the affected platforms it should also yield better performance. + +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod_expand(a, b); } + +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod_expand_fast, but adds a third element to the sum of adjacent pairs. +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c) +{ return v_dotprod_expand(a, b, c); } + /** @brief Multiply and expand Multiply values in two registers and store results in two registers with wider pack type. @@ -1810,6 +1904,17 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n>& a) return c; } +/** @brief Convert to double + +Supported input type is cv::v_int64x2. */ +template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a) +{ + v_reg<double, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) { v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp index 6873633165..979f15a277 100644 --- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -160,6 +160,16 @@ void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __ void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); #endif +// Conversions +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a); +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a); +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a); +__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a); +__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a); + /** Cleanup **/ #undef CV__SIMD_FORWARD #undef __CV_VX diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 6a964841f8..5ece9c131e 100755 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -461,15 +461,124 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16)); } +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return
v_int32x4(msa_dotp_s_w(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { - return v_int32x4(msa_dotp_s_w(a.val, b.val)); + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dotp_u_w(even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); } -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { - return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); + v8i16 prod = msa_dotp_s_h(a.val, b.val); + return v_int32x4(msa_hadd_s32(prod, prod)); } +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dotp_u_d(even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, + const v_uint64x2& c) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v4i32 prod = msa_dotp_s_w(a.val, b.val); + return v_int64x2(msa_hadd_s64(prod, prod)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 
v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ @@ -1311,6 +1420,11 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) return v_float64x2(msa_cvtfhq_f64_f32(a.val)); } +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + return v_float64x2(msa_cvtfintq_f64_s64(a.val)); +} + ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) { diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index e5f707ca57..3e8321aca3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -62,23 +62,63 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128_64F 0 #endif +// TODO +#define CV_NEON_DOT 0 + +//////////// Utils //////////// + +#if CV_SIMD128_64F +#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); } +#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \ + { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); } +#else +#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; } +#define 
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; } +#endif + #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \ -template <typename T> static inline \ -_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \ -template <typename T> static inline \ -float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; } -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64) -OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32) + template <typename T> static inline \ + _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \ + template <typename T> static inline \ + float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; } +#else +#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) #endif +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \ + OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix) + +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix) + +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) + +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64) +#endif + +//////////// Types //////////// + struct v_uint8x16 { typedef uchar lane_type; @@ -528,20 +568,272 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) )); } +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { - int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); - int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); - int32x4x2_t cd = vuzpq_s32(c, d); - return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); + int16x8_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int16x4_t a0 = vget_low_s16(uzp1); + int16x4_t b0 = vget_high_s16(uzp1); + int16x4_t a1 = vget_low_s16(uzp2); + int16x4_t b1 = vget_high_s16(uzp2); + int32x4_t p = vmull_s16(a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); } - inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { - v_int32x4 s = v_dotprod(a, b); - return v_int32x4(vaddq_s32(s.val , c.val)); + int16x8_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int16x4_t a0 = vget_low_s16(uzp1); + int16x4_t b0 = vget_high_s16(uzp1); + int16x4_t a1 = vget_low_s16(uzp2); + int16x4_t b1 = vget_high_s16(uzp2); + int32x4_t p =
vmlal_s16(c.val, a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); } +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + int32x4_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int32x2_t a0 = vget_low_s32(uzp1); + int32x2_t b0 = vget_high_s32(uzp1); + int32x2_t a1 = vget_low_s32(uzp2); + int32x2_t b1 = vget_high_s32(uzp2); + int64x2_t p = vmull_s32(a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int32x4_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int32x2_t a0 = vget_low_s32(uzp1); + int32x2_t b0 = vget_high_s32(uzp1); + int32x2_t a1 = vget_low_s32(uzp2); + int32x2_t b1 = vget_high_s32(uzp2); + int64x2_t p = vmlal_s32(c.val, a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); +#else + const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0)); + const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF)); + const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0)); + const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF)); + + uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)), + vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero))); + uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8), + vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8)); + + uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)), + vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32))); + uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16), + vshrq_n_u32(vreinterpretq_u32_u16(odd), 16)); + return v_uint32x4(vaddq_u32(s0, s1)); +#endif +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, + const v_uint32x4& c) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); +#else + return v_dotprod_expand(a, b) + c; +#endif +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); +#else + int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); + int16x8_t uzp1, uzp2; + _v128_unzip(p0, p1, uzp1, uzp2); + int16x8_t sum = vaddq_s16(uzp1, uzp2); + int16x4_t uzpl1, uzpl2; + _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2); + return v_int32x4(vaddl_s16(uzpl1, uzpl2)); +#endif +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(c.val, a.val, b.val)); +#else + return v_dotprod_expand(a, b) + c; +#endif +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0)); + const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF)); + + uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)), + vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero))); + uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16), + vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16)); + uint32x4_t uzp1, uzp2; + _v128_unzip(even, odd, uzp1, uzp2); + uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1)); + uint64x2_t s1 = 
vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2)); + return v_uint64x2(vaddq_u64(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); + + int32x4_t uzp1, uzp2; + _v128_unzip(p0, p1, uzp1, uzp2); + int32x4_t sum = vaddq_s32(uzp1, uzp2); + + int32x2_t uzpl1, uzpl2; + _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2); + return v_int64x2(vaddl_s32(uzpl1, uzpl2)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, + const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, + const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } +#endif + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ + int16x4_t a0 = vget_low_s16(a.val); + int16x4_t a1 = vget_high_s16(a.val); + int16x4_t b0 = vget_low_s16(b.val); + int16x4_t b1 = vget_high_s16(b.val); + int32x4_t p = vmull_s16(a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); +} +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + int16x4_t a0 = vget_low_s16(a.val); + int16x4_t a1 = vget_high_s16(a.val); + int16x4_t b0 = vget_low_s16(b.val); + int16x4_t b1 = vget_high_s16(b.val); + int32x4_t p = vmlal_s16(c.val, a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); +} + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ + int32x2_t a0 = vget_low_s32(a.val); + int32x2_t a1 = vget_high_s32(a.val); + int32x2_t b0 = vget_low_s32(b.val); + int32x2_t b1 = vget_high_s32(b.val); + int64x2_t p = vmull_s32(a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int32x2_t a0 = vget_low_s32(a.val); + int32x2_t a1 = vget_high_s32(a.val); + int32x2_t b0 = vget_low_s32(b.val); + int32x2_t b1 = vget_high_s32(b.val); + int64x2_t p = vmlal_s32(c.val, a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); +#else + uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); + uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); + uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1)); + uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1)); + return v_uint32x4(vaddq_u32(s0, s1)); +#endif +} +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); +#else + return v_dotprod_expand_fast(a, b) + c; +#endif +} + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); +#else + int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val)); + 
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod))); +#endif +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(c.val, a.val, b.val)); +#else + return v_dotprod_expand_fast(a, b) + c; +#endif +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)); + uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); + uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0)); + uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1)); + return v_uint64x2(vaddq_u64(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val)); + return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod))); +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod_fast(a, b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } +#endif + + #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ @@ -1593,6 +1885,10 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val))); } + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ return v_float64x2(vcvtq_f64_s64(a.val)); } + #endif ////////////// Lookup table access //////////////////// diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index f661c58010..c4de1195b5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -225,9 +225,13 @@ struct v_uint64x2 } uint64 get0() const { + #if !defined(__x86_64__) && !defined(_M_X64) int a = _mm_cvtsi128_si32(val); int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (unsigned)a | ((uint64)(unsigned)b << 32); + #else + return (uint64)_mm_cvtsi128_si64(val); + #endif } __m128i val; @@ -247,9 +251,13 @@ struct v_int64x2 } int64 get0() const { + #if !defined(__x86_64__) && !defined(_M_X64) int a = _mm_cvtsi128_si32(val); int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #else + return _mm_cvtsi128_si64(val); + #endif } __m128i val; @@ -791,15 +799,195 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); } -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) -{ - return v_int32x4(_mm_madd_epi16(a.val, b.val)); -} +//////// Dot Product //////// +// 16 >> 32 +inline v_int32x4 
v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { - return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val)); +#if CV_SSE4_1 + __m128i even = _mm_mul_epi32(a.val, b.val); + __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + return v_int64x2(_mm_add_epi64(even, odd)); +#else + __m128i even_u = _mm_mul_epu32(a.val, b.val); + __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132) + __m128i a_sign = _mm_srai_epi32(a.val, 31); + __m128i b_sign = _mm_srai_epi32(b.val, 31); + // |x * sign of x + __m128i axb = _mm_and_si128(a.val, b_sign); + __m128i bxa = _mm_and_si128(b.val, a_sign); + // sum of sign corrections + __m128i ssum = _mm_add_epi32(bxa, axb); + __m128i even_ssum = _mm_slli_epi64(ssum, 32); + __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0)); + // convert to signed and prod + return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum))); +#endif } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even + __m128i a1 = _mm_srli_epi16(a.val, 8); // odd + __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8); + __m128i b1 = _mm_srli_epi16(b.val, 8); + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_uint32x4(_mm_add_epi32(p0, p1)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even + __m128i a1 = _mm_srai_epi16(a.val, 8); // odd + __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8); + __m128i b1 = _mm_srai_epi16(b.val, 8); + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_int32x4(_mm_add_epi32(p0, p1)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 c, d; + v_mul_expand(a, b, c, d); + + v_uint64x2 c0, c1, d0, d1; + v_expand(c, c0, c1); + v_expand(d, d0, d1); + + c0 += c1; d0 += d1; + return v_uint64x2(_mm_add_epi64( + _mm_unpacklo_epi64(c0.val, d0.val), + _mm_unpackhi_epi64(c0.val, d0.val) + )); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return v_int64x2(_mm_add_epi64( + _mm_unpacklo_epi64(c.val, d.val), + _mm_unpackhi_epi64(c.val, d.val) + )); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f 
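+// Note: _mm_mul_epi32 (the signed 32x32->64 multiply) requires SSE4.1, so the pre-SSE4.1 +// path below converts each 32-bit lane to double (an exact conversion) and forms the +// products and pairwise sums directly in double precision.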
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ +#if CV_SSE4_1 + return v_cvt_f64(v_dotprod(a, b)); +#else + v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); + v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + + return v_float64x2(_mm_add_pd( + _mm_unpacklo_pd(c.val, d.val), + _mm_unpackhi_pd(c.val, d.val) + )); +#endif +} +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod_fast(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = v_expand_low(a).val; + __m128i a1 = v_expand_high(a).val; + __m128i b0 = v_expand_low(b).val; + __m128i b1 = v_expand_high(b).val; + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_uint32x4(_mm_add_epi32(p0, p1)); +} +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_SSE4_1 + __m128i a0 = _mm_cvtepi8_epi16(a.val); + __m128i a1 = v_expand_high(a).val; + __m128i b0 = _mm_cvtepi8_epi16(b.val); + __m128i b1 = v_expand_high(b).val; + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_int32x4(_mm_add_epi32(p0, p1)); +#else + return v_dotprod_expand(a, b); +#endif +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 c, d; + v_mul_expand(a, b, c, d); + + v_uint64x2 c0, c1, d0, d1; + v_expand(c, c0, c1); + v_expand(d, d0, d1); + + c0 += c1; d0 += d1; + return c0 + d0; +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return c + d; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ @@ -2739,6 +2927,32 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) return 
v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x2 v_cvt_f64(const v_int64x2& v) +{ + // constants encoded as floating-point + __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 + __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m128d magic_d_all = _mm_castsi128_pd(magic_i_all); + // Blend the 32 least significant bits of v with magic_i_lo +#if CV_SSE4_1 + __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 + __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc); +#else + __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52 + __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo); +#endif + // Extract the 32 most significant bits of v + __m128i v_hi = _mm_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm_xor_si128(v_hi, magic_i_hi32); + // Compute in double precision + __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)); + return v_float64x2(result); +} + ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 4f41021875..0d65ca5e7a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -499,12 +499,6 @@ inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d) v_zip(p0, p1, c, d); } -inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d) -{ - c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); - d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)); -} - inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { vec_int4 p0 = vec_mule(a.val, b.val); @@ -1043,14 +1037,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } -// The altivec intrinsic is missing for this 2.06 insn inline v_float64x2 v_cvt_f64(const v_int64x2& a) -{ -vec_double2 out; - -__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val)); -return v_float64x2(out); -} +{ return v_float64x2(vec_ctd(a.val)); } ////////////// Lookup table access //////////////////// @@ -1322,12 +1310,134 @@ inline void v_cleanup() {} ////////// Matrix operations ///////// +//////// Dot Product //////// +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } - inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { return v_int32x4(vec_msum(a.val, b.val, c.val)); } +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + vec_dword2 even = vec_mule(a.val, b.val); + vec_dword2 odd = vec_mulo(a.val, b.val); + return v_int64x2(vec_add(even, odd)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); } +inline
v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + const vec_ushort8 eight = vec_ushort8_sp(8); + vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even + vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd + vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); + vec_short8 b1 = vec_sra((vec_short8)b.val, eight); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ + const vec_ushort8 eight = vec_ushort8_sp(8); + vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even + vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd + vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); + vec_short8 b1 = vec_sra((vec_short8)b.val, eight); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val))); +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + const vec_uint4 zero = vec_uint4_z; + vec_uint4 even = vec_mule(a.val, b.val); + vec_uint4 odd = vec_mulo(a.val, b.val); + vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero); + vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero); + vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero); + vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero); + vec_udword2 s0 = vec_add(e0, o0); + vec_udword2 s1 = vec_add(e1, o1); + return v_uint64x2(vec_add(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; } +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ + vec_short8 a0 = vec_unpackh(a.val); + vec_short8 a1 = vec_unpackl(a.val); + vec_short8 b0 = vec_unpackh(b.val); + vec_short8 b1 = vec_unpackl(b.val); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, 
 
 inline v_float32x4 v_matmul(const v_float32x4& v,
                             const v_float32x4& m0, const v_float32x4& m1,
                             const v_float32x4& m2, const v_float32x4& m3)
diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index 302d834147..f2da617cfe 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -682,6 +682,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
     return s;
 }
 
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> s;
+    for( int i = 0; i < (n/4); i++ )
+        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
+                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
+    return s;
+}
+
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                     const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> s;
+    for( int i = 0; i < (n/4); i++ )
+        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
+                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
+    return s;
+}
+
 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
@@ -1282,6 +1305,14 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return c;
 }
 
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    v_float64x2 c;
+    for( int i = 0; i < 2; i++ )
+        c.s[i] = (double)a.s[i];
+    return c;
+}
+
 template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
 {
     v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
@@ -2398,6 +2429,8 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
     return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
 }
 
+//////// Dot Product ////////
+
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
     v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
@@ -2410,15 +2443,140 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 }
 
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b) + c; }
+
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
 {
-    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
-    v128_t a1 = wasm_i32x4_shr(a.val, 16);
-    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
-    v128_t b1 = wasm_i32x4_shr(b.val, 16);
-    v128_t d = wasm_i32x4_mul(a0, b0);
-    v128_t e = wasm_i32x4_mul(a1, b1);
-    return v_int32x4(wasm_i32x4_add(wasm_i32x4_add(d, e), c.val));
+#ifdef __wasm_unimplemented_simd128__
+    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
+    v128_t a1 = wasm_i64x2_shr(a.val, 32);
+    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
+    v128_t b1 = wasm_i64x2_shr(b.val, 32);
+    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
+    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
+    return v_int64x2(wasm_i64x2_add(c, d));
+#else
+    fallback::v_int32x4 a_(a);
+    fallback::v_int32x4 b_(b);
+    return fallback::v_dotprod(a_, b_);
+#endif
 }
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+#ifdef __wasm_unimplemented_simd128__
+    return v_dotprod(a, b) + c;
+#else
+    fallback::v_int32x4 a_(a);
+    fallback::v_int32x4 b_(b);
+    fallback::v_int64x2 c_(c);
+    return fallback::v_dotprod(a_, b_, c_);
+#endif
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    v128_t a1 = wasm_u16x8_shr(a.val, 8);
+    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    v128_t b1 = wasm_u16x8_shr(b.val, 8);
+    return v_uint32x4((
+        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
+        v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
+    );
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    v128_t a1 = wasm_i16x8_shr(a.val, 8);
+    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    v128_t b1 = wasm_i16x8_shr(b.val, 8);
+    return v_int32x4(
+        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
+        v_dotprod(v_int16x8(a1), v_int16x8(b1))
+    );
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    fallback::v_uint16x8 a_(a);
+    fallback::v_uint16x8 b_(b);
+    return fallback::v_dotprod_expand(a_, b_);
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    fallback::v_uint16x8 a_(a);
+    fallback::v_uint16x8 b_(b);
+    fallback::v_uint64x2 c_(c);
+    return fallback::v_dotprod_expand(a_, b_, c_);
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    fallback::v_int16x8 a_(a);
+    fallback::v_int16x8 b_(b);
+    return fallback::v_dotprod_expand(a_, b_);
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    fallback::v_int16x8 a_(a);
+    fallback::v_int16x8 b_(b);
+    fallback::v_int64x2 c_(c);
+    return fallback::v_dotprod_expand(a_, b_, c_);
+}
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
 
 #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
 OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
@@ -3815,6 +3973,16 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return fallback::v_cvt_f64_high(a_);
 }
 
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+#ifdef __wasm_unimplemented_simd128__
+    return v_float64x2(wasm_convert_f64x2_i64x2(a.val));
+#else
+    fallback::v_int64x2 a_(a);
+    return fallback::v_cvt_f64(a_);
+#endif
+}
+
 ////////////// Lookup table access ////////////////////
 
 inline v_int8x16 v_lut(const schar* tab, const int* idx)
diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp
index 0f1029de83..91669bff31 100644
--- a/modules/core/include/opencv2/core/vsx_utils.hpp
+++ b/modules/core/include/opencv2/core/vsx_utils.hpp
@@ -144,10 +144,10 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
 VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)
 
 // dword2 support arrived in ISA 2.07 and GCC 8+
-    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mule)
-    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mule)
-    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mulo)
-    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mulo)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mule)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mulo)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
 #endif
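A note on the vsx_utils.hpp change above: these VSX_IMPL_2VRG macros emit the raw vmule*/vmulo* instructions for compilers that lack the dword2 builtins, and on little-endian POWER the hardware's big-endian element numbering is reversed relative to what vec_mule/vec_mulo promise, so "even" must map to the odd instruction and vice versa. A rough scalar picture of what the builtins must return, with memory-order lane indices (mule_mulo_ref is a hypothetical name for illustration):

#include <cstdint>

// What vec_mule/vec_mulo are expected to produce for int32x4 inputs,
// indices counted in memory order.
static void mule_mulo_ref(const int32_t a[4], const int32_t b[4],
                          int64_t even[2], int64_t odd[2])
{
    for (int i = 0; i < 2; ++i)
    {
        even[i] = (int64_t)a[2*i]     * b[2*i];     // lanes 0, 2
        odd[i]  = (int64_t)a[2*i + 1] * b[2*i + 1]; // lanes 1, 3
    }
}

For v_dotprod itself the swap is invisible (even + odd commutes), but callers that depend on lane placement, such as v_mul_hi and the 16 >> 64 expansion above, need the mapping to match this model.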
diff --git a/modules/core/perf/perf_dot.cpp b/modules/core/perf/perf_dot.cpp
index 5321962319..1230220a4c 100644
--- a/modules/core/perf/perf_dot.cpp
+++ b/modules/core/perf/perf_dot.cpp
@@ -9,7 +9,7 @@ typedef TestBaseWithParam<MatType_Length_t> MatType_Length;
 
 PERF_TEST_P( MatType_Length, dot,
              testing::Combine(
-                 testing::Values( CV_8UC1, CV_32SC1, CV_32FC1 ),
+                 testing::Values( CV_8UC1, CV_8SC1, CV_16SC1, CV_16UC1, CV_32SC1, CV_32FC1 ),
                  testing::Values( 32, 64, 128, 256, 512, 1024 )
                  ))
 {
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index 5d188029b1..1360ed46d6 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2320,26 +2320,22 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
-        v_int32 v_sum = vx_setzero_s32();
+        v_uint32 v_sum = vx_setzero_u32();
         const int cWidth = v_uint16::nlanes;
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            v_uint16 v_src10, v_src20, v_src11, v_src21;
-            v_expand(vx_load(src1 + j), v_src10, v_src11);
-            v_expand(vx_load(src2 + j), v_src20, v_src21);
-
-            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
-            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
+            v_uint8 v_src1 = vx_load(src1 + j);
+            v_uint8 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
         }
 
         for (; j <= blockSize - cWidth; j += cWidth)
         {
             v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
             v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
-
-            v_sum += v_dotprod(v_src10, v_src20);
+            v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
         }
         r += (double)v_reduce_sum(v_sum);
@@ -2348,48 +2344,6 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
         i += blockSize;
     }
     vx_cleanup();
-#elif CV_NEON
-    if( cv::checkHardwareSupport(CV_CPU_NEON) )
-    {
-        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
-        uint32x4_t v_zero = vdupq_n_u32(0u);
-        CV_DECL_ALIGNED(16) uint buf[4];
-
-        while( i < len0 )
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            uint32x4_t v_sum = v_zero;
-
-            int j = 0;
-            for( ; j <= blockSize - 16; j += 16 )
-            {
-                uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
-
-                uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
-
-                v_src10 = vmovl_u8(vget_high_u8(v_src1));
-                v_src20 = vmovl_u8(vget_high_u8(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
-            }
-
-            for( ; j <= blockSize - 8; j += 8 )
-            {
-                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
-            }
-
-            vst1q_u32(buf, v_sum);
-            r += buf[0] + buf[1] + buf[2] + buf[3];
-
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
-    }
 #endif
     return r + dotProd_(src1, src2, len - i);
 }
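Why blockSize0 = (1 << 15) is safe in the 8-bit path above: every product is at most 255 * 255 = 65025, and a single u32 lane can never accumulate more than the whole block, i.e. at most 2^15 * 65025 < 2^32. A compile-time restatement of that bound, illustrative only and not part of the patch:

#include <cstdint>

// Worst case for dotProd_8u blocking: one lane absorbing the entire block.
static_assert((uint64_t)(1 << 15) * 255 * 255 < ((uint64_t)1 << 32),
              "a 32-bit lane cannot overflow within one 2^15-element block");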
@@ -2412,20 +2366,16 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            v_int16 v_src10, v_src20, v_src11, v_src21;
-            v_expand(vx_load(src1 + j), v_src10, v_src11);
-            v_expand(vx_load(src2 + j), v_src20, v_src21);
-
-            v_sum += v_dotprod(v_src10, v_src20);
-            v_sum += v_dotprod(v_src11, v_src21);
+            v_int8 v_src1 = vx_load(src1 + j);
+            v_int8 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
         }
 
         for (; j <= blockSize - cWidth; j += cWidth)
         {
-            v_int16 v_src10 = vx_load_expand(src1 + j);
-            v_int16 v_src20 = vx_load_expand(src2 + j);
-
-            v_sum += v_dotprod(v_src10, v_src20);
+            v_int16 v_src1 = vx_load_expand(src1 + j);
+            v_int16 v_src2 = vx_load_expand(src2 + j);
+            v_sum = v_dotprod_fast(v_src1, v_src2, v_sum);
         }
         r += (double)v_reduce_sum(v_sum);
@@ -2434,87 +2384,6 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
         i += blockSize;
     }
     vx_cleanup();
-#elif CV_NEON
-    if( cv::checkHardwareSupport(CV_CPU_NEON) )
-    {
-        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
-        int32x4_t v_zero = vdupq_n_s32(0);
-        CV_DECL_ALIGNED(16) int buf[4];
-
-        while( i < len0 )
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            int32x4_t v_sum = v_zero;
-
-            int j = 0;
-            for( ; j <= blockSize - 16; j += 16 )
-            {
-                int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
-
-                int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
-
-                v_src10 = vmovl_s8(vget_high_s8(v_src1));
-                v_src20 = vmovl_s8(vget_high_s8(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
-            }
-
-            for( ; j <= blockSize - 8; j += 8 )
-            {
-                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
-            }
-
-            vst1q_s32(buf, v_sum);
-            r += buf[0] + buf[1] + buf[2] + buf[3];
-
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
-    }
-#elif CV_MSA
-    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
-    v4i32 v_zero = msa_dupq_n_s32(0);
-    CV_DECL_ALIGNED(16) int buf[4];
-
-    while( i < len0 )
-    {
-        blockSize = std::min(len0 - i, blockSize0);
-        v4i32 v_sum = v_zero;
-
-        int j = 0;
-        for( ; j <= blockSize - 16; j += 16 )
-        {
-            v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
-
-            v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
-
-            v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
-            v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
-        }
-
-        for( ; j <= blockSize - 8; j += 8 )
-        {
-            v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
-        }
-
-        msa_st1q_s32(buf, v_sum);
-        r += buf[0] + buf[1] + buf[2] + buf[3];
-
-        src1 += blockSize;
-        src2 += blockSize;
-        i += blockSize;
-    }
 #endif
 
     return r + dotProd_(src1, src2, len - i);
@@ -2522,42 +2391,97 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
 {
-    return dotProd_(src1, src2, len);
+    double r = 0.0;
+    int i = 0;
+
+#if CV_SIMD
+    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
+
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_uint64 v_sum = vx_setzero_u64();
+        const int cWidth = v_uint16::nlanes;
+
+        int j = 0;
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_uint16 v_src1 = vx_load(src1 + j);
+            v_uint16 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
+        }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
+    vx_cleanup();
+#endif
+    return r + dotProd_(src1, src2, len - i);
 }
 
 double dotProd_16s(const short* src1, const short* src2, int len)
 {
-    return dotProd_(src1, src2, len);
+    double r = 0.0;
+    int i = 0;
+
+#if CV_SIMD
+    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
+
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int64 v_sum = vx_setzero_s64();
+        const int cWidth = v_int16::nlanes;
+
+        int j = 0;
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src1 = vx_load(src1 + j);
+            v_int16 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
+        }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
+    vx_cleanup();
+#endif
+    return r + dotProd_(src1, src2, len - i);
 }
 
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
-#if CV_SIMD128_64F
-    double r = 0.0;
+#if CV_SIMD_64F
+    double r = 0.0;
     int i = 0;
-    int lenAligned = len & -v_int32x4::nlanes;
-    v_float64x2 a(0.0, 0.0);
-    v_float64x2 b(0.0, 0.0);
-
-    for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
-    {
-        v_int32x4 s1 = v_load(src1);
-        v_int32x4 s2 = v_load(src2);
-
-#if CV_VSX
-        // Do 32x32->64 multiplies, convert/round to double, accumulate
-        // Potentially less precise than FMA, but 1.5x faster than fma below.
-        a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
-        b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
-#else
-        a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
-        b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
+    const int step = v_int32::nlanes;
+    v_float64 v_sum0 = vx_setzero_f64();
+#if CV_SIMD_WIDTH == 16
+    const int wstep = step * 2;
+    v_float64 v_sum1 = vx_setzero_f64();
+    for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
+    {
+        v_int32 v_src10 = vx_load(src1);
+        v_int32 v_src20 = vx_load(src2);
+        v_int32 v_src11 = vx_load(src1 + step);
+        v_int32 v_src21 = vx_load(src2 + step);
+        v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
+        v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
+    }
+    v_sum0 += v_sum1;
 #endif
-        src1 += v_int32x4::nlanes;
-        src2 += v_int32x4::nlanes;
-    }
-    a += b;
-    r = v_reduce_sum(a);
+    for (; i < len - step; i += step, src1 += step, src2 += step)
+    {
+        v_int32 v_src1 = vx_load(src1);
+        v_int32 v_src2 = vx_load(src2);
+        v_sum0 = v_dotprod_expand_fast(v_src1, v_src2, v_sum0);
+    }
+    r = v_reduce_sum(v_sum0);
+    vx_cleanup();
     return r + dotProd_(src1, src2, len - i);
 #else
     return dotProd_(src1, src2, len);
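The CV_SIMD_WIDTH == 16 branch in dotProd_32s above unrolls by two with independent accumulators, so consecutive v_dotprod_expand_fast calls do not serialize on a single register. The same idea in plain scalar form, as an illustrative sketch only:

#include <cstddef>
#include <cstdint>

// Two independent accumulators halve the length of the add dependency chain.
static int64_t dot_i32_unrolled(const int32_t* a, const int32_t* b, size_t n)
{
    int64_t s0 = 0, s1 = 0;
    size_t i = 0;
    for (; i + 2 <= n; i += 2)
    {
        s0 += (int64_t)a[i]     * b[i];
        s1 += (int64_t)a[i + 1] * b[i + 1];
    }
    if (i < n)
        s0 += (int64_t)a[i] * b[i];
    return s0 + s1;
}

With wider registers the extra accumulator buys little, which is presumably why the unroll is gated on the 128-bit case.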
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 06aeac867d..fcb6b93a3c 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -603,12 +603,14 @@ template<typename R> struct TheTest
         return *this;
     }
 
-    TheTest & test_dot_prod()
+    TheTest & test_dotprod()
     {
         typedef typename V_RegTraits<R>::w_reg Rx2;
         typedef typename Rx2::lane_type w_type;
 
-        Data<R> dataA, dataB(2);
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
         R a = dataA, b = dataB;
 
         Data<Rx2> dataC;
@@ -621,12 +623,95 @@
             resE = v_dotprod(a, b, c);
 
         const int n = R::nlanes / 2;
+        w_type sumAB = 0, sumABC = 0, tmp_sum;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
-            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
+
+            tmp_sum = (w_type)dataA[i*2] * (w_type)dataB[i*2] +
+                      (w_type)dataA[i*2 + 1] * (w_type)dataB[i*2 + 1];
+            sumAB += tmp_sum;
+            EXPECT_EQ(tmp_sum, resD[i]);
+
+            tmp_sum = tmp_sum + dataC[i];
+            sumABC += tmp_sum;
+            EXPECT_EQ(tmp_sum, resE[i]);
         }
+
+        w_type resF = v_reduce_sum(v_dotprod_fast(a, b)),
+               resG = v_reduce_sum(v_dotprod_fast(a, b, c));
+        EXPECT_EQ(sumAB, resF);
+        EXPECT_EQ(sumABC, resG);
+        return *this;
+    }
+
+    TheTest & test_dotprod_expand()
+    {
+        typedef typename V_RegTraits<R>::q_reg Rx4;
+        typedef typename Rx4::lane_type l4_type;
+
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+        R a = dataA, b = dataB;
+
+        Data<Rx4> dataC;
+        Rx4 c = dataC;
+
+        Data<Rx4> resD = v_dotprod_expand(a, b),
+                  resE = v_dotprod_expand(a, b, c);
+
+        l4_type sumAB = 0, sumABC = 0, tmp_sum;
+        for (int i = 0; i < Rx4::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            tmp_sum = (l4_type)dataA[i*4]     * (l4_type)dataB[i*4]     +
+                      (l4_type)dataA[i*4 + 1] * (l4_type)dataB[i*4 + 1] +
+                      (l4_type)dataA[i*4 + 2] * (l4_type)dataB[i*4 + 2] +
+                      (l4_type)dataA[i*4 + 3] * (l4_type)dataB[i*4 + 3];
+            sumAB += tmp_sum;
+            EXPECT_EQ(tmp_sum, resD[i]);
+
+            tmp_sum = tmp_sum + dataC[i];
+            sumABC += tmp_sum;
+            EXPECT_EQ(tmp_sum, resE[i]);
+        }
+
+        l4_type resF = v_reduce_sum(v_dotprod_expand_fast(a, b)),
+                resG = v_reduce_sum(v_dotprod_expand_fast(a, b, c));
+        EXPECT_EQ(sumAB, resF);
+        EXPECT_EQ(sumABC, resG);
+
+        return *this;
+    }
+
+    TheTest & test_dotprod_expand_f64()
+    {
+    #if CV_SIMD_64F
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min();
+        R a = dataA, b = dataB;
+
+        Data<v_float64> dataC;
+        v_float64 c = dataC;
+
+        Data<v_float64> resA = v_dotprod_expand(a, a),
+                        resB = v_dotprod_expand(b, b),
+                        resC = v_dotprod_expand(a, b, c);
+
+        const int n = R::nlanes / 2;
+        for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
+                      (double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
+            EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
+                      (double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
+            EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
+                      (double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
+        }
+    #endif
         return *this;
     }
 
@@ -1165,6 +1250,29 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_cvt64_double()
+    {
+#if CV_SIMD_64F
+        Data<R> dataA(std::numeric_limits<LaneType>::max()),
+                dataB(std::numeric_limits<LaneType>::min());
+        dataB += R::nlanes;
+
+        R a = dataA, b = dataB;
+        v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
+
+        Data<v_float64> resC = c;
+        Data<v_float64> resD = d;
+
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((double)dataA[i], resC[i]);
+            EXPECT_EQ((double)dataB[i], resD[i]);
+        }
+#endif
+        return *this;
+    }
+
     TheTest & test_matmul()
     {
         Data<R> dataV, dataA, dataB, dataC, dataD;
@@ -1341,6 +1449,7 @@ void test_hal_intrin_uint8()
         .test_mul_expand()
         .test_cmp()
         .test_logic()
+        .test_dotprod_expand()
         .test_min_max()
         .test_absdiff()
         .test_reduce_sad()
@@ -1378,6 +1487,7 @@ void test_hal_intrin_int8()
         .test_mul_expand()
         .test_cmp()
         .test_logic()
+        .test_dotprod_expand()
         .test_min_max()
         .test_absdiff()
         .test_absdiffs()
@@ -1408,6 +1518,7 @@ void test_hal_intrin_uint16()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
+        .test_dotprod_expand()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1437,7 +1548,8 @@ void test_hal_intrin_int16()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
-        .test_dot_prod()
+        .test_dotprod()
+        .test_dotprod_expand()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1497,6 +1609,8 @@ void test_hal_intrin_int32()
         .test_cmp()
         .test_popcount()
         .test_shift<1>().test_shift<8>()
+        .test_dotprod()
+        .test_dotprod_expand_f64()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1538,6 +1652,7 @@ void test_hal_intrin_int64()
         .test_logic()
         .test_extract<0>().test_extract<1>()
         .test_rotate<0>().test_rotate<1>()
+        .test_cvt64_double()
         ;
 }
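Taken together, the new intrinsics reduce a widening dot product to a three-call loop on every backend. A minimal usage sketch, built only on calls added or exercised by this patch and mirroring the structure of dotProd_16s without its blocking logic; the function name dot_i16 is illustrative:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>

// Sum of a[i]*b[i] over n int16 elements, accumulating in 64-bit lanes.
static int64_t dot_i16(const short* a, const short* b, int n)
{
    int64_t r = 0;
    int i = 0;
#if CV_SIMD
    cv::v_int64 v_sum = cv::vx_setzero_s64();
    for (; i <= n - cv::v_int16::nlanes; i += cv::v_int16::nlanes)
        v_sum = cv::v_dotprod_expand_fast(cv::vx_load(a + i), cv::vx_load(b + i), v_sum);
    r = cv::v_reduce_sum(v_sum);
    cv::vx_cleanup();
#endif
    for (; i < n; ++i) // scalar tail
        r += (int64_t)a[i] * b[i];
    return r;
}

A production version would also cap the block length the way dotProd_16s does, since int16 products can otherwise exhaust even 64-bit lanes for astronomically long inputs; for any realistic n the sketch above is overflow-safe.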