From 2112aa31e6f9d2687b590c2d98cee5ab5972ecfb Mon Sep 17 00:00:00 2001 From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com> Date: Tue, 5 Nov 2019 10:52:35 -0500 Subject: [PATCH] Merge pull request #15828 from ChipKerchner:momentsToHal * Convert moments in tile algorithms to HAL (1.3x faster for VSX). * Adding NEON code back in for non 64-bit platforms. * Remove floats from post processing. --- modules/imgproc/src/moments.cpp | 160 +++++++------------------------- 1 file changed, 35 insertions(+), 125 deletions(-) diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 1f1e5922cd..9e7e6d2dfd 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -38,8 +38,10 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ + #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" namespace cv { @@ -211,7 +213,7 @@ struct MomentsInTile_SIMD } }; -#if CV_SSE2 +#if CV_SIMD128 template <> struct MomentsInTile_SIMD @@ -226,115 +228,33 @@ struct MomentsInTile_SIMD int x = 0; { - __m128i dx = _mm_set1_epi16(8); - __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + v_int16x8 dx = v_setall_s16(8), qx = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7); + v_uint32x4 z = v_setzero_u32(), qx0 = z, qx1 = z, qx2 = z, qx3 = z; for( ; x <= len - 8; x += 8 ) { - __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); - __m128i sx = _mm_mullo_epi16(qx, qx); + v_int16x8 p = v_reinterpret_as_s16(v_load_expand(ptr + x)); + v_int16x8 sx = v_mul_wrap(qx, qx); - qx0 = _mm_add_epi16(qx0, p); - qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); - qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); - qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); + qx0 += v_reinterpret_as_u32(p); + qx1 = v_reinterpret_as_u32(v_dotprod(p, qx, v_reinterpret_as_s32(qx1))); + qx2 = v_reinterpret_as_u32(v_dotprod(p, sx, v_reinterpret_as_s32(qx2))); + qx3 = v_reinterpret_as_u32(v_dotprod(v_mul_wrap(p, qx), sx, v_reinterpret_as_s32(qx3))); - qx = _mm_add_epi16(qx, dx); + qx += dx; } - __m128i qx01_lo = _mm_unpacklo_epi32(qx0, qx1); - __m128i qx23_lo = _mm_unpacklo_epi32(qx2, qx3); - __m128i qx01_hi = _mm_unpackhi_epi32(qx0, qx1); - __m128i qx23_hi = _mm_unpackhi_epi32(qx2, qx3); - qx01_lo = _mm_add_epi32(qx01_lo, qx01_hi); - qx23_lo = _mm_add_epi32(qx23_lo, qx23_hi); - __m128i qx0123_lo = _mm_unpacklo_epi64(qx01_lo, qx23_lo); - __m128i qx0123_hi = _mm_unpackhi_epi64(qx01_lo, qx23_lo); - qx0123_lo = _mm_add_epi32(qx0123_lo, qx0123_hi); - _mm_store_si128((__m128i*)buf, qx0123_lo); - - x0 = (buf[0] & 0xffff) + (buf[0] >> 16); - x1 = buf[1]; - x2 = buf[2]; - x3 = buf[3]; + x0 = v_reduce_sum(qx0); + x0 = (x0 & 0xffff) + (x0 >> 16); + x1 = v_reduce_sum(qx1); + x2 = v_reduce_sum(qx2); + x3 = v_reduce_sum(qx3); } return x; } - - int CV_DECL_ALIGNED(16) buf[4]; }; -#elif CV_NEON - -template <> -struct MomentsInTile_SIMD -{ - MomentsInTile_SIMD() - { - ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 }; - qx_init = vld1_u16(init); - v_step = vdup_n_u16(4); - } - - int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3) - { - int x = 0; - - uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z, - v_x2 = v_z, v_x3 = v_z; - uint16x4_t qx = qx_init; - - for( ; x <= len - 8; x += 8 ) - { - uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x)); - - // first part - uint32x4_t v_qx = vmovl_u16(qx); - uint16x4_t v_p = vget_low_u16(v_src); - uint32x4_t v_px = vmull_u16(qx, v_p); - - v_x0 = vaddw_u16(v_x0, v_p); - v_x1 = vaddq_u32(v_x1, v_px); - v_px = vmulq_u32(v_px, v_qx); - v_x2 = vaddq_u32(v_x2, v_px); - v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); - qx = vadd_u16(qx, v_step); - - // second part - v_qx = vmovl_u16(qx); - v_p = vget_high_u16(v_src); - v_px = vmull_u16(qx, v_p); - - v_x0 = vaddw_u16(v_x0, v_p); - v_x1 = vaddq_u32(v_x1, v_px); - v_px = vmulq_u32(v_px, v_qx); - v_x2 = vaddq_u32(v_x2, v_px); - v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); - - qx = vadd_u16(qx, v_step); - } - - vst1q_u32(buf, v_x0); - x0 = buf[0] + buf[1] + buf[2] + buf[3]; - vst1q_u32(buf, v_x1); - x1 = buf[0] + buf[1] + buf[2] + buf[3]; - vst1q_u32(buf, v_x2); - x2 = buf[0] + buf[1] + buf[2] + buf[3]; - vst1q_u32(buf, v_x3); - x3 = buf[0] + buf[1] + buf[2] + buf[3]; - - return x; - } - - uint CV_DECL_ALIGNED(16) buf[4]; - uint16x4_t qx_init, v_step; -}; - -#endif - -#if CV_SSE4_1 - template <> struct MomentsInTile_SIMD { @@ -348,49 +268,39 @@ struct MomentsInTile_SIMD int x = 0; { - __m128i v_delta = _mm_set1_epi32(4), v_zero = _mm_setzero_si128(), v_x0 = v_zero, - v_x1 = v_zero, v_x2 = v_zero, v_x3 = v_zero, v_ix0 = _mm_setr_epi32(0, 1, 2, 3); + v_int32x4 v_delta = v_setall_s32(4), v_ix0 = v_int32x4(0, 1, 2, 3); + v_uint32x4 z = v_setzero_u32(), v_x0 = z, v_x1 = z, v_x2 = z; + v_uint64x2 v_x3 = v_reinterpret_as_u64(z); for( ; x <= len - 4; x += 4 ) { - __m128i v_src = _mm_loadl_epi64((const __m128i *)(ptr + x)); - v_src = _mm_unpacklo_epi16(v_src, v_zero); + v_int32x4 v_src = v_reinterpret_as_s32(v_load_expand(ptr + x)); - v_x0 = _mm_add_epi32(v_x0, v_src); - v_x1 = _mm_add_epi32(v_x1, _mm_mullo_epi32(v_src, v_ix0)); + v_x0 += v_reinterpret_as_u32(v_src); + v_x1 += v_reinterpret_as_u32(v_src * v_ix0); - __m128i v_ix1 = _mm_mullo_epi32(v_ix0, v_ix0); - v_x2 = _mm_add_epi32(v_x2, _mm_mullo_epi32(v_src, v_ix1)); + v_int32x4 v_ix1 = v_ix0 * v_ix0; + v_x2 += v_reinterpret_as_u32(v_src * v_ix1); - v_ix1 = _mm_mullo_epi32(v_ix0, v_ix1); - v_src = _mm_mullo_epi32(v_src, v_ix1); - v_x3 = _mm_add_epi64(v_x3, _mm_add_epi64(_mm_unpacklo_epi32(v_src, v_zero), _mm_unpackhi_epi32(v_src, v_zero))); + v_ix1 = v_ix0 * v_ix1; + v_src = v_src * v_ix1; + v_uint64x2 v_lo, v_hi; + v_expand(v_reinterpret_as_u32(v_src), v_lo, v_hi); + v_x3 += v_lo + v_hi; - v_ix0 = _mm_add_epi32(v_ix0, v_delta); + v_ix0 += v_delta; } - __m128i v_x01_lo = _mm_unpacklo_epi32(v_x0, v_x1); - __m128i v_x22_lo = _mm_unpacklo_epi32(v_x2, v_x2); - __m128i v_x01_hi = _mm_unpackhi_epi32(v_x0, v_x1); - __m128i v_x22_hi = _mm_unpackhi_epi32(v_x2, v_x2); - v_x01_lo = _mm_add_epi32(v_x01_lo, v_x01_hi); - v_x22_lo = _mm_add_epi32(v_x22_lo, v_x22_hi); - __m128i v_x0122_lo = _mm_unpacklo_epi64(v_x01_lo, v_x22_lo); - __m128i v_x0122_hi = _mm_unpackhi_epi64(v_x01_lo, v_x22_lo); - v_x0122_lo = _mm_add_epi32(v_x0122_lo, v_x0122_hi); - _mm_store_si128((__m128i*)buf64, v_x3); - _mm_store_si128((__m128i*)buf, v_x0122_lo); - - x0 = buf[0]; - x1 = buf[1]; - x2 = buf[2]; + x0 = v_reduce_sum(v_x0); + x1 = v_reduce_sum(v_x1); + x2 = v_reduce_sum(v_x2); + v_store_aligned(buf64, v_reinterpret_as_s64(v_x3)); x3 = buf64[0] + buf64[1]; } return x; } - int CV_DECL_ALIGNED(16) buf[4]; int64 CV_DECL_ALIGNED(16) buf64[2]; };