From ed7e4273cdc207f3f67ffa2692b728cb10de21b5 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Fri, 1 Nov 2019 15:30:48 -0400
Subject: [PATCH] Merge pull request #15555 from ChipKerchner:flipVectorize

* Vectorize flipHoriz and flipVert functions.

* Change v_load_mirror_1 to use vec_revb for VSX

* Only use vec_revb in ISA3.0

* Removing vec_revb code since some of the older compilers don't fully support it.

* Use new v_reverse intrinsic and cleanup code.

* Ensure there are no alignment issues with copies
---
 modules/core/src/copy.cpp | 229 +++++++++++++++++++++++++++++++++++---
 1 file changed, 216 insertions(+), 13 deletions(-)

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index c1478de763..3f68a2555a 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -563,25 +563,217 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
 
+#if CV_SIMD128
+// Reverses the order of the elements in each of size.height rows. V is a vector type
+// whose lane type T is moved as one element. The outer bytes of a row are exchanged one
+// vector at a time (v_reverse flips the lanes inside each vector); the bytes left in the
+// middle are exchanged per element, byte by byte when src or dst is not sizeof(T)-aligned.
+template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    typedef typename V::lane_type T;
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+    int width_1 = width & -v_uint8x16::nlanes;
+    int i, j;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+        {
+            V t0, t1;
+
+            t0 = v_load((T*)((uchar*)src + i));
+            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
+            t0 = v_reverse(t0);
+            t1 = v_reverse(t1);
+            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
+            v_store((T*)(dst + i), t1);
+        }
+        if (((size_t)src|(size_t)dst) % sizeof(T) == 0)
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                T t0, t1;
+
+                t0 = *((T*)((uchar*)src + i));
+                t1 = *((T*)((uchar*)src + j - sizeof(T)));
+                *((T*)(dst + j - sizeof(T))) = t0;
+                *((T*)(dst + i)) = t1;
+            }
+        }
+        else
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                for (int k = 0; k < (int)sizeof(T); k++)
+                {
+                    uchar t0, t1;
+
+                    t0 = *((uchar*)src + i + k);
+                    t1 = *((uchar*)src + j + k - sizeof(T));
+                    *(dst + j + k - sizeof(T)) = t0;
+                    *(dst + i + k) = t1;
+                }
+            }
+        }
+    }
+}
+
+// Reverses the order of the elements in each row for elements laid out as a T1 followed
+// by a T2 (sizeof(T1) + sizeof(T2) bytes each), exchanging front and back elements.
+// NOTE(review): T1/T2 are read and written through cast pointers with no alignment check.
+template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
+        {
+            T1 t0, t1;
+            T2 t2, t3;
+
+            t0 = *((T1*)((uchar*)src + i));
+            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
+            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
+            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
+            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
+            *((T2*)(dst + j - sizeof(T2))) = t2;
+            *((T1*)(dst + i)) = t1;
+            *((T2*)(dst + i + sizeof(T1))) = t3;
+        }
+    }
+}
+#endif
 
+// Mirrors every row left-to-right: element i trades places with element size.width - i - 1.
 static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-    int i, j, limit = (int)(((size.width + 1)/2)*esz);
-    AutoBuffer<int> _tab(size.width*esz);
-    int* tab = _tab.data();
-
-    for( i = 0; i < size.width; i++ )
-        for( size_t k = 0; k < esz; k++ )
-            tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
-
-    for( ; size.height--; src += sstep, dst += dstep )
+#if CV_SIMD
+    // Dispatch on the element size in bytes; sizes without a dedicated branch use the table loop at the end.
+    if (esz == 2 * v_uint8x16::nlanes)
     {
-        for( i = 0; i < limit; i++ )
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
         {
-            j = tab[i];
-            uchar t0 = src[i], t1 = src[j];
-            dst[i] = t1; dst[j] = t0;
+            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            {
+#if CV_SIMD256
+                v_uint8x32 t0, t1;
+
+                t0 = v256_load((uchar*)src + i);
+                t1 = v256_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+#else
+                v_uint8x16 t0, t1, t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t2 = v_load((uchar*)src + j);
+                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                v_store(dst + j, t0);
+                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + i, t2);
+                v_store(dst + i + v_uint8x16::nlanes, t3);
+#endif
+            }
+        }
+    }
+    else if (esz == v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            {
+                v_uint8x16 t0, t1;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+            }
+        }
+    }
+    else if (esz == 8)
+    {
+        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 4)
+    {
+        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 2)
+    {
+        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 1)
+    {
+        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 24)
+    {
+        int end = (int)(size.width*esz);
+        int width = (end + 1)/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            // NOTE(review): the uint64_t half is accessed through a cast pointer with no alignment check.
+            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            {
+                v_uint8x16 t0, t1;
+                uint64_t t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
+                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
+                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
+                v_store(dst + i, t1);
+                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+            }
+        }
+    }
+    else if (esz == 12)
+    {
+        flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 6)
+    {
+        flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 3)
+    {
+        flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
+    }
+    else
+#endif
+    {
+        // Generic path: tab[b] is the byte offset within a row that byte b is exchanged with.
+        int i, j, limit = (int)(((size.width + 1)/2)*esz);
+        AutoBuffer<int> _tab(size.width*esz);
+        int* tab = _tab.data();
+
+        for( i = 0; i < size.width; i++ )
+            for( size_t k = 0; k < esz; k++ )
+                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( i = 0; i < limit; i++ )
+            {
+                j = tab[i];
+                uchar t0 = src[i], t1 = src[j];
+                dst[i] = t1; dst[j] = t0;
+            }
         }
     }
 }
@@ -597,6 +789,17 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
                                                   dst0 += dstep, dst1 -= dstep )
     {
         int i = 0;
+#if CV_SIMD
+        // Exchange the two rows one v_int32 vector per iteration; the code after #endif continues from i.
+        for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
+        {
+            v_int32 t0 = vx_load((int*)(src0 + i));
+            v_int32 t1 = vx_load((int*)(src1 + i));
+            vx_store((int*)(dst0 + i), t1);
+            vx_store((int*)(dst1 + i), t0);
+        }
+#endif
+
         if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
         {
             for( ; i <= size.width - 16; i += 16 )