From ed7e4273cdc207f3f67ffa2692b728cb10de21b5 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Fri, 1 Nov 2019 15:30:48 -0400
Subject: [PATCH] Merge pull request #15555 from ChipKerchner:flipVectorize

* Vectorize flipHoriz and flipVert functions.

* Change v_load_mirror_1 to use vec_revb for VSX

* Only use vec_revb in ISA3.0

* Removing vec_revb code since some of the older compilers don't fully support it.

* Use new v_reverse intrinsic and cleanup code.

* Ensure there are no alignment issues with copies
---
 modules/core/src/copy.cpp | 217 +++++++++++++++++++++++++++++++++++---
 1 file changed, 204 insertions(+), 13 deletions(-)
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index c1478de763..3f68a2555a 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -563,25 +563,206 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
 
+#if CV_SIMD128
+template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    typedef typename V::lane_type T;
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+    int width_1 = width & -v_uint8x16::nlanes;
+    int i, j;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+        {
+            V t0, t1;
+
+            t0 = v_load((T*)((uchar*)src + i));
+            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
+            t0 = v_reverse(t0);
+            t1 = v_reverse(t1);
+            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
+            v_store((T*)(dst + i), t1);
+        }
+        if (((size_t)src|(size_t)dst) % sizeof(T) == 0)
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                T t0, t1;
+
+                t0 = *((T*)((uchar*)src + i));
+                t1 = *((T*)((uchar*)src + j - sizeof(T)));
+                *((T*)(dst + j - sizeof(T))) = t0;
+                *((T*)(dst + i)) = t1;
+            }
+        }
+        else
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                for (int k = 0; k < (int)sizeof(T); k++)
+                {
+                    uchar t0, t1;
+
+                    t0 = *((uchar*)src + i + k);
+                    t1 = *((uchar*)src + j + k - sizeof(T));
+                    *(dst + j + k - sizeof(T)) = t0;
+                    *(dst + i + k) = t1;
+                }
+            }
+        }
+    }
+}
+
+template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
+        {
+            T1 t0, t1;
+            T2 t2, t3;
+
+            t0 = *((T1*)((uchar*)src + i));
+            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
+            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
+            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
+            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
+            *((T2*)(dst + j - sizeof(T2))) = t2;
+            *((T1*)(dst + i)) = t1;
+            *((T2*)(dst + i + sizeof(T1))) = t3;
+        }
+    }
+}
+#endif
 
 static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-    int i, j, limit = (int)(((size.width + 1)/2)*esz);
-    AutoBuffer<int> _tab(size.width*esz);
-    int* tab = _tab.data();
-
-    for( i = 0; i < size.width; i++ )
-        for( size_t k = 0; k < esz; k++ )
-            tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
-
-    for( ; size.height--; src += sstep, dst += dstep )
+#if CV_SIMD
+    if (esz == 2 * v_uint8x16::nlanes)
     {
-        for( i = 0; i < limit; i++ )
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
         {
-            j = tab[i];
-            uchar t0 = src[i], t1 = src[j];
-            dst[i] = t1; dst[j] = t0;
+            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            {
+#if CV_SIMD256
+                v_uint8x32 t0, t1;
+
+                t0 = v256_load((uchar*)src + i);
+                t1 = v256_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+#else
+                v_uint8x16 t0, t1, t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t2 = v_load((uchar*)src + j);
+                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                v_store(dst + j, t0);
+                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + i, t2);
+                v_store(dst + i + v_uint8x16::nlanes, t3);
+#endif
+            }
+        }
+    }
+    else if (esz == v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            {
+                v_uint8x16 t0, t1;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+            }
+        }
+    }
+    else if (esz == 8)
+    {
+        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 4)
+    {
+        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 2)
+    {
+        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 1)
+    {
+        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 24)
+    {
+        int end = (int)(size.width*esz);
+        int width = (end + 1)/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            {
+                v_uint8x16 t0, t1;
+                uint64_t t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
+                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
+                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
+                v_store(dst + i, t1);
+                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+            }
+        }
+    }
+    else if (esz == 12)
+    {
+        flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 6)
+    {
+        flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 3)
+    {
+        flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
+    }
+    else
+#endif
+    {
+        int i, j, limit = (int)(((size.width + 1)/2)*esz);
+        AutoBuffer<int> _tab(size.width*esz);
+        int* tab = _tab.data();
+
+        for( i = 0; i < size.width; i++ )
+            for( size_t k = 0; k < esz; k++ )
+                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( i = 0; i < limit; i++ )
+            {
+                j = tab[i];
+                uchar t0 = src[i], t1 = src[j];
+                dst[i] = t1; dst[j] = t0;
+            }
         }
     }
 }
@@ -597,6 +778,16 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
                                                   dst0 += dstep, dst1 -= dstep )
     {
         int i = 0;
+#if CV_SIMD
+        for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
+        {
+            v_int32 t0 = vx_load((int*)(src0 + i));
+            v_int32 t1 = vx_load((int*)(src1 + i));
+            vx_store((int*)(dst0 + i), t1);
+            vx_store((int*)(dst1 + i), t0);
+        }
+#endif
+
         if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
         {
             for( ; i <= size.width - 16; i += 16 )