From 04b27525faa51941de054526c069bd3b7f9e4af3 Mon Sep 17 00:00:00 2001
From: Trutnev Aleksei
Date: Fri, 17 Dec 2021 16:42:47 +0300
Subject: [PATCH] Merge pull request #21231 from alexgiving:atrutnev/SIMD_SubRC_fluid

GAPI FLUID: SIMD for SubRC kernel

* SIMD for SubRC

* Reverse subrc
---
 modules/gapi/include/opencv2/gapi/core.hpp    |   2 +-
 .../gapi/src/backends/fluid/gfluidcore.cpp    | 190 +++++-------------
 .../fluid/gfluidcore_func.dispatch.cpp        |  27 +++
 .../src/backends/fluid/gfluidcore_func.hpp    |  27 ++-
 .../backends/fluid/gfluidcore_func.simd.hpp   |  70 ++++++-
 5 files changed, 171 insertions(+), 145 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
index 052c6a944c..791aa4ce56 100644
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -707,7 +707,7 @@ GAPI_EXPORTS GMat subC(const GMat& src, const GScalar& c, int ddepth = -1);
 /** @brief Calculates the per-element difference between given scalar and the matrix.
 
 The function can be replaced with matrix expressions:
-    \f[\texttt{dst} = \texttt{val} - \texttt{src}\f]
+    \f[\texttt{dst} = \texttt{c} - \texttt{src}\f]
 
 Depth of the output matrix is determined by the ddepth parameter.
 If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 8342a26d0d..5f2dbe37de 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -75,7 +75,7 @@ static inline DST sub(SRC1 x, SRC2 y)
 template<typename DST, typename SRC1, typename SRC2>
 static inline DST subr(SRC1 x, SRC2 y)
 {
-    return saturate<DST>(y - x, roundf); // reverse: y - x
+    return saturate<DST>(y - x, roundf); // reverse sub
 }
 
 template<typename DST, typename SRC1, typename SRC2>
@@ -844,110 +844,6 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
 //
 //--------------------------------------
 
-static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
-
-static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
-
-static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
-
-static inline float s_subr_32f(float x, float y) { return y - x; }
-
-// manual SIMD if important case 8UC3
-static void run_arithm_s3(uchar out[], const uchar in[], int width, const uchar scalar[],
-                          v_uint16x8 (*v_op)(const v_uint16x8&, const v_uint16x8&),
-                          int (*s_op)(uchar, uchar))
-{
-    int w = 0;
-
-#if CV_SIMD128
-    for (; w <= width-16; w+=16)
-    {
-        v_uint8x16 x, y, z;
-        v_load_deinterleave(&in[3*w], x, y, z);
-
-        v_uint16x8 r0, r1;
-
-        v_expand(x, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[0])); // x + scalar[0]
-        r1 = v_op(r1, v_setall_u16(scalar[0]));
-        x = v_pack(r0, r1);
-
-        v_expand(y, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[1])); // y + scalar[1]
-        r1 = v_op(r1, v_setall_u16(scalar[1]));
-        y = v_pack(r0, r1);
-
-        v_expand(z, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[2])); // z + scalar[2]
-        r1 = v_op(r1, v_setall_u16(scalar[2]));
-        z = v_pack(r0, r1);
-
-        v_store_interleave(&out[3*w], x, y, z);
-    }
-#endif
-    cv::util::suppress_unused_warning(v_op);
-    for (; w < width; w++)
-    {
-        out[3*w    ] = saturate<uchar>( s_op(in[3*w    ], scalar[0]) );
-        out[3*w + 1] = saturate<uchar>( s_op(in[3*w + 1], scalar[1]) );
-        out[3*w + 2] = saturate<uchar>( s_op(in[3*w + 2], scalar[2]) );
-    }
-}
-
-// manually SIMD if rounding 32F into 8U, single channel
-static void run_arithm_s1(uchar out[], const float in[], int width, const float scalar[],
-                          v_float32x4 (*v_op)(const v_float32x4&, const v_float32x4&),
-                          float (*s_op)(float, float))
-{
-    int w = 0;
-
-#if CV_SIMD128
-    for (; w <= width-16; w+=16)
-    {
-        v_float32x4 r0, r1, r2, r3;
-        r0 = v_load(&in[w     ]);
-        r1 = v_load(&in[w +  4]);
-        r2 = v_load(&in[w +  8]);
-        r3 = v_load(&in[w + 12]);
-
-        r0 = v_op(r0, v_setall_f32(scalar[0])); // r + scalar[0]
-        r1 = v_op(r1, v_setall_f32(scalar[0]));
-        r2 = v_op(r2, v_setall_f32(scalar[0]));
-        r3 = v_op(r3, v_setall_f32(scalar[0]));
-
-        v_int32x4 i0, i1, i2, i3;
-        i0 = v_round(r0);
-        i1 = v_round(r1);
-        i2 = v_round(r2);
-        i3 = v_round(r3);
-
-        v_uint16x8 us0, us1;
-        us0 = v_pack_u(i0, i1);
-        us1 = v_pack_u(i2, i3);
-
-        v_uint8x16 uc;
-        uc = v_pack(us0, us1);
-
-        v_store(&out[w], uc);
-    }
-#endif
-    cv::util::suppress_unused_warning(v_op);
-    for (; w < width; w++)
-    {
-        out[w] = saturate<uchar>(s_op(in[w], scalar[0]), roundf);
-    }
-}
-
-static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
-{
-    run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
-}
-
-static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
-{
-    run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
-}
-
 // manually unroll the inner cycle by channels
 template<typename DST, typename SRC>
 static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
@@ -1076,32 +972,20 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
 
     int width  = dst.length();
     int chan   = dst.meta().chan;
-
-    // What if we cast the scalar into the SRC type?
-    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
-                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
-    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
-                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
+    const int length = width * chan;
 
     switch (arithm)
     {
     case ARITHM_SUBTRACT:
-        if (usemyscal)
-        {
-            if (std::is_same<DST,uchar>::value &&
-                std::is_same<SRC,uchar>::value &&
-                chan == 3)
-                run_arithm_s_subr3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
-            else if (std::is_same<DST,uchar>::value &&
-                     std::is_same<SRC,float>::value &&
-                     chan == 1)
-                run_arithm_s_subr1((uchar*)out, (const float*)in, width, (const float*)myscal);
-            else
-                run_arithm_s(out, in, width, chan, myscal, subr<DST,SRC,SRC>);
-        }
-        else
-            run_arithm_s(out, in, width, chan, scalar, subr<DST,SRC,float>);
+    {
+        int w = 0;
+#if CV_SIMD
+        w = subrc_simd(scalar, in, out, length, chan);
+#endif
+        for (; w < length; ++w)
+            out[w] = subr<DST>(in[w], scalar[w % chan]);
         break;
+    }
     // TODO: optimize division
     case ARITHM_DIVIDE:
         for (int w=0; w < width; w++)
@@ -1274,30 +1158,54 @@ GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
     }
 };
 
-GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
+GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, true)
 {
     static const int Window = 1;
 
-    static void run(const cv::Scalar &_scalar, const View &src, int /*dtype*/, Buffer &dst)
+    static void run(const cv::Scalar& _scalar, const View& src, int /*dtype*/, Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();
 
         //     DST     SRC     OP              __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( short, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GScalarDesc&, const GMatDesc&, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index ab6b013694..348c00ed12 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -138,6 +138,33 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan)                          \
+{                                                                         \
+    CV_CPU_DISPATCH(subrc_simd, (scalar, in, out, length, chan),          \
+                    CV_CPU_DISPATCH_MODES_ALL);                           \
+}
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
 #define MULC_SIMD(SRC, DST)                                               \
 int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
               const int length, const int chan, const float scale)        \
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 522d7b8b44..6023a879d9 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -106,8 +106,31 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
-#define MULC_SIMD(SRC, DST)                                               \
-int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan);
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
+#define MULC_SIMD(SRC, DST)                                               \
+int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
               const int length, const int chan, const float scale);
 
 MULC_SIMD(uchar, uchar)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 12b74f8f67..38c47072f4 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -127,6 +127,28 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan);
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
 
 #define MULC_SIMD(SRC, DST)                                               \
 int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
@@ -905,12 +927,13 @@ MUL_SIMD(float, float)
 
 //-------------------------
 //
-// Fluid kernels: AddC, SubC
+// Fluid kernels: AddC, SubC, SubRC
 //
 //-------------------------
 
 struct add_tag {};
 struct sub_tag {};
+struct subr_tag {};
 struct mul_tag {};
 struct absdiff_tag {};
@@ -946,6 +969,11 @@ CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc
     return a - sc;
 }
 
+CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc)
+{
+    return sc - a;
+}
+
 CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
 {
     return a * sc;
@@ -1218,6 +1246,46 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+//-------------------------------------------------------------------------------------------------
+
+#define SUBRC_SIMD(SRC, DST)                                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],                           \
+               const int length, const int chan)                                          \
+{                                                                                         \
+    switch (chan)                                                                         \
+    {                                                                                     \
+    case 1:                                                                               \
+    case 2:                                                                               \
+    case 4:                                                                               \
+        return arithmOpScalar_simd_common(subr_tag{}, in, scalar, out, length);           \
+    case 3:                                                                               \
+        return arithmOpScalar_simd_c3(subr_tag{}, in, scalar, out, length);               \
+    default:                                                                              \
+        GAPI_Assert(chan <= 4);                                                           \
+        break;                                                                            \
+    }                                                                                     \
+    return 0;                                                                             \
+}
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
 //-------------------------
 //
 // Fluid kernels: MulC
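
Note for reviewers: the snippet below is a minimal, self-contained sketch of how the new fluid SubRC kernel is exercised from user code. It relies only on public G-API entry points (cv::gapi::subRC and cv::gapi::core::fluid::kernels()); the image size, scalar values, and the final cross-check against cv::subtract are arbitrary illustration choices, not part of this patch.

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/fluid/core.hpp>

int main()
{
    // Build a one-node graph: dst = c - src (GSubRC).
    cv::GMat in;
    cv::GScalar c;
    cv::GMat out = cv::gapi::subRC(c, in);
    cv::GComputation graph(cv::GIn(in, c), cv::GOut(out));

    cv::Mat src(64, 64, CV_8UC3, cv::Scalar(1, 2, 3)); // arbitrary test input
    cv::Scalar scalar(127.0, 10.0, 255.0);             // arbitrary per-channel scalar
    cv::Mat dst;

    // Selecting the fluid kernel package routes GSubRC to GFluidSubRC, which
    // calls subrc_simd() on the vector path and the scalar tail loop otherwise.
    graph.apply(cv::gin(src, scalar), cv::gout(dst),
                cv::compile_args(cv::gapi::core::fluid::kernels()));

    // Cross-check against OpenCV's scalar-minus-matrix subtraction.
    cv::Mat ref;
    cv::subtract(scalar, src, ref);
    CV_Assert(cv::norm(ref, dst, cv::NORM_INF) == 0);
    return 0;
}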
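As a reading aid for run_arithm_rs: the new fallback loop (out[w] = subr<DST>(in[w], scalar[w % chan])) and the vector path subrc_simd() implement the same per-element rule. The plain C++ sketch below spells that rule out; the helper name and the use of cv::saturate_cast are illustrative stand-ins for the kernel's own saturate<DST>(..., roundf) and are not code from this patch.

#include <opencv2/core.hpp>

// Illustrative reference semantics of SubRC on one row of interleaved data:
// dst[w] = saturate(scalar[w % chan] - src[w]), i.e. the scalar is broadcast
// per channel and the subtraction is reversed compared to SubC.
template <typename DST, typename SRC>
static void subrc_reference(const float scalar[], const SRC in[], DST out[],
                            int length, int chan)
{
    for (int w = 0; w < length; ++w)
        out[w] = cv::saturate_cast<DST>(scalar[w % chan] - in[w]);
}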