Merge pull request #21520 from alexgiving:atrutnev/simd_for_split4

GAPI FLUID: Enable dynamic dispatching for Split4

* Enable dynamic dispatching for split4
* Add tail processing for split3 and split4
2022-01-31 20:45:56 +03:00 · 2022-01-31 20:45:56 +03:00 · 245f6273bd
commit 245f6273bd
parent 870c8d3c4e
4 changed files with 74 additions and 23 deletions
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@ -2537,27 +2537,18 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
    static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
    {
-        const auto *in   =  src.InLine<uchar>(0);
+        const auto *in = src.InLine<uchar>(0);
-              auto *out1 = dst1.OutLine<uchar>();
+        auto *out1     = dst1.OutLine<uchar>();
-              auto *out2 = dst2.OutLine<uchar>();
+        auto *out2     = dst2.OutLine<uchar>();
-              auto *out3 = dst3.OutLine<uchar>();
+        auto *out3     = dst3.OutLine<uchar>();
-              auto *out4 = dst4.OutLine<uchar>();
+        auto *out4     = dst4.OutLine<uchar>();
        GAPI_Assert(4 == src.meta().chan);
        int width = src.length();
        int w = 0;
-        int w = 0; // cycle counter
+    #if CV_SIMD
-
+        w = split4_simd(in, out1, out2, out3, out4, width);
    #if CV_SIMD128
        for (; w <= width-16; w+=16)
        {
            v_uint8x16 a, b, c, d;
            v_load_deinterleave(&in[4*w], a, b, c, d);
            v_store(&out1[w], a);
            v_store(&out2[w], b);
            v_store(&out3[w], c);
            v_store(&out4[w], d);
        }
    #endif
        for (; w < width; w++)
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@ -214,6 +214,13 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                    CV_CPU_DISPATCH_MODES_ALL);
 }
 // Dispatching entry point for the 4-channel split helper.
 // Forwards all arguments unchanged to a split4_simd implementation through
 // CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL.
 // NOTE(review): the macro is presumably what selects the CPU-specific build
 // at run time and returns its result -- confirm against the OpenCV dispatch
 // macro definitions, which are not visible here.
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], uchar out4[], const int width)
 {
    CV_CPU_DISPATCH(split4_simd, (in, out1, out2, out3, out4, width),
                    CV_CPU_DISPATCH_MODES_ALL);
 }
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@ -166,6 +166,9 @@ ABSDIFFC_SIMD(float)
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], const int width);
 // SIMD helper for the Split4 kernel: reads from `in` and writes one output
 // row per channel (out1..out4). `width` is the row length in elements.
 // Returns the index from which the caller must continue with scalar code;
 // the Split4 kernel uses the result as the start of its remainder loop.
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], uchar out4[], const int width);
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@ -187,6 +187,9 @@ ABSDIFFC_SIMD(float)
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], const int width);
 // Forward declaration of the per-CPU-target Split4 SIMD routine; the
 // definition follows later in this file.
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], uchar out4[], const int width);
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 struct scale_tag {};
@ -1581,14 +1584,61 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], const int width)
 {
    constexpr int nlanes = v_uint8::nlanes;
    if (width < nlanes)
        return 0;
    int x = 0;
-    for (; x <= width - nlanes; x += nlanes)
+    for (;;)
    {
-        v_uint8 a, b, c;
+        for (; x <= width - nlanes; x += nlanes)
-        v_load_deinterleave(&in[3 * x], a, b, c);
+        {
-        vx_store(&out1[x], a);
+            v_uint8 a, b, c;
-        vx_store(&out2[x], b);
+            v_load_deinterleave(&in[3 * x], a, b, c);
-        vx_store(&out3[x], c);
+            vx_store(&out1[x], a);
            vx_store(&out2[x], b);
            vx_store(&out3[x], c);
        }
        if (x < width)
        {
            x = width - nlanes;
            continue;
        }
        break;
    }
    return x;
 }
 //-------------------------
 //
 // Fluid kernels: Split4
 //
 //-------------------------
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], uchar out4[], const int width)
 {
    constexpr int nlanes = v_uint8::nlanes;
    if (width < nlanes)
        return 0;
    int x = 0;
    for (;;)
    {
        for (; x <= width - nlanes; x += nlanes)
        {
            v_uint8 a, b, c, d;
            v_load_deinterleave(&in[4 * x], a, b, c, d);
            vx_store(&out1[x], a);
            vx_store(&out2[x], b);
            vx_store(&out3[x], c);
            vx_store(&out4[x], d);
        }
        if (x < width)
        {
            x = width - nlanes;
            continue;
        }
        break;
    }
    return x;
 }