diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index d701a2ac24..403bcf252d 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -2537,27 +2537,18 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
     static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3,
                                                                   Buffer &dst4)
     {
-        const auto *in  =  src.InLine<uchar>(0);
-              auto *out1 = dst1.OutLine<uchar>();
-              auto *out2 = dst2.OutLine<uchar>();
-              auto *out3 = dst3.OutLine<uchar>();
-              auto *out4 = dst4.OutLine<uchar>();
+        const auto *in = src.InLine<uchar>(0);
+        auto *out1 = dst1.OutLine<uchar>();
+        auto *out2 = dst2.OutLine<uchar>();
+        auto *out3 = dst3.OutLine<uchar>();
+        auto *out4 = dst4.OutLine<uchar>();
 
         GAPI_Assert(4 == src.meta().chan);
         int width = src.length();
+        int w = 0;
 
-        int w = 0; // cycle counter
-
-    #if CV_SIMD128
-        for (; w <= width-16; w+=16)
-        {
-            v_uint8x16 a, b, c, d;
-            v_load_deinterleave(&in[4*w], a, b, c, d);
-            v_store(&out1[w], a);
-            v_store(&out2[w], b);
-            v_store(&out3[w], c);
-            v_store(&out4[w], d);
-        }
+    #if CV_SIMD
+        w = split4_simd(in, out1, out2, out3, out4, width);
     #endif
 
         for (; w < width; w++)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 8ba99bae5e..30e3d1f5ea 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -214,6 +214,13 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                     CV_CPU_DISPATCH_MODES_ALL);
 }
 
+int split4_simd(const uchar in[], uchar out1[], uchar out2[],
+                uchar out3[], uchar out4[], const int width)
+{
+    CV_CPU_DISPATCH(split4_simd, (in, out1, out2, out3, out4, width),
+                    CV_CPU_DISPATCH_MODES_ALL);
+}
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index f0a16e8829..e0fdf812f2 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -166,6 +166,9 @@ ABSDIFFC_SIMD(float)
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width);
 
+int split4_simd(const uchar in[], uchar out1[], uchar out2[],
+                uchar out3[], uchar out4[], const int width);
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 2f41aa46ea..9f7886f9b0 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -187,6 +187,9 @@ ABSDIFFC_SIMD(float)
 int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width);
 
+int split4_simd(const uchar in[], uchar out1[], uchar out2[],
+                uchar out3[], uchar out4[], const int width);
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 struct scale_tag {};
@@ -1581,14 +1584,61 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], const int width)
 {
     constexpr int nlanes = v_uint8::nlanes;
+    if (width < nlanes)
+        return 0;
+
     int x = 0;
-    for (; x <= width - nlanes; x += nlanes)
+    for (;;)
     {
-        v_uint8 a, b, c;
-        v_load_deinterleave(&in[3 * x], a, b, c);
-        vx_store(&out1[x], a);
-        vx_store(&out2[x], b);
-        vx_store(&out3[x], c);
+        for (; x <= width - nlanes; x += nlanes)
+        {
+            v_uint8 a, b, c;
+            v_load_deinterleave(&in[3 * x], a, b, c);
+            vx_store(&out1[x], a);
+            vx_store(&out2[x], b);
+            vx_store(&out3[x], c);
+        }
+        if (x < width)
+        {
+            x = width - nlanes;
+            continue;
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------
+//
+// Fluid kernels: Split4
+//
+//-------------------------
+
+int split4_simd(const uchar in[], uchar out1[], uchar out2[],
+                uchar out3[], uchar out4[], const int width)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+    if (width < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= width - nlanes; x += nlanes)
+        {
+            v_uint8 a, b, c, d;
+            v_load_deinterleave(&in[4 * x], a, b, c, d);
+            vx_store(&out1[x], a);
+            vx_store(&out2[x], b);
+            vx_store(&out3[x], c);
+            vx_store(&out4[x], d);
+        }
+        if (x < width)
+        {
+            x = width - nlanes;
+            continue;
+        }
+        break;
     }
     return x;
 }
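A note on the loop shape shared by split3_simd and split4_simd above: the row is processed in full nlanes-wide chunks, and if a tail shorter than nlanes remains, x is stepped back to width - nlanes and one final, overlapping chunk is redone. Rewriting a few already-written output bytes is harmless here because every store depends only on the input row, and the payoff is that the function returns width whenever width >= nlanes, so the caller's scalar fallback loop (for (; w < width; w++)) has nothing left to do. Below is a minimal standalone sketch of just that control flow, with CHUNK standing in for v_uint8::nlanes and process_chunk() standing in for the v_load_deinterleave/vx_store body; these names are illustrative only and not part of the patch.

    #include <cstdio>

    constexpr int CHUNK = 16;                    // stand-in for v_uint8::nlanes

    // Stand-in for the deinterleave/store body of split4_simd.
    static void process_chunk(int x)
    {
        std::printf("chunk covers [%d, %d)\n", x, x + CHUNK);
    }

    // Returns how many elements were covered: 0 (caller falls back to the
    // scalar loop) or width (the vector path handled the row, tail included).
    static int split_like_loop(int width)
    {
        if (width < CHUNK)
            return 0;

        int x = 0;
        for (;;)
        {
            for (; x <= width - CHUNK; x += CHUNK)
                process_chunk(x);
            if (x < width)                       // tail shorter than CHUNK left
            {
                x = width - CHUNK;               // overlap the previous chunk
                continue;                        // redo exactly one final chunk
            }
            break;
        }
        return x;
    }

    int main()
    {
        split_like_loop(40);   // chunks at 0 and 16, then an overlapping one at 24
        return 0;
    }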