From 04b27525faa51941de054526c069bd3b7f9e4af3 Mon Sep 17 00:00:00 2001
From: Trutnev Aleksei
Date: Fri, 17 Dec 2021 16:42:47 +0300
Subject: [PATCH] Merge pull request #21231 from alexgiving:atrutnev/SIMD_SubRC_fluid

GAPI FLUID: SIMD for SubRC kernel

* SIMD for SubRC

* Reverse subrc
---
 modules/gapi/include/opencv2/gapi/core.hpp    |   2 +-
 .../gapi/src/backends/fluid/gfluidcore.cpp    | 190 +++++-------------
 .../fluid/gfluidcore_func.dispatch.cpp        |  27 +++
 .../src/backends/fluid/gfluidcore_func.hpp    |  27 ++-
 .../backends/fluid/gfluidcore_func.simd.hpp   |  70 ++++++-
 5 files changed, 171 insertions(+), 145 deletions(-)

diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
index 052c6a944c..791aa4ce56 100644
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -707,7 +707,7 @@ GAPI_EXPORTS GMat subC(const GMat& src, const GScalar& c, int ddepth = -1);
 /** @brief Calculates the per-element difference between given scalar and the matrix.
 
 The function can be replaced with matrix expressions:
-    \f[\texttt{dst} = \texttt{val} - \texttt{src}\f]
+    \f[\texttt{dst} = \texttt{c} - \texttt{src}\f]
 
 Depth of the output matrix is determined by the ddepth parameter.
 If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 8342a26d0d..5f2dbe37de 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -75,7 +75,7 @@ static inline DST sub(SRC1 x, SRC2 y)
 template<typename DST, typename SRC1, typename SRC2>
 static inline DST subr(SRC1 x, SRC2 y)
 {
-    return saturate<DST>(y - x, roundf); // reverse: y - x
+    return saturate<DST>(y - x, roundf); // reverse sub
 }
 
 template<typename DST, typename SRC1, typename SRC2>
@@ -844,110 +844,6 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
 //
 //--------------------------------------
 
-static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
-
-static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
-
-static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
-
-static inline float s_subr_32f(float x, float y) { return y - x; }
-
-// manual SIMD if important case 8UC3
-static void run_arithm_s3(uchar out[], const uchar in[], int width, const uchar scalar[],
-                          v_uint16x8 (*v_op)(const v_uint16x8&, const v_uint16x8&),
-                          int (*s_op)(uchar, uchar))
-{
-    int w = 0;
-
-#if CV_SIMD128
-    for (; w <= width-16; w+=16)
-    {
-        v_uint8x16 x, y, z;
-        v_load_deinterleave(&in[3*w], x, y, z);
-
-        v_uint16x8 r0, r1;
-
-        v_expand(x, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[0])); // x + scalar[0]
-        r1 = v_op(r1, v_setall_u16(scalar[0]));
-        x = v_pack(r0, r1);
-
-        v_expand(y, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[1])); // y + scalar[1]
-        r1 = v_op(r1, v_setall_u16(scalar[1]));
-        y = v_pack(r0, r1);
-
-        v_expand(z, r0, r1);
-        r0 = v_op(r0, v_setall_u16(scalar[2])); // z + scalar[2]
-        r1 = v_op(r1, v_setall_u16(scalar[2]));
-        z = v_pack(r0, r1);
-
-        v_store_interleave(&out[3*w], x, y, z);
-    }
-#endif
-    cv::util::suppress_unused_warning(v_op);
-    for (; w < width; w++)
-    {
-        out[3*w    ] = saturate<uchar>( s_op(in[3*w    ], scalar[0]) );
-        out[3*w + 1] = saturate<uchar>( s_op(in[3*w + 1], scalar[1]) );
-        out[3*w + 2] = saturate<uchar>( s_op(in[3*w + 2], scalar[2]) );
-    }
-}
-
-// manually SIMD if rounding 32F into 8U, single channel
-static void run_arithm_s1(uchar out[], const float in[], int width, const float scalar[],
-                          v_float32x4 (*v_op)(const v_float32x4&, const v_float32x4&),
-                          float (*s_op)(float, float))
-{
-    int w = 0;
-
-#if CV_SIMD128
-    for (; w <= width-16; w+=16)
-    {
-        v_float32x4 r0, r1, r2, r3;
-        r0 = v_load(&in[w     ]);
-        r1 = v_load(&in[w +  4]);
-        r2 = v_load(&in[w +  8]);
-        r3 = v_load(&in[w + 12]);
-
-        r0 = v_op(r0, v_setall_f32(scalar[0])); // r + scalar[0]
-        r1 = v_op(r1, v_setall_f32(scalar[0]));
-        r2 = v_op(r2, v_setall_f32(scalar[0]));
-        r3 = v_op(r3, v_setall_f32(scalar[0]));
-
-        v_int32x4 i0, i1, i2, i3;
-        i0 = v_round(r0);
-        i1 = v_round(r1);
-        i2 = v_round(r2);
-        i3 = v_round(r3);
-
-        v_uint16x8 us0, us1;
-        us0 = v_pack_u(i0, i1);
-        us1 = v_pack_u(i2, i3);
-
-        v_uint8x16 uc;
-        uc = v_pack(us0, us1);
-
-        v_store(&out[w], uc);
-    }
-#endif
-    cv::util::suppress_unused_warning(v_op);
-    for (; w < width; w++)
-    {
-        out[w] = saturate<uchar>(s_op(in[w], scalar[0]), roundf);
-    }
-}
-
-static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
-{
-    run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
-}
-
-static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
-{
-    run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
-}
-
 // manually unroll the inner cycle by channels
 template<typename DST, typename SRC>
 static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
@@ -1076,32 +972,20 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
 
     int width  = dst.length();
     int chan   = dst.meta().chan;
-
-    // What if we cast the scalar into the SRC type?
-    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
-                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
-    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
-                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
+    const int length = width * chan;
 
     switch (arithm)
     {
     case ARITHM_SUBTRACT:
-        if (usemyscal)
-        {
-            if (std::is_same<DST,uchar>::value &&
-                std::is_same<SRC,uchar>::value &&
-                chan == 3)
-                run_arithm_s_subr3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
-            else if (std::is_same<DST,uchar>::value &&
-                     std::is_same<SRC,float>::value &&
-                     chan == 1)
-                run_arithm_s_subr1((uchar*)out, (const float*)in, width, (const float*)myscal);
-            else
-                run_arithm_s(out, in, width, chan, myscal, subr<DST,SRC,SRC>);
-        }
-        else
-            run_arithm_s(out, in, width, chan, scalar, subr<DST,SRC,float>);
+    {
+        int w = 0;
+#if CV_SIMD
+        w = subrc_simd(scalar, in, out, length, chan);
+#endif
+        for (; w < length; ++w)
+            out[w] = subr<DST>(in[w], scalar[w % chan]);
         break;
+    }
     // TODO: optimize division
     case ARITHM_DIVIDE:
         for (int w=0; w < width; w++)
@@ -1274,30 +1158,54 @@ GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
     }
 };
 
-GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
+GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, true)
 {
     static const int Window = 1;
 
-    static void run(const cv::Scalar &_scalar, const View &src, int /*dtype*/, Buffer &dst)
+    static void run(const cv::Scalar& _scalar, const View& src, int /*dtype*/, Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();
 
         //     DST     SRC     OP              __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( short, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, uchar, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, ushort, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float, float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GScalarDesc&, const GMatDesc&, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, true)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index ab6b013694..348c00ed12 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -138,6 +138,33 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan)                          \
+{                                                                         \
+    CV_CPU_DISPATCH(subrc_simd, (scalar, in, out, length, chan),          \
+                    CV_CPU_DISPATCH_MODES_ALL);                           \
+}
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
 #define MULC_SIMD(SRC, DST)                                               \
 int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
               const int length, const int chan, const float scale)        \
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 522d7b8b44..6023a879d9 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -106,8 +106,31 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
-#define MULC_SIMD(SRC, DST)                                               \
-int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan);
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
+#define MULC_SIMD(SRC, DST)                                               \
+int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
               const int length, const int chan, const float scale);
 
 MULC_SIMD(uchar, uchar)
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 12b74f8f67..38c47072f4 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -127,6 +127,28 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+#define SUBRC_SIMD(SRC, DST)                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan);
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
 
 #define MULC_SIMD(SRC, DST)                                               \
 int mulc_simd(const SRC in[], const float scalar[], DST out[],            \
@@ -905,12 +927,13 @@ MUL_SIMD(float, float)
 
 //-------------------------
 //
-// Fluid kernels: AddC, SubC
+// Fluid kernels: AddC, SubC, SubRC
 //
 //-------------------------
 
 struct add_tag {};
 struct sub_tag {};
+struct subr_tag {};
 struct mul_tag {};
 struct absdiff_tag {};
@@ -946,6 +969,11 @@ CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc
     return a - sc;
 }
 
+CV_ALWAYS_INLINE v_float32 oper(subr_tag, const v_float32& a, const v_float32& sc)
+{
+    return sc - a;
+}
+
 CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
 {
     return a * sc;
@@ -1218,6 +1246,46 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
+//-------------------------------------------------------------------------------------------------
+
+#define SUBRC_SIMD(SRC, DST)                                                              \
+int subrc_simd(const float scalar[], const SRC in[], DST out[],                           \
+               const int length, const int chan)                                          \
+{                                                                                         \
+    switch (chan)                                                                         \
+    {                                                                                     \
+    case 1:                                                                               \
+    case 2:                                                                               \
+    case 4:                                                                               \
+        return arithmOpScalar_simd_common(subr_tag{}, in, scalar, out, length);           \
+    case 3:                                                                               \
+        return arithmOpScalar_simd_c3(subr_tag{}, in, scalar, out, length);               \
+    default:                                                                              \
+        GAPI_Assert(chan <= 4);                                                           \
+        break;                                                                            \
+    }                                                                                     \
+    return 0;                                                                             \
+}
+
+SUBRC_SIMD(uchar, uchar)
+SUBRC_SIMD(ushort, uchar)
+SUBRC_SIMD(short, uchar)
+SUBRC_SIMD(float, uchar)
+SUBRC_SIMD(short, short)
+SUBRC_SIMD(ushort, short)
+SUBRC_SIMD(uchar, short)
+SUBRC_SIMD(float, short)
+SUBRC_SIMD(ushort, ushort)
+SUBRC_SIMD(uchar, ushort)
+SUBRC_SIMD(short, ushort)
+SUBRC_SIMD(float, ushort)
+SUBRC_SIMD(uchar, float)
+SUBRC_SIMD(ushort, float)
+SUBRC_SIMD(short, float)
+SUBRC_SIMD(float, float)
+
+#undef SUBRC_SIMD
+
 //-------------------------
 //
 // Fluid kernels: MulC
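
Note for reviewers: the snippet below is a minimal, self-contained sketch of how the new fluid SubRC kernel is exercised from user code. It relies only on public G-API entry points (cv::gapi::subRC and cv::gapi::core::fluid::kernels()); the image size, scalar values, and the final cross-check against cv::subtract are arbitrary illustration choices, not part of this patch.

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/fluid/core.hpp>

int main()
{
    // Build a one-node graph: dst = c - src (GSubRC).
    cv::GMat in;
    cv::GScalar c;
    cv::GMat out = cv::gapi::subRC(c, in);
    cv::GComputation graph(cv::GIn(in, c), cv::GOut(out));

    cv::Mat src(64, 64, CV_8UC3, cv::Scalar(1, 2, 3)); // arbitrary test input
    cv::Scalar scalar(127.0, 10.0, 255.0);             // arbitrary per-channel scalar
    cv::Mat dst;

    // Selecting the fluid kernel package routes GSubRC to GFluidSubRC, which
    // calls subrc_simd() on the vector path and the scalar tail loop otherwise.
    graph.apply(cv::gin(src, scalar), cv::gout(dst),
                cv::compile_args(cv::gapi::core::fluid::kernels()));

    // Cross-check against OpenCV's scalar-minus-matrix subtraction.
    cv::Mat ref;
    cv::subtract(scalar, src, ref);
    CV_Assert(cv::norm(ref, dst, cv::NORM_INF) == 0);
    return 0;
}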
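As a reading aid for run_arithm_rs: the new fallback loop (out[w] = subr<DST>(in[w], scalar[w % chan])) and the vector path subrc_simd() implement the same per-element rule. The plain C++ sketch below spells that rule out; the helper name and the use of cv::saturate_cast are illustrative stand-ins for the kernel's own saturate<DST>(..., roundf) and are not code from this patch.

#include <opencv2/core.hpp>

// Illustrative reference semantics of SubRC on one row of interleaved data:
// dst[w] = saturate(scalar[w % chan] - src[w]), i.e. the scalar is broadcast
// per channel and the subtraction is reversed compared to SubC.
template <typename DST, typename SRC>
static void subrc_reference(const float scalar[], const SRC in[], DST out[],
                            int length, int chan)
{
    for (int w = 0; w < length; ++w)
        out[w] = cv::saturate_cast<DST>(scalar[w % chan] - in[w]);
}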