diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
index a01a8902b6..b1338ddf72 100644
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -298,8 +298,8 @@ namespace core {
         }
     };
 
-    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {
-        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {
+        static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) {
             return a;
         }
     };
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index 874b75b679..6be1e1a8c4 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -147,7 +147,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest,
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest,
     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+        Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
+               CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
+               CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
         Values(cv::compile_args(CORE_FLUID))));
 
 // INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest,
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index f885f8db18..e1e9332d5e 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -97,7 +97,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 //---------------------------
-#if CV_SSE2
+#if CV_SIMD
 CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
 }
@@ -112,7 +112,9 @@
 CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
 }
+#endif
+
+#if CV_SSE2
 CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
 {
     vx_store(out, v_pack(c1, c2));
 }
@@ -972,6 +974,262 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
         CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
 }
 
+#if CV_SIMD
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out_ptr, v_pack(c1, c2));
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out_ptr, v_pack_u(c1, c2));
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],
+                                          const v_float32& s, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                              static_cast<int>(v_int16::nlanes);
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
+
+            absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),
+                                                 v_round(v_absdiff(a2, s)));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],
+                                                 const v_float32& s, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 4);
+            v_float32 a3 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
+
+            vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),
+                                              v_round(v_absdiff(a2, s))),
+                                       v_pack(v_round(v_absdiff(a3, s)),
+                                              v_round(v_absdiff(a4, s)))));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_int16::nlanes);
+    vx_store(out_ptr, v_pack(c1, c2));
+    vx_store(out_ptr + nlanes, v_pack(c3, c4));
+    vx_store(out_ptr + 2*nlanes, v_pack(c5, c6));
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_uint16::nlanes);
+    vx_store(out_ptr, v_pack_u(c1, c2));
+    vx_store(out_ptr + nlanes, v_pack_u(c3, c4));
+    vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6));
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],
+                                           const v_float32& s1, const v_float32& s2,
+                                           const v_float32& s3, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                              static_cast<int>(v_int16::nlanes);
+
+    if (length < 3 * nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a3 = v_load_f32(in + x + nlanes);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
+            v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
+            v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
+
+            absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
+                                             v_round(v_absdiff(a2, s2)),
+                                             v_round(v_absdiff(a3, s3)),
+                                             v_round(v_absdiff(a4, s1)),
+                                             v_round(v_absdiff(a5, s2)),
+                                             v_round(v_absdiff(a6, s3)));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - 3 * nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+
+    if (length < 3 * nlanes)
+        return 0;
+
+    int x = 0;
+
+    for (;;)
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            vx_store(&out[x],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1)))));
+
+            vx_store(&out[x + nlanes],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2)))));
+
+            vx_store(&out[x + 2 * nlanes],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3)))));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - 3 * nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],
+                                            const int width, int chan)
+{
+    int length = width * chan;
+    v_float32 s = vx_load(scalar);
+
+    return absdiffc_simd_c1c2c4(in, out, s, length);
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)
+{
+    constexpr int chan = 3;
+    int length = width * chan;
+
+    // scalar[] holds the per-channel values replicated: {c0, c1, c2, c0, c1, c2, ...}.
+    // Each v_float32 covers v_float32::nlanes of these interleaved values, so consecutive
+    // vectors start on a different channel. With 256-bit registers (8 floats per vector)
+    // the 2nd and 3rd vectors start on channels 2 and 1, hence the swapped load offsets.
+    v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(scalar + 2);
+    v_float32 s3 = vx_load(scalar + 1);
+#else
+    v_float32 s2 = vx_load(scalar + 1);
+    v_float32 s3 = vx_load(scalar + 2);
+#endif
+
+    return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)
+{
+    switch (chan)
+    {
+    case 1:
+    case 2:
+    case 4:
+        return absdiffc_simd_channels(in, scalar, out, width, chan);
+    case 3:
+        return absdiffc_simd_c3(in, scalar, out, width);
+    default:
+        break;
+    }
+
+    return 0;
+}
+#endif  // CV_SIMD
+
+template<typename DST, typename SRC>
+static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
+{
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int w = 0;
+#if CV_SIMD
+    w = absdiffc_simd(in, scalar, out, width, chan);
+#endif
+
+    for (; w < width*chan; ++w)
+        out[w] = absdiff<DST>(in[w], scalar[w%chan]);
+}
+
 template<typename DST, typename SRC>
 static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm, float scale=1)
@@ -990,11 +1248,6 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm, float scale=1)
 
     switch (arithm)
     {
-    case ARITHM_ABSDIFF:
-        for (int w=0; w < width; w++)
-            for (int c=0; c < chan; c++)
-                out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
-        break;
     case ARITHM_ADD:
         if (usemyscal)
         {
@@ -1089,26 +1342,47 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], Arithm arithm, float scale=1)
     }
 }
 
-GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
+GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
 {
     static const int Window = 1;
 
-    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
+    static void run(const View &src, const cv::Scalar& _scalar, Buffer &dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        // Fill the scratch buffer once per frame (on the first output line) with the
+        // scalar replicated per channel, so the SIMD and scalar paths can index it directly.
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();
 
         //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+        UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar);
+        UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
+        UNARY_(short, short, run_absdiffc, dst, src, scalar);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch)
+    {
+#if CV_SIMD
+        // One full v_float32 load plus 2 extra floats, so the 3-channel path can also
+        // load the replicated scalar at offsets +1 and +2.
+        constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2;
+#else
+        constexpr int buflen = 4;
+#endif
+        cv::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = { CV_32F, 1, bufsize };
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
diff --git a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
index 87e447aae6..4434f6ebe2 100644
--- a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
+++ b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
@@ -105,7 +105,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffTestFluid, AbsDiffTest,
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCTestFluid, AbsDiffCTest,
-                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
+                                       CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
+                                       CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
                                 Values(cv::Size(1280, 720),
                                        cv::Size(640, 480),
                                        cv::Size(128, 128),