diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp index a2805b35aa..bdd11b1214 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp @@ -25,6 +25,9 @@ #include "gfluidimgproc_func.hpp" +#if CV_AVX2 +#include "gfluidimgproc_simd_avx2.hpp" +#endif #if CV_SSE4_1 #include "gfluidcore_simd_sse41.hpp" #endif @@ -2132,11 +2135,25 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in, { auto index0 = mapsy[outY + l] - inY; auto index1 = mapsy[outSz.height + outY + l] - inY; + src0[l] = in.InLine(index0); src1[l] = in.InLine(index1); dst[l] = out.OutLine(l); } +#if CV_AVX2 + // number floats in AVX2 SIMD vector. + constexpr int nlanes = 8; + + if (inSz.width >= nlanes && outSz.width >= nlanes) + { + avx2::calcRowLinear32FC1Impl(dst, src0, src1, alpha, mapsx, beta, + inSz, outSz, lpi); + + return; + } +#endif // CV_AVX2 + using alpha_type = typename Mapper::alpha_type; for (int l = 0; l < lpi; ++l) { @@ -2150,6 +2167,7 @@ CV_ALWAYS_INLINE void calcRowLinear(const cv::gapi::fluid::View& in, auto alpha1 = saturate_cast(unity - alpha[x]); auto sx0 = mapsx[x]; auto sx1 = sx0 + 1; + float tmp0 = resize_main_calculation(b0, src0[l][sx0], b1, src1[l][sx0]); float tmp1 = resize_main_calculation(b0, src0[l][sx1], b1, src1[l][sx1]); dst[l][x] = resize_main_calculation(alpha0, tmp0, alpha1, tmp1); @@ -2174,6 +2192,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true) GAPI_Assert((in.depth == CV_8U && in.chan == 3) || (in.depth == CV_32F && in.chan == 1)); GAPI_Assert(interp == cv::INTER_LINEAR); + int outSz_w; int outSz_h; if (outSz.width == 0 || outSz.height == 0) @@ -2212,6 +2231,7 @@ GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::imgproc::GResize, true) GAPI_Assert((in.meta().depth == CV_8U && in.meta().chan == 3) || (in.meta().depth == CV_32F && in.meta().chan == 1)); GAPI_Assert(interp == cv::INTER_LINEAR); + const int channels = in.meta().chan; const int depth = in.meta().depth; diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp new file mode 100644 index 0000000000..e246f0613b --- /dev/null +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_simd_avx2.hpp @@ -0,0 +1,181 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2022 Intel Corporation + +#if !defined(GAPI_STANDALONE) + +#include "opencv2/gapi/own/saturate.hpp" + +#include + +#include "opencv2/core.hpp" + +#include + +#include +#include + +#include +#include +#include + +namespace cv { +namespace gapi { +namespace fluid { +namespace avx2 { + +CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int* mapsx, + v_float32x8& low, v_float32x8& high) +{ + low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast(&src[mapsx[0]]), + *reinterpret_cast(&src[mapsx[1]]), + *reinterpret_cast(&src[mapsx[2]]), + *reinterpret_cast(&src[mapsx[3]]))); + high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast(&src[mapsx[4]]), + *reinterpret_cast(&src[mapsx[5]]), + *reinterpret_cast(&src[mapsx[6]]), + *reinterpret_cast(&src[mapsx[7]]))); +} + +CV_ALWAYS_INLINE void v_deinterleave(const v_float32x8& low, const v_float32x8& high, + v_float32x8& even, v_float32x8& odd) +{ + __m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val); + __m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val); + __m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); + __m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); + even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/)); + odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/)); +} + +// Resize (bi-linear, 32FC1) +CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(float *dst[], + const float *src0[], + const float *src1[], + const float alpha[], + const int mapsx[], + const float beta[], + const Size& inSz, + const Size& outSz, + const int lpi) +{ + bool xRatioEq1 = inSz.width == outSz.width; + bool yRatioEq1 = inSz.height == outSz.height; + + constexpr int nlanes = v_float32x8::nlanes; + + if (!xRatioEq1 && !yRatioEq1) + { + for (int line = 0; line < lpi; ++line) { + float beta0 = beta[line]; + float beta1 = 1 - beta0; + v_float32x8 v_beta0 = v256_setall_f32(beta0); + int x = 0; + + v_float32x8 low1, high1, s00, s01; + v_float32x8 low2, high2, s10, s11; + for (; x <= outSz.width - nlanes; x += nlanes) + { + v_float32x8 alpha0 = v256_load(&alpha[x]); + // v_float32 alpha1 = 1.f - alpha0; + + v_gather_pairs(src0[line], &mapsx[x], low1, high1); + v_deinterleave(low1, high1, s00, s01); + + // v_float32 res0 = s00*alpha0 + s01*alpha1; + v_float32x8 res0 = v_fma(s00 - s01, alpha0, s01); + + v_gather_pairs(src1[line], &mapsx[x], low2, high2); + v_deinterleave(low2, high2, s10, s11); + + // v_float32 res1 = s10*alpha0 + s11*alpha1; + v_float32x8 res1 = v_fma(s10 - s11, alpha0, s11); + // v_float32 d = res0*beta0 + res1*beta1; + v_float32x8 d = v_fma(res0 - res1, v_beta0, res1); + + v_store(&dst[line][x], d); + } + + for (; x < outSz.width; ++x) + { + float alpha0 = alpha[x]; + float alpha1 = 1 - alpha0; + int sx0 = mapsx[x]; + int sx1 = sx0 + 1; + float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1; + float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1; + dst[line][x] = beta0 * res0 + beta1 * res1; + } + } + } + else if (!xRatioEq1) + { + + for (int line = 0; line < lpi; ++line) { + int x = 0; + + v_float32x8 low, high, s00, s01; + for (; x <= outSz.width - nlanes; x += nlanes) + { + v_float32x8 alpha0 = v256_load(&alpha[x]); + // v_float32 alpha1 = 1.f - alpha0; + + v_gather_pairs(src0[line], &mapsx[x], low, high); + v_deinterleave(low, high, s00, s01); + + // v_float32 d = s00*alpha0 + s01*alpha1; + v_float32x8 d = v_fma(s00 - s01, alpha0, s01); + + v_store(&dst[line][x], d); + } + + for (; x < outSz.width; ++x) { + float alpha0 = alpha[x]; + float alpha1 = 1 - alpha0; + int sx0 = mapsx[x]; + int sx1 = sx0 + 1; + dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1; + } + } + + } + else if (!yRatioEq1) + { + int length = inSz.width; // == outSz.width + + for (int line = 0; line < lpi; ++line) { + float beta0 = beta[line]; + float beta1 = 1 - beta0; + v_float32x8 v_beta0 = v256_setall_f32(beta0); + int x = 0; + + for (; x <= length - nlanes; x += nlanes) + { + v_float32x8 s0 = v256_load(&src0[line][x]); + v_float32x8 s1 = v256_load(&src1[line][x]); + + // v_float32 d = s0*beta0 + s1*beta1; + v_float32x8 d = v_fma(s0 - s1, v_beta0, s1); + + v_store(&dst[line][x], d); + } + + for (; x < length; ++x) { + dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x]; + } + } + + } + else + { + int length = inSz.width; // == outSz.width + memcpy(dst[0], src0[0], length * sizeof(float)*lpi); + } +} +} // namespace avx2 +} // namespace fliud +} // namespace gapi +} // namespace cv +#endif // !defined(GAPI_STANDALONE)