diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
index b7f65b6719..e6077dfcef 100644
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -398,7 +398,7 @@ namespace core {
     };

     G_TYPED_KERNEL(GResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.core.transform.resize") {
-        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int) {
+        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int /*interp*/) {
             if (sz.width != 0 && sz.height != 0)
             {
                 return in.withSize(sz);
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 3fd543baed..eb6d99d92e 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -81,7 +81,9 @@ namespace opencv_test
                                                cv::GCompileArgs>> {};
     class TransposePerfTest : public TestPerfParams<tuple<compare_f, MatType, cv::Size, cv::GCompileArgs>> {};
     class ResizePerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, cv::Size, cv::GCompileArgs>> {};
+    class BottleneckKernelsConstInputPerfTest : public TestPerfParams<tuple<compare_f, std::string, cv::GCompileArgs>> {};
     class ResizeFxFyPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, double, double, cv::GCompileArgs>> {};
+    class ResizeInSimpleGraphPerfTest : public TestPerfParams<tuple<compare_f, MatType, cv::Size, cv::GCompileArgs>> {};
     class ParseSSDBLPerfTest : public TestPerfParams>, public ParserSSDTest {};
     class ParseSSDPerfTest : public TestPerfParams>, public ParserSSDTest {};
     class ParseYoloPerfTest : public TestPerfParams>, public ParserYoloTest {};
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index 6dfc0b2e2f..f8f309a90b 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -2151,6 +2151,89 @@ PERF_TEST_P_(ResizeFxFyPerfTest, TestPerformance)
     {
         cc(gin(in_mat1), gout(out_mat_gapi));
     }

+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+// These test cases were created to track the performance of the scenario mentioned here:
+// https://stackoverflow.com/questions/60629331/opencv-gapi-performance-not-good-as-expected
+
+PERF_TEST_P_(BottleneckKernelsConstInputPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    std::string fileName = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    in_mat1 = cv::imread(findDataFile(fileName));
+
+    cv::Mat cvvga;
+    cv::Mat cvgray;
+    cv::Mat cvblurred;
+
+    cv::resize(in_mat1, cvvga, cv::Size(), 0.5, 0.5);
+    cv::cvtColor(cvvga, cvgray, cv::COLOR_BGR2GRAY);
+    cv::blur(cvgray, cvblurred, cv::Size(3, 3));
+    cv::Canny(cvblurred, out_mat_ocv, 32, 128, 3);
+
+    cv::GMat in;
+    cv::GMat vga = cv::gapi::resize(in, cv::Size(), 0.5, 0.5, INTER_LINEAR);
+    cv::GMat gray = cv::gapi::BGR2Gray(vga);
+    cv::GMat blurred = cv::gapi::blur(gray, cv::Size(3, 3));
+    cv::GMat out = cv::gapi::Canny(blurred, 32, 128, 3);
+    cv::GComputation ac(in, out);
+
+    auto cc = ac.compile(descr_of(gin(in_mat1)),
+                         std::move(compile_args));
+    cc(gin(in_mat1), gout(out_mat_gapi));
+
+    TEST_CYCLE()
+    {
+        cc(gin(in_mat1), gout(out_mat_gapi));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ResizeInSimpleGraphPerfTest, TestPerformance)
+{
+    
compare_f cmpF = get<0>(GetParam()); + MatType type = get<1>(GetParam()); + cv::Size sz_in = get<2>(GetParam()); + cv::GCompileArgs compile_args = get<3>(GetParam()); + + initMatsRandU(type, sz_in, type, false); + + cv::Mat add_res_ocv; + + cv::add(in_mat1, in_mat2, add_res_ocv); + cv::resize(add_res_ocv, out_mat_ocv, cv::Size(), 0.5, 0.5); + + cv::GMat in1, in2; + cv::GMat add_res_gapi = cv::gapi::add(in1, in2); + cv::GMat out = cv::gapi::resize(add_res_gapi, cv::Size(), 0.5, 0.5, INTER_LINEAR); + cv::GComputation ac(GIn(in1, in2), GOut(out)); + + auto cc = ac.compile(descr_of(gin(in_mat1, in_mat2)), + std::move(compile_args)); + cc(gin(in_mat1, in_mat2), gout(out_mat_gapi)); + + TEST_CYCLE() + { + cc(gin(in_mat1, in_mat2), gout(out_mat_gapi)); + } // Comparison //////////////////////////////////////////////////////////// { diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 871d41792b..121b8acf7f 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -321,16 +321,28 @@ INSTANTIATE_TEST_CASE_P(TransposePerfTestCPU, TransposePerfTest, INSTANTIATE_TEST_CASE_P(ResizePerfTestCPU, ResizePerfTest, Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC1, CV_16UC1, CV_16SC1), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA), Values(szSmall128, szVGA, sz720p, sz1080p), Values(cv::Size(64, 64), - cv::Size(30, 30)), + cv::Size(32, 32)), + Values(cv::compile_args(CORE_CPU)))); + +INSTANTIATE_TEST_CASE_P(BottleneckKernelsPerfTestCPU, BottleneckKernelsConstInputPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values("cv/optflow/frames/1080p_00.png", "cv/optflow/frames/720p_00.png", + "cv/optflow/frames/VGA_00.png", "cv/dnn_face/recognition/Aaron_Tippin_0001.jpg"), + Values(cv::compile_args(CORE_CPU)))); + +INSTANTIATE_TEST_CASE_P(ResizeInSimpleGraphPerfTestCPU, ResizeInSimpleGraphPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC3), + Values(szSmall128, szVGA, sz720p, sz1080p), Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(ResizeFxFyPerfTestCPU, ResizeFxFyPerfTest, Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC1, CV_16UC1, CV_16SC1), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA), Values(szSmall128, szVGA, sz720p, sz1080p), Values(0.5, 0.1), diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index 6be1e1a8c4..df3a2ea7b0 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -277,18 +277,31 @@ INSTANTIATE_TEST_CASE_P(ConvertToPerfTestFluid, ConvertToPerfTest, Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(ResizePerfTestFluid, ResizePerfTest, - Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC3/*CV_8UC1, CV_16UC1, CV_16SC1*/), - Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()), + Values(CV_8UC3), + Values(cv::INTER_LINEAR), Values(szSmall128, szVGA, sz720p, sz1080p), Values(cv::Size(64, 64), cv::Size(30, 30)), Values(cv::compile_args(CORE_FLUID)))); +#define IMGPROC_FLUID cv::gapi::imgproc::fluid::kernels() +INSTANTIATE_TEST_CASE_P(BottleneckKernelsPerfTestFluid, BottleneckKernelsConstInputPerfTest, + Combine(Values(AbsSimilarPoints(0, 
1).to_compare_f()), + Values("cv/optflow/frames/1080p_00.png", "cv/optflow/frames/720p_00.png", + "cv/optflow/frames/VGA_00.png", "cv/dnn_face/recognition/Aaron_Tippin_0001.jpg"), + Values(cv::compile_args(CORE_FLUID, IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(ResizeInSimpleGraphPerfTestFluid, ResizeInSimpleGraphPerfTest, + Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()), + Values(CV_8UC3), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(cv::compile_args(CORE_FLUID, IMGPROC_FLUID)))); + INSTANTIATE_TEST_CASE_P(ResizeFxFyPerfTestFluid, ResizeFxFyPerfTest, - Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC3/*CV_8UC1, CV_16UC1, CV_16SC1*/), - Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()), + Values(CV_8UC3), + Values(cv::INTER_LINEAR), Values(szSmall128, szVGA, sz720p, sz1080p), Values(0.5, 0.1), Values(0.5, 0.1), diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index e1e9332d5e..6cf76b2f58 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -19,6 +19,10 @@ #include #include +#if CV_SSE4_1 +#include "gfluidcore_simd_sse41.hpp" +#endif + #include "gfluidbuffer_priv.hpp" #include "gfluidbackend.hpp" #include "gfluidutils.hpp" @@ -2949,109 +2953,295 @@ GAPI_FLUID_KERNEL(GFluidPhase, cv::gapi::core::GPhase, false) } }; +template +struct LinearScratchDesc { + using alpha_t = typename Mapper::alpha_type; + using index_t = typename Mapper::index_type; + + alpha_t* alpha; + alpha_t* clone; + index_t* mapsx; + alpha_t* beta; + index_t* mapsy; + T* tmp; + + LinearScratchDesc(int /*inW*/, int /*inH*/, int outW, int outH, void* data) { + alpha = reinterpret_cast(data); + clone = reinterpret_cast(alpha + outW); + mapsx = reinterpret_cast(clone + outW*4); + beta = reinterpret_cast(mapsx + outW); + mapsy = reinterpret_cast(beta + outH); + tmp = reinterpret_cast (mapsy + outH*2); + } + + static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) { + auto size = outW * sizeof(alpha_t) + + outW * sizeof(alpha_t) * 4 + // alpha clones + outW * sizeof(index_t) + + outH * sizeof(alpha_t) + + outH * sizeof(index_t) * 2 + + inW * sizeof(T) * lpi * chanNum; + + return static_cast(size); + } +}; +static inline double invRatio(int inSz, int outSz) { + return static_cast(outSz) / inSz; +} + +static inline double ratio(int inSz, int outSz) { + return 1 / invRatio(inSz, outSz); +} + +template +static inline void initScratchLinear(const cv::GMatDesc& in, + const Size& outSz, + cv::gapi::fluid::Buffer& scratch, + int lpi) { + using alpha_type = typename Mapper::alpha_type; + static const auto unity = Mapper::unity; + + auto inSz = in.size; + auto sbufsize = LinearScratchDesc::bufSize(inSz.width, inSz.height, outSz.width, outSz.height, lpi); + + Size scratch_size{sbufsize, 1}; + + cv::GMatDesc desc; + desc.chan = 1; + desc.depth = CV_8UC1; + desc.size = scratch_size; + + cv::gapi::fluid::Buffer buffer(desc); + scratch = std::move(buffer); + + double hRatio = ratio(in.size.width, outSz.width); + double vRatio = ratio(in.size.height, outSz.height); + + LinearScratchDesc scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB()); + + auto *alpha = scr.alpha; + auto *clone = scr.clone; + auto *index = scr.mapsx; + + for (int x = 0; x < outSz.width; x++) { + auto map = Mapper::map(hRatio, 0, in.size.width, x); + auto alpha0 = map.alpha0; + auto index0 = 
map.index0;
+
+        // TRICK:
+        // The algorithm takes a pair of input pixels, sx0'th and sx1'th,
+        // and computes the result as alpha0*src[sx0] + alpha1*src[sx1].
+        // By definition, either sx1 == sx0 + 1 or sx1 == sx0, and
+        // alpha0 + alpha1 == unity (scaled appropriately).
+        // Here we modify the formulas for alpha0 and sx1: we assume
+        // that sx1 == sx0 + 1 always, and patch alpha0 so that the
+        // result remains intact.
+        // Note that we need in.size.width >= 2 so that both sx0 and
+        // sx0+1 index pixels inside the input's width.
+        if (map.index1 != map.index0 + 1) {
+            GAPI_DbgAssert(map.index1 == map.index0);
+            GAPI_DbgAssert(in.size.width >= 2);
+            if (map.index0 < in.size.width-1) {
+                // sx1=sx0+1 fits inside the row:
+                // make sure alpha0=unity and alpha1=0,
+                // so that the result equals src[sx0]*unity
+                alpha0 = saturate_cast<alpha_type>(unity);
+            } else {
+                // shift sx0 to the left by 1 pixel,
+                // and make sure that alpha0=0 and alpha1=unity,
+                // so that the result equals src[sx0+1]*unity
+                alpha0 = 0;
+                index0--;
+            }
+        }
+
+        alpha[x] = alpha0;
+        index[x] = index0;
+
+        for (int l = 0; l < 4; l++) {
+            clone[4*x + l] = alpha0;
+        }
+    }
+
+    auto *beta = scr.beta;
+    auto *index_y = scr.mapsy;
+
+    for (int y = 0; y < outSz.height; y++) {
+        auto mapY = Mapper::map(vRatio, 0, in.size.height, y);
+        beta[y] = mapY.alpha0;
+        index_y[y] = mapY.index0;
+        index_y[outSz.height + y] = mapY.index1;
+    }
+}
+
+template<typename F, typename I>
+struct MapperUnit {
+    F alpha0, alpha1;
+    I index0, index1;
+};
+
+inline static uint8_t calc(short alpha0, uint8_t src0, short alpha1, uint8_t src1) {
+    constexpr static const int half = 1 << 14;
+    return (src0 * alpha0 + src1 * alpha1 + half) >> 15;
+}
+struct Mapper {
+    constexpr static const int ONE = 1 << 15;
+    typedef short alpha_type;
+    typedef short index_type;
+    constexpr static const int unity = ONE;
+
+    typedef MapperUnit<alpha_type, index_type> Unit;
+
+    static inline Unit map(double ratio, int start, int max, int outCoord) {
+        float f = static_cast<float>((outCoord + 0.5) * ratio - 0.5);
+        int s = cvFloor(f);
+        f -= s;
+
+        Unit u;
+
+        u.index0 = static_cast<index_type>(std::max(s - start, 0));
+        u.index1 = static_cast<index_type>(((f == 0.0) || s + 1 >= max) ? 
s - start : s - start + 1); + + u.alpha0 = saturate_cast(ONE * (1.0f - f)); + u.alpha1 = saturate_cast(ONE * f); + + return u; + } +}; + +template +static void calcRowLinearC(const cv::gapi::fluid::View & in, + cv::gapi::fluid::Buffer& out, + cv::gapi::fluid::Buffer& scratch) { + using alpha_type = typename Mapper::alpha_type; + + auto inSz = in.meta().size; + auto outSz = out.meta().size; + + auto inY = in.y(); + int outY = out.y(); + int lpi = out.lpi(); + + GAPI_DbgAssert(outY + lpi <= outSz.height); + GAPI_DbgAssert(lpi <= 4); + + LinearScratchDesc scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB()); + + const auto *alpha = scr.alpha; + const auto *mapsx = scr.mapsx; + const auto *beta_0 = scr.beta; + const auto *mapsy = scr.mapsy; + + const auto *beta = beta_0 + outY; + const T *src0[4]; + const T *src1[4]; + T* dst[4]; + + for (int l = 0; l < lpi; l++) { + auto index0 = mapsy[outY + l] - inY; + auto index1 = mapsy[outSz.height + outY + l] - inY; + src0[l] = in.InLine(index0); + src1[l] = in.InLine(index1); + dst[l] = out.OutLine(l); + } + +#if CV_SSE4_1 + const auto* clone = scr.clone; + auto* tmp = scr.tmp; + + if (inSz.width >= 16 && outSz.width >= 16) + { + sse42::calcRowLinear_8UC_Impl_(reinterpret_cast(dst), + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + + return; + } +#endif // CV_SSE4_1 + int length = out.length(); + for (int l = 0; l < lpi; l++) { + constexpr static const auto unity = Mapper::unity; + + auto beta0 = beta[l]; + auto beta1 = saturate_cast(unity - beta[l]); + + for (int x = 0; x < length; x++) { + auto alpha0 = alpha[x]; + auto alpha1 = saturate_cast(unity - alpha[x]); + auto sx0 = mapsx[x]; + auto sx1 = sx0 + 1; + + for (int c = 0; c < numChan; c++) { + auto idx0 = numChan*sx0 + c; + auto idx1 = numChan*sx1 + c; + T tmp0 = calc(beta0, src0[l][idx0], beta1, src1[l][idx0]); + T tmp1 = calc(beta0, src0[l][idx1], beta1, src1[l][idx1]); + dst[l][numChan * x + c] = calc(alpha0, tmp0, alpha1, tmp1); + } + } + } +} + GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::core::GResize, true) { static const int Window = 1; + static const int LPI = 4; static const auto Kind = GFluidKernel::Kind::Resize; constexpr static const int INTER_RESIZE_COEF_BITS = 11; constexpr static const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS; constexpr static const short ONE = INTER_RESIZE_COEF_SCALE; - struct ResizeUnit + static void initScratch(const cv::GMatDesc& in, + cv::Size outSz, double fx, double fy, int /*interp*/, + cv::gapi::fluid::Buffer &scratch) { - short alpha0; - short alpha1; - int s0; - int s1; - }; + int outSz_w; + int outSz_h; + if (outSz.width == 0 || outSz.height == 0) + { + outSz_w = static_cast(round(in.size.width * fx)); + outSz_h = static_cast(round(in.size.height * fy)); + } + else + { + outSz_w = outSz.width; + outSz_h = outSz.height; + } + cv::Size outSize(outSz_w, outSz_h); - static ResizeUnit map(double ratio, int start, int max, int outCoord) - { - float f = static_cast((outCoord + 0.5f) * ratio - 0.5f); - int s = cvFloor(f); - f -= s; - - ResizeUnit ru; - - ru.s0 = std::max(s - start, 0); - ru.s1 = ((f == 0.0) || s + 1 >= max) ? 
s - start : s - start + 1; - - ru.alpha0 = saturate_cast((1.0f - f) * INTER_RESIZE_COEF_SCALE); - ru.alpha1 = saturate_cast((f) * INTER_RESIZE_COEF_SCALE); - - return ru; - } - - static void initScratch(const cv::GMatDesc& in, - cv::Size outSz, double fx, double fy, int /*interp*/, - cv::gapi::fluid::Buffer &scratch) - { - GAPI_Assert(in.depth == CV_8U && in.chan == 3); - - if (outSz.area() == 0) - { - outSz.width = static_cast(round(in.size.width * fx)); - outSz.height = static_cast(round(in.size.height * fy)); - } - - cv::Size scratch_size{static_cast(outSz.width * sizeof(ResizeUnit)), 1}; - - cv::GMatDesc desc; - desc.chan = 1; - desc.depth = CV_8UC1; - desc.size = scratch_size; - - cv::gapi::fluid::Buffer buffer(desc); - scratch = std::move(buffer); - - ResizeUnit* mapX = scratch.OutLine(); - double hRatio = (double)in.size.width / outSz.width; - - for (int x = 0, w = outSz.width; x < w; x++) - { - mapX[x] = map(hRatio, 0, in.size.width, x); - } + if (in.chan == 3) + { + initScratchLinear(in, outSize, scratch, LPI); + } + else if (in.chan == 4) + { + initScratchLinear(in, outSize, scratch, LPI); + } } static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {} - static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/, - cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) - { - double vRatio = (double)in.meta().size.height / out.meta().size.height; - auto mapY = map(vRatio, in.y(), in.meta().size.height, out.y()); + static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int interp, + cv::gapi::fluid::Buffer& out, + cv::gapi::fluid::Buffer& scratch) { + const int channels = in.meta().chan; + GAPI_Assert((channels == 3 || channels == 4) && (interp == cv::INTER_LINEAR)); - auto beta0 = mapY.alpha0; - auto beta1 = mapY.alpha1; - - const auto src0 = in.InLine (mapY.s0); - const auto src1 = in.InLine (mapY.s1); - - auto dst = out.OutLine(); - - ResizeUnit* mapX = scratch.OutLine(); - - for (int x = 0; x < out.length(); x++) + if (channels == 3) { - short alpha0 = mapX[x].alpha0; - short alpha1 = mapX[x].alpha1; - int sx0 = mapX[x].s0; - int sx1 = mapX[x].s1; - - int res00 = src0[3*sx0 ]*alpha0 + src0[3*(sx1) ]*alpha1; - int res10 = src1[3*sx0 ]*alpha0 + src1[3*(sx1) ]*alpha1; - - int res01 = src0[3*sx0 + 1]*alpha0 + src0[3*(sx1) + 1]*alpha1; - int res11 = src1[3*sx0 + 1]*alpha0 + src1[3*(sx1) + 1]*alpha1; - - int res02 = src0[3*sx0 + 2]*alpha0 + src0[3*(sx1) + 2]*alpha1; - int res12 = src1[3*sx0 + 2]*alpha0 + src1[3*(sx1) + 2]*alpha1; - - dst[3*x ] = uchar(( ((beta0 * (res00 >> 4)) >> 16) + ((beta1 * (res10 >> 4)) >> 16) + 2)>>2); - dst[3*x + 1] = uchar(( ((beta0 * (res01 >> 4)) >> 16) + ((beta1 * (res11 >> 4)) >> 16) + 2)>>2); - dst[3*x + 2] = uchar(( ((beta0 * (res02 >> 4)) >> 16) + ((beta1 * (res12 >> 4)) >> 16) + 2)>>2); + calcRowLinearC(in, out, scratch); + } + else if (channels == 4) + { + calcRowLinearC(in, out, scratch); } } }; diff --git a/modules/gapi/src/backends/fluid/gfluidcore_simd_sse41.hpp b/modules/gapi/src/backends/fluid/gfluidcore_simd_sse41.hpp new file mode 100644 index 0000000000..02fff30977 --- /dev/null +++ b/modules/gapi/src/backends/fluid/gfluidcore_simd_sse41.hpp @@ -0,0 +1,733 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// +// Copyright (C) 2021 Intel Corporation + +#if !defined(GAPI_STANDALONE) + +#include "opencv2/gapi/own/saturate.hpp" + +#include + +#include "opencv2/core.hpp" + +#include + +#include +#include + +#include +#include +#include + +#if defined __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstrict-overflow" +#endif +namespace cv { +namespace gapi { +namespace fluid { +namespace sse42 { + +CV_ALWAYS_INLINE void v_gather_pixel_map(v_uint8x16& vec, const uchar src[], const short* index, const int pos) +{ + const int chanNum = 4; + + // pixel_1 (rgbx) + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&src[chanNum * (*index + pos)]), 0); + // pixel_2 (rgbx) + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&src[chanNum * (*(index + 1) + pos)]), 1); + // pixel_3 + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&src[chanNum * (*(index + 2) + pos)]), 2); + // pixel_4 + vec.val = _mm_insert_epi32(vec.val, *reinterpret_cast(&src[chanNum * (*(index + 3) + pos)]), 3); +} + +CV_ALWAYS_INLINE void resize_vertical_anyLPI(const uchar* src0, const uchar* src1, + uchar* dst, const int inLength, + const short beta) { + constexpr int nlanes = 16; + __m128i zero = _mm_setzero_si128(); + __m128i b = _mm_set1_epi16(beta); + + for (int w = 0; inLength >= nlanes;) + { + for (; w <= inLength - nlanes; w += nlanes) + { + __m128i s0 = _mm_lddqu_si128(reinterpret_cast(&src0[w])); + __m128i s1 = _mm_lddqu_si128(reinterpret_cast(&src1[w])); + __m128i a1 = _mm_unpacklo_epi8(s0, zero); + __m128i b1 = _mm_unpacklo_epi8(s1, zero); + __m128i a2 = _mm_unpackhi_epi8(s0, zero); + __m128i b2 = _mm_unpackhi_epi8(s1, zero); + __m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a1, b1), b); + __m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a2, b2), b); + __m128i res1 = _mm_add_epi16(r1, b1); + __m128i res2 = _mm_add_epi16(r2, b2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + w), _mm_packus_epi16(res1, res2)); + } + + if (w < inLength) { + w = inLength - nlanes; + continue; + } + break; + } +} + + +CV_ALWAYS_INLINE void resize_horizontal_anyLPI(uint8_t* dst, + const uchar* src, const short mapsx[], + const short alpha[], const int width) +{ + constexpr int nlanes = 16; + constexpr int chanNum = 3; + __m128i zero = _mm_setzero_si128(); + + for (int x = 0; width >= nlanes;) + { + for (; x <= width - nlanes; x += nlanes) + { + __m128i a012 = _mm_setr_epi16(alpha[x], alpha[x], alpha[x], alpha[x + 1], + alpha[x + 1], alpha[x + 1], alpha[x + 2], alpha[x + 2]); + __m128i a2345 = _mm_setr_epi16(alpha[x + 2], alpha[x + 3], alpha[x + 3], alpha[x + 3], + alpha[x + 4], alpha[x + 4], alpha[x + 4], alpha[x + 5]); + + __m128i a567 = _mm_setr_epi16(alpha[x + 5], alpha[x + 5], alpha[x + 6], alpha[x + 6], + alpha[x + 6], alpha[x + 7], alpha[x + 7], alpha[x + 7]); + __m128i a8910 = _mm_setr_epi16(alpha[x + 8], alpha[x + 8], alpha[x + 8], alpha[x + 9], + alpha[x + 9], alpha[x + 9], alpha[x + 10], alpha[x + 10]); + + __m128i a10111213 = _mm_setr_epi16(alpha[x + 10], alpha[x + 11], alpha[x + 11], alpha[x + 11], + alpha[x + 12], alpha[x + 12], alpha[x + 12], alpha[x + 13]); + __m128i a131415 = _mm_setr_epi16(alpha[x + 13], alpha[x + 13], alpha[x + 14], alpha[x + 14], + alpha[x + 14], alpha[x + 15], alpha[x + 15], alpha[x + 15]); + + __m128i a1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 0)], src[chanNum * (mapsx[x] + 0) + 1], src[chanNum * (mapsx[x] + 0) + 2], + src[chanNum * (mapsx[x + 1] + 0)], src[chanNum * (mapsx[x + 1] + 0) + 1], src[chanNum * (mapsx[x + 1] + 0) + 2], + src[chanNum * (mapsx[x + 2] + 0)], 
src[chanNum * (mapsx[x + 2] + 0) + 1], src[chanNum * (mapsx[x + 2] + 0) + 2], + src[chanNum * (mapsx[x + 3] + 0)], src[chanNum * (mapsx[x + 3] + 0) + 1], src[chanNum * (mapsx[x + 3] + 0) + 2], + src[chanNum * (mapsx[x + 4] + 0)], src[chanNum * (mapsx[x + 4] + 0) + 1], src[chanNum * (mapsx[x + 4] + 0) + 2], + src[chanNum * (mapsx[x + 5] + 0)]); + __m128i b1 = _mm_setr_epi8(src[chanNum * (mapsx[x] + 1)], src[chanNum * (mapsx[x] + 1) + 1], src[chanNum * (mapsx[x] + 1) + 2], + src[chanNum * (mapsx[x + 1] + 1)], src[chanNum * (mapsx[x + 1] + 1) + 1], src[chanNum * (mapsx[x + 1] + 1) + 2], + src[chanNum * (mapsx[x + 2] + 1)], src[chanNum * (mapsx[x + 2] + 1) + 1], src[chanNum * (mapsx[x + 2] + 1) + 2], + src[chanNum * (mapsx[x + 3] + 1)], src[chanNum * (mapsx[x + 3] + 1) + 1], src[chanNum * (mapsx[x + 3] + 1) + 2], + src[chanNum * (mapsx[x + 4] + 1)], src[chanNum * (mapsx[x + 4] + 1) + 1], src[chanNum * (mapsx[x + 4] + 1) + 2], + src[chanNum * (mapsx[x + 5] + 1)]); + + __m128i a2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 0) + 1], src[chanNum * (mapsx[x + 5] + 0) + 2], src[chanNum * (mapsx[x + 6] + 0)], + src[chanNum * (mapsx[x + 6] + 0) + 1], src[chanNum * (mapsx[x + 6] + 0) + 2], src[chanNum * (mapsx[x + 7] + 0)], + src[chanNum * (mapsx[x + 7] + 0) + 1], src[chanNum * (mapsx[x + 7] + 0) + 2], src[chanNum * (mapsx[x + 8] + 0)], + src[chanNum * (mapsx[x + 8] + 0) + 1], src[chanNum * (mapsx[x + 8] + 0) + 2], src[chanNum * (mapsx[x + 9] + 0)], + src[chanNum * (mapsx[x + 9] + 0) + 1], src[chanNum * (mapsx[x + 9] + 0) + 2], src[chanNum * (mapsx[x + 10] + 0)], + src[chanNum * (mapsx[x + 10] + 0) + 1]); + + __m128i b2 = _mm_setr_epi8(src[chanNum * (mapsx[x + 5] + 1) + 1], src[chanNum * (mapsx[x + 5] + 1) + 2], src[chanNum * (mapsx[x + 6] + 1)], + src[chanNum * (mapsx[x + 6] + 1) + 1], src[chanNum * (mapsx[x + 6] + 1) + 2], src[chanNum * (mapsx[x + 7] + 1)], + src[chanNum * (mapsx[x + 7] + 1) + 1], src[chanNum * (mapsx[x + 7] + 1) + 2], src[chanNum * (mapsx[x + 8] + 1)], + src[chanNum * (mapsx[x + 8] + 1) + 1], src[chanNum * (mapsx[x + 8] + 1) + 2], src[chanNum * (mapsx[x + 9] + 1)], + src[chanNum * (mapsx[x + 9] + 1) + 1], src[chanNum * (mapsx[x + 9] + 1) + 2], src[chanNum * (mapsx[x + 10] + 1)], + src[chanNum * (mapsx[x + 10] + 1) + 1]); + + __m128i a3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 0) + 2], src[chanNum * (mapsx[x + 11] + 0)], src[chanNum * (mapsx[x + 11] + 0) + 1], + src[chanNum * (mapsx[x + 11] + 0) + 2], src[chanNum * (mapsx[x + 12] + 0)], src[chanNum * (mapsx[x + 12] + 0) + 1], + src[chanNum * (mapsx[x + 12] + 0) + 2], src[chanNum * (mapsx[x + 13] + 0)], src[chanNum * (mapsx[x + 13] + 0) + 1], + src[chanNum * (mapsx[x + 13] + 0) + 2], src[chanNum * (mapsx[x + 14] + 0)], src[chanNum * (mapsx[x + 14] + 0) + 1], + src[chanNum * (mapsx[x + 14] + 0) + 2], src[chanNum * (mapsx[x + 15] + 0)], src[chanNum * (mapsx[x + 15] + 0) + 1], + src[chanNum * (mapsx[x + 15] + 0) + 2]); + + __m128i b3 = _mm_setr_epi8(src[chanNum * (mapsx[x + 10] + 1) + 2], src[chanNum * (mapsx[x + 11] + 1)], src[chanNum * (mapsx[x + 11] + 1) + 1], + src[chanNum * (mapsx[x + 11] + 1) + 2], src[chanNum * (mapsx[x + 12] + 1)], src[chanNum * (mapsx[x + 12] + 1) + 1], + src[chanNum * (mapsx[x + 12] + 1) + 2], src[chanNum * (mapsx[x + 13] + 1)], src[chanNum * (mapsx[x + 13] + 1) + 1], + src[chanNum * (mapsx[x + 13] + 1) + 2], src[chanNum * (mapsx[x + 14] + 1)], src[chanNum * (mapsx[x + 14] + 1) + 1], + src[chanNum * (mapsx[x + 14] + 1) + 2], src[chanNum * (mapsx[x + 15] + 1)], src[chanNum * (mapsx[x + 15] + 1) + 1], + 
src[chanNum * (mapsx[x + 15] + 1) + 2]); + + __m128i a11 = _mm_unpacklo_epi8(a1, zero); + __m128i a12 = _mm_unpackhi_epi8(a1, zero); + __m128i a21 = _mm_unpacklo_epi8(a2, zero); + __m128i a22 = _mm_unpackhi_epi8(a2, zero); + __m128i a31 = _mm_unpacklo_epi8(a3, zero); + __m128i a32 = _mm_unpackhi_epi8(a3, zero); + __m128i b11 = _mm_unpacklo_epi8(b1, zero); + __m128i b12 = _mm_unpackhi_epi8(b1, zero); + __m128i b21 = _mm_unpacklo_epi8(b2, zero); + __m128i b22 = _mm_unpackhi_epi8(b2, zero); + __m128i b31 = _mm_unpacklo_epi8(b3, zero); + __m128i b32 = _mm_unpackhi_epi8(b3, zero); + + __m128i r1 = _mm_mulhrs_epi16(_mm_sub_epi16(a11, b11), a012); + __m128i r2 = _mm_mulhrs_epi16(_mm_sub_epi16(a12, b12), a2345); + __m128i r3 = _mm_mulhrs_epi16(_mm_sub_epi16(a21, b21), a567); + __m128i r4 = _mm_mulhrs_epi16(_mm_sub_epi16(a22, b22), a8910); + __m128i r5 = _mm_mulhrs_epi16(_mm_sub_epi16(a31, b31), a10111213); + __m128i r6 = _mm_mulhrs_epi16(_mm_sub_epi16(a32, b32), a131415); + + __m128i r_1 = _mm_add_epi16(b11, r1); + __m128i r_2 = _mm_add_epi16(b12, r2); + __m128i r_3 = _mm_add_epi16(b21, r3); + __m128i r_4 = _mm_add_epi16(b22, r4); + __m128i r_5 = _mm_add_epi16(b31, r5); + __m128i r_6 = _mm_add_epi16(b32, r6); + + __m128i res1 = _mm_packus_epi16(r_1, r_2); + __m128i res2 = _mm_packus_epi16(r_3, r_4); + __m128i res3 = _mm_packus_epi16(r_5, r_6); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x]), res1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 16]), res2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[chanNum * x + 32]), res3); + } + if (x < width) { + x = width - nlanes; + continue; + } + break; + } +} + +template +CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(uint8_t**, + const uint8_t**, + const uint8_t**, + const short* , + const short* , + const short*, + const short* , + uint8_t*, + const Size& , + const Size& , + const int ) +{ + static_assert(chanNum != 3, "Unsupported number of channel"); +} +template<> +CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_<3>(uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short* clone, // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + const int lpi) { + bool xRatioEq = inSz.width == outSz.width; + bool yRatioEq = inSz.height == outSz.height; + constexpr int nlanes = 16; + constexpr int half_nlanes = 16 / 2; + constexpr int chanNum = 3; + + if (!xRatioEq && !yRatioEq) { + int inLength = inSz.width * chanNum; + + if (lpi == 4) + { + // vertical pass + __m128i b0 = _mm_set1_epi16(beta[0]); + __m128i b1 = _mm_set1_epi16(beta[1]); + __m128i b2 = _mm_set1_epi16(beta[2]); + __m128i b3 = _mm_set1_epi16(beta[3]); + __m128i zero = _mm_setzero_si128(); + __m128i vertical_shuf_mask = _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15); + + for (int w = 0; w < inSz.width * chanNum; ) { + for (; w <= inSz.width * chanNum - half_nlanes && w >= 0; w += half_nlanes) { +#ifdef __i386__ + __m128i val0lo = _mm_castpd_si128(_mm_loadh_pd( + _mm_load_sd(reinterpret_cast(&src0[0][w])), + reinterpret_cast(&src0[1][w]))); + __m128i val0hi = _mm_castpd_si128(_mm_loadh_pd( + _mm_load_sd(reinterpret_cast(&src0[2][w])), + reinterpret_cast(&src0[3][w]))); + __m128i val1lo = _mm_castpd_si128(_mm_loadh_pd( + _mm_load_sd(reinterpret_cast(&src1[0][w])), + reinterpret_cast(&src1[1][w]))); + __m128i val1hi = _mm_castpd_si128(_mm_loadh_pd( + _mm_load_sd(reinterpret_cast(&src1[2][w])), + reinterpret_cast(&src1[3][w]))); 
+#else + __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast(&src0[0][w])), + *reinterpret_cast(&src0[1][w]), 1); + __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast(&src0[2][w])), + *reinterpret_cast(&src0[3][w]), 1); + __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast(&src1[0][w])), + *reinterpret_cast(&src1[1][w]), 1); + __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast(&src1[2][w])), + *reinterpret_cast(&src1[3][w]), 1); +#endif + __m128i val0_0 = _mm_cvtepu8_epi16(val0lo); + __m128i val0_2 = _mm_cvtepu8_epi16(val0hi); + __m128i val1_0 = _mm_cvtepu8_epi16(val1lo); + __m128i val1_2 = _mm_cvtepu8_epi16(val1hi); + + __m128i val0_1 = _mm_unpackhi_epi8(val0lo, zero); + __m128i val0_3 = _mm_unpackhi_epi8(val0hi, zero); + __m128i val1_1 = _mm_unpackhi_epi8(val1lo, zero); + __m128i val1_3 = _mm_unpackhi_epi8(val1hi, zero); + + __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0); + __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1); + __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2); + __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3); + + __m128i r0 = _mm_add_epi16(val1_0, t0); + __m128i r1 = _mm_add_epi16(val1_1, t1); + __m128i r2 = _mm_add_epi16(val1_2, t2); + __m128i r3 = _mm_add_epi16(val1_3, t3); + + __m128i q0 = _mm_packus_epi16(r0, r1); + __m128i q1 = _mm_packus_epi16(r2, r3); + + __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/); + __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/); + + __m128i q4 = _mm_shuffle_epi8(q2, vertical_shuf_mask); + __m128i q5 = _mm_shuffle_epi8(q3, vertical_shuf_mask); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[4 * w + 0]), q4); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&tmp[4 * w + 16]), q5); + } + + if (w < inSz.width * chanNum) { + w = inSz.width * chanNum - half_nlanes; + } + } + + // horizontal pass + __m128i horizontal_shuf_mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + for (int x = 0; outSz.width >= nlanes; ) + { + for (; x <= outSz.width - nlanes; x += nlanes) + { +#ifdef _WIN64 + __m128i a00 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * x]), *reinterpret_cast(&clone[4 * x])); + __m128i a01 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * x]), *reinterpret_cast(&clone[4 * (x + 1)])); + __m128i a11 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 1)]), *reinterpret_cast(&clone[4 * (x + 1)])); + __m128i a22 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 2)]), *reinterpret_cast(&clone[4 * (x + 2)])); + __m128i a23 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 2)]), *reinterpret_cast(&clone[4 * (x + 3)])); + __m128i a33 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 3)]), *reinterpret_cast(&clone[4 * (x + 3)])); + __m128i a44 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 4)]), *reinterpret_cast(&clone[4 * (x + 4)])); + __m128i a45 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 4)]), *reinterpret_cast(&clone[4 * (x + 5)])); + __m128i a55 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 5)]), *reinterpret_cast(&clone[4 * (x + 5)])); + __m128i a66 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 6)]), *reinterpret_cast(&clone[4 * (x + 6)])); + __m128i a67 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 6)]), *reinterpret_cast(&clone[4 * (x + 7)])); + __m128i a77 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 7)]), *reinterpret_cast(&clone[4 * (x + 7)])); + __m128i a88 = 
_mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 8)]), *reinterpret_cast(&clone[4 * (x + 8)])); + __m128i a89 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 8)]), *reinterpret_cast(&clone[4 * (x + 9)])); + __m128i a99 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 9)]), *reinterpret_cast(&clone[4 * (x + 9)])); + __m128i a1010 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 10)]), *reinterpret_cast(&clone[4 * (x + 10)])); + __m128i a1011 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 10)]), *reinterpret_cast(&clone[4 * (x + 11)])); + __m128i a1111 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 11)]), *reinterpret_cast(&clone[4 * (x + 11)])); + __m128i a1212 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 12)]), *reinterpret_cast(&clone[4 * (x + 12)])); + __m128i a1213 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 12)]), *reinterpret_cast(&clone[4 * (x + 13)])); + __m128i a1313 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 13)]), *reinterpret_cast(&clone[4 * (x + 13)])); + __m128i a1414 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 14)]), *reinterpret_cast(&clone[4 * (x + 14)])); + __m128i a1415 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 14)]), *reinterpret_cast(&clone[4 * (x + 15)])); + __m128i a1515 = _mm_setr_epi64x(*reinterpret_cast(&clone[4 * (x + 15)]), *reinterpret_cast(&clone[4 * (x + 15)])); +#else + __m128i a00 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * x]), *reinterpret_cast(&clone[4 * x])); + __m128i a01 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * x]), *reinterpret_cast(&clone[4 * (x + 1)])); + __m128i a11 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 1)]), *reinterpret_cast(&clone[4 * (x + 1)])); + __m128i a22 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 2)]), *reinterpret_cast(&clone[4 * (x + 2)])); + __m128i a23 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 2)]), *reinterpret_cast(&clone[4 * (x + 3)])); + __m128i a33 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 3)]), *reinterpret_cast(&clone[4 * (x + 3)])); + __m128i a44 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 4)]), *reinterpret_cast(&clone[4 * (x + 4)])); + __m128i a45 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 4)]), *reinterpret_cast(&clone[4 * (x + 5)])); + __m128i a55 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 5)]), *reinterpret_cast(&clone[4 * (x + 5)])); + __m128i a66 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 6)]), *reinterpret_cast(&clone[4 * (x + 6)])); + __m128i a67 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 6)]), *reinterpret_cast(&clone[4 * (x + 7)])); + __m128i a77 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 7)]), *reinterpret_cast(&clone[4 * (x + 7)])); + __m128i a88 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 8)]), *reinterpret_cast(&clone[4 * (x + 8)])); + __m128i a89 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 8)]), *reinterpret_cast(&clone[4 * (x + 9)])); + __m128i a99 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 9)]), *reinterpret_cast(&clone[4 * (x + 9)])); + __m128i a1010 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 10)]), *reinterpret_cast(&clone[4 * (x + 10)])); + __m128i a1011 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 10)]), *reinterpret_cast(&clone[4 * (x + 11)])); + __m128i a1111 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 11)]), *reinterpret_cast(&clone[4 * (x + 11)])); + __m128i a1212 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 12)]), *reinterpret_cast(&clone[4 * (x + 12)])); + __m128i a1213 = 
_mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 12)]), *reinterpret_cast(&clone[4 * (x + 13)])); + __m128i a1313 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 13)]), *reinterpret_cast(&clone[4 * (x + 13)])); + __m128i a1414 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 14)]), *reinterpret_cast(&clone[4 * (x + 14)])); + __m128i a1415 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 14)]), *reinterpret_cast(&clone[4 * (x + 15)])); + __m128i a1515 = _mm_setr_epi64(*reinterpret_cast(&clone[4 * (x + 15)]), *reinterpret_cast(&clone[4 * (x + 15)])); +#endif + + // load 3 channels of first pixel from first pair of 4-couple scope + __m128i pix1 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x])])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix1 = _mm_insert_epi32(pix1, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 1])]), 3); + + // load 3 channels of neighbor pixel from first pair of 4-couple scope + __m128i pix2 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x] + 1))])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix2 = _mm_insert_epi32(pix2, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 1] + 1))]), 3); + + // expand 8-bit data to 16-bit + __m128i val_0 = _mm_unpacklo_epi8(pix1, zero); + __m128i val_1 = _mm_unpacklo_epi8(pix2, zero); + + // expand 8-bit data to 16-bit + __m128i val_2 = _mm_unpackhi_epi8(pix1, zero); + __m128i val_3 = _mm_unpackhi_epi8(pix2, zero); + + // the main calculations + __m128i t0_0 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a00); + __m128i t1_0 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a01); + __m128i r0_0 = _mm_add_epi16(val_1, t0_0); + __m128i r1_0 = _mm_add_epi16(val_3, t1_0); + + // pack 16-bit data to 8-bit + __m128i q0_0 = _mm_packus_epi16(r0_0, r1_0); + // gather data from the same lines together + __m128i res1 = _mm_shuffle_epi8(q0_0, horizontal_shuf_mask); + + val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 1] + 1)]), 0), zero); + val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + 1)]), 0), zero); + + val_2 = _mm_insert_epi64(val_2, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 2])]), 0); + val_3 = _mm_insert_epi64(val_3, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 2] + 1))]), 0); + + val_2 = _mm_unpacklo_epi8(val_2, zero); + val_3 = _mm_unpacklo_epi8(val_3, zero); + + __m128i t0_1 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a11); + __m128i t1_1 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a22); + __m128i r0_1 = _mm_add_epi16(val_1, t0_1); + __m128i r1_1 = _mm_add_epi16(val_3, t1_1); + + __m128i q0_1 = _mm_packus_epi16(r0_1, r1_1); + __m128i res2 = _mm_shuffle_epi8(q0_1, horizontal_shuf_mask); + + __m128i pix7 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 3] - 1) + 2)])); + pix7 = _mm_insert_epi32(pix7, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 2] + 2)]), 0); + + __m128i pix8 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 3] + 2)])); + pix8 = _mm_insert_epi32(pix8, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + 2)]), 0); + + val_0 = _mm_unpacklo_epi8(pix7, zero); + val_1 = _mm_unpacklo_epi8(pix8, zero); + + val_2 = _mm_unpackhi_epi8(pix7, zero); + val_3 = _mm_unpackhi_epi8(pix8, zero); + + // the main calculations + __m128i t0_2 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a23); + __m128i t1_2 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, 
val_3), a33); + __m128i r0_2 = _mm_add_epi16(val_1, t0_2); + __m128i r1_2 = _mm_add_epi16(val_3, t1_2); + + // pack 16-bit data to 8-bit + __m128i q0_2 = _mm_packus_epi16(r0_2, r1_2); + __m128i res3 = _mm_shuffle_epi8(q0_2, horizontal_shuf_mask); + + __m128i pix9 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 4])])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix9 = _mm_insert_epi32(pix9, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 5])]), 3); + + // load 3 channels of neighbor pixel from first pair of 4-couple scope + __m128i pix10 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 4] + 1))])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix10 = _mm_insert_epi32(pix10, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 5] + 1))]), 3); + + // expand 8-bit data to 16-bit + val_0 = _mm_unpacklo_epi8(pix9, zero); + val_1 = _mm_unpacklo_epi8(pix10, zero); + + // expand 8-bit data to 16-bit + val_2 = _mm_unpackhi_epi8(pix9, zero); + val_3 = _mm_unpackhi_epi8(pix10, zero); + + // the main calculations + __m128i t0_3 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a44); + __m128i t1_3 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a45); + __m128i r0_3 = _mm_add_epi16(val_1, t0_3); + __m128i r1_3 = _mm_add_epi16(val_3, t1_3); + + // pack 16-bit data to 8-bit + __m128i q0_3 = _mm_packus_epi16(r0_3, r1_3); + // gather data from the same lines together + __m128i res4 = _mm_shuffle_epi8(q0_3, horizontal_shuf_mask); + + val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 5] + 1)]), 0), zero); + val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + 1)]), 0), zero); + + val_2 = _mm_insert_epi64(val_2, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 6])]), 0); + val_3 = _mm_insert_epi64(val_3, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 6] + 1))]), 0); + + val_2 = _mm_unpacklo_epi8(val_2, zero); + val_3 = _mm_unpacklo_epi8(val_3, zero); + + __m128i t0_4 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a55); + __m128i t1_4 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a66); + __m128i r0_4 = _mm_add_epi16(val_1, t0_4); + __m128i r1_4 = _mm_add_epi16(val_3, t1_4); + + __m128i q0_4 = _mm_packus_epi16(r0_4, r1_4); + __m128i res5 = _mm_shuffle_epi8(q0_4, horizontal_shuf_mask); + + __m128i pix15 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 7] - 1) + 2)])); + pix15 = _mm_insert_epi32(pix15, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 6] + 2)]), 0); + + __m128i pix16 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 7] + 2)])); + pix16 = _mm_insert_epi32(pix16, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + 2)]), 0); + + val_0 = _mm_unpacklo_epi8(pix15, zero); + val_1 = _mm_unpacklo_epi8(pix16, zero); + + val_2 = _mm_unpackhi_epi8(pix15, zero); + val_3 = _mm_unpackhi_epi8(pix16, zero); + + // the main calculations + __m128i t0_5 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a67); + __m128i t1_5 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a77); + __m128i r0_5 = _mm_add_epi16(val_1, t0_5); + __m128i r1_5 = _mm_add_epi16(val_3, t1_5); + + // pack 16-bit data to 8-bit + __m128i q0_5 = _mm_packus_epi16(r0_5, r1_5); + __m128i res6 = _mm_shuffle_epi8(q0_5, horizontal_shuf_mask); + + __m128i bl1 = _mm_blend_epi16(res1, _mm_slli_si128(res2, 4), 0xCC /*0b11001100*/); + __m128i bl2 = _mm_blend_epi16(_mm_srli_si128(res1, 4), res2, 0xCC 
/*0b11001100*/); + + __m128i bl3 = _mm_blend_epi16(res3, _mm_slli_si128(res4, 4), 0xCC /*0b11001100*/); + __m128i bl4 = _mm_blend_epi16(_mm_srli_si128(res3, 4), res4, 0xCC /*0b11001100*/); + + __m128i bl5 = _mm_blend_epi16(res5, _mm_slli_si128(res6, 4), 0xCC /*0b11001100*/); + __m128i bl6 = _mm_blend_epi16(_mm_srli_si128(res5, 4), res6, 0xCC /*0b11001100*/); + + __m128i bl13 = _mm_blend_epi16(bl1, _mm_slli_si128(bl3, 8), 0xF0 /*0b11110000*/); + __m128i bl31 = _mm_blend_epi16(_mm_srli_si128(bl1, 8), bl3, 0xF0 /*0b11110000*/); + + __m128i bl24 = _mm_blend_epi16(bl2, _mm_slli_si128(bl4, 8), 0xF0 /*0b11110000*/); + __m128i bl42 = _mm_blend_epi16(_mm_srli_si128(bl2, 8), bl4, 0xF0 /*0b11110000*/); + + // load 3 channels of first pixel from first pair of 4-couple scope + __m128i pix17 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 8])])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix17 = _mm_insert_epi32(pix17, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 9])]), 3); + + // load 3 channels of neighbor pixel from first pair of 4-couple scope + __m128i pix18 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 8] + 1))])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix18 = _mm_insert_epi32(pix18, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 9] + 1))]), 3); + + // expand 8-bit data to 16-bit + val_0 = _mm_unpacklo_epi8(pix17, zero); + val_1 = _mm_unpacklo_epi8(pix18, zero); + + // expand 8-bit data to 16-bit + val_2 = _mm_unpackhi_epi8(pix17, zero); + val_3 = _mm_unpackhi_epi8(pix18, zero); + + // the main calculations + __m128i t0_6 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a88); + __m128i t1_6 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a89); + __m128i r0_6 = _mm_add_epi16(val_1, t0_6); + __m128i r1_6 = _mm_add_epi16(val_3, t1_6); + + // pack 16-bit data to 8-bit + __m128i q0_6 = _mm_packus_epi16(r0_6, r1_6); + // gather data from the same lines together + __m128i res7 = _mm_shuffle_epi8(q0_6, horizontal_shuf_mask); + + val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 9] + 1)]), 0), zero); + val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 9] + 1) + 1)]), 0), zero); + + val_2 = _mm_insert_epi64(val_2, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 10])]), 0); + val_3 = _mm_insert_epi64(val_3, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 10] + 1))]), 0); + + val_2 = _mm_unpacklo_epi8(val_2, zero); + val_3 = _mm_unpacklo_epi8(val_3, zero); + + __m128i t0_7 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a99); + __m128i t1_7 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1010); + __m128i r0_7 = _mm_add_epi16(val_1, t0_7); + __m128i r1_7 = _mm_add_epi16(val_3, t1_7); + + __m128i q0_7 = _mm_packus_epi16(r0_7, r1_7); + __m128i res8 = _mm_shuffle_epi8(q0_7, horizontal_shuf_mask); + + __m128i pix21 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 11] - 1) + 2)])); + pix21 = _mm_insert_epi32(pix21, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 10] + 2)]), 0); + + __m128i pix22 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 11] + 2)])); + pix22 = _mm_insert_epi32(pix22, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 10] + 1) + 2)]), 0); + + val_0 = _mm_unpacklo_epi8(pix21, zero); + val_1 = _mm_unpacklo_epi8(pix22, zero); + + val_2 = _mm_unpackhi_epi8(pix21, zero); + val_3 = _mm_unpackhi_epi8(pix22, zero); + + // the main 
calculations + __m128i t0_8 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1011); + __m128i t1_8 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1111); + __m128i r0_8 = _mm_add_epi16(val_1, t0_8); + __m128i r1_8 = _mm_add_epi16(val_3, t1_8); + + // pack 16-bit data to 8-bit + __m128i q0_8 = _mm_packus_epi16(r0_8, r1_8); + __m128i res9 = _mm_shuffle_epi8(q0_8, horizontal_shuf_mask); + + __m128i pix23 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 12])])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix23 = _mm_insert_epi32(pix23, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 13])]), 3); + + // load 3 channels of neighbor pixel from first pair of 4-couple scope + __m128i pix24 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 12] + 1))])); + // insert first channel from next couple of pixels to completely fill the simd vector + pix24 = _mm_insert_epi32(pix24, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 13] + 1))]), 3); + + // expand 8-bit data to 16-bit + val_0 = _mm_unpacklo_epi8(pix23, zero); + val_1 = _mm_unpacklo_epi8(pix24, zero); + + // expand 8-bit data to 16-bit + val_2 = _mm_unpackhi_epi8(pix23, zero); + val_3 = _mm_unpackhi_epi8(pix24, zero); + + // the main calculations + __m128i t0_9 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1212); + __m128i t1_9 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1213); + __m128i r0_9 = _mm_add_epi16(val_1, t0_9); + __m128i r1_9 = _mm_add_epi16(val_3, t1_9); + + // pack 16-bit data to 8-bit + __m128i q0_9 = _mm_packus_epi16(r0_9, r1_9); + // gather data from the same lines together + __m128i res10 = _mm_shuffle_epi8(q0_9, horizontal_shuf_mask); + + val_0 = _mm_unpacklo_epi8(_mm_insert_epi64(val_0, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 13] + 1)]), 0), zero); + val_1 = _mm_unpacklo_epi8(_mm_insert_epi64(val_1, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 13] + 1) + 1)]), 0), zero); + + val_2 = _mm_insert_epi64(val_2, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 14])]), 0); + val_3 = _mm_insert_epi64(val_3, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 14] + 1))]), 0); + + val_2 = _mm_unpacklo_epi8(val_2, zero); + val_3 = _mm_unpacklo_epi8(val_3, zero); + + __m128i t0_10 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1313); + __m128i t1_10 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1414); + __m128i r0_10 = _mm_add_epi16(val_1, t0_10); + __m128i r1_10 = _mm_add_epi16(val_3, t1_10); + + __m128i q0_10 = _mm_packus_epi16(r0_10, r1_10); + __m128i res11 = _mm_shuffle_epi8(q0_10, horizontal_shuf_mask); + + __m128i pix27 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 15] - 1) + 2)])); + pix27 = _mm_insert_epi32(pix27, *reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 14] + 2)]), 0); + + __m128i pix28 = _mm_lddqu_si128(reinterpret_cast(&tmp[4 * (chanNum * mapsx[x + 15] + 2)])); + pix28 = _mm_insert_epi32(pix28, *reinterpret_cast(&tmp[4 * (chanNum * (mapsx[x + 14] + 1) + 2)]), 0); + + val_0 = _mm_unpacklo_epi8(pix27, zero); + val_1 = _mm_unpacklo_epi8(pix28, zero); + + val_2 = _mm_unpackhi_epi8(pix27, zero); + val_3 = _mm_unpackhi_epi8(pix28, zero); + + // the main calculations + __m128i t0_11 = _mm_mulhrs_epi16(_mm_sub_epi16(val_0, val_1), a1415); + __m128i t1_11 = _mm_mulhrs_epi16(_mm_sub_epi16(val_2, val_3), a1515); + __m128i r0_11 = _mm_add_epi16(val_1, t0_11); + __m128i r1_11 = _mm_add_epi16(val_3, t1_11); + + // pack 16-bit data to 8-bit + __m128i q0_11 = _mm_packus_epi16(r0_11, r1_11); + __m128i res12 = 
_mm_shuffle_epi8(q0_11, horizontal_shuf_mask);
+
+                __m128i bl7 = _mm_blend_epi16(res7, _mm_slli_si128(res8, 4), 0xCC /*0b11001100*/);
+                __m128i bl8 = _mm_blend_epi16(_mm_srli_si128(res7, 4), res8, 0xCC /*0b11001100*/);
+
+                __m128i bl9 = _mm_blend_epi16(res9, _mm_slli_si128(res10, 4), 0xCC /*0b11001100*/);
+                __m128i bl10 = _mm_blend_epi16(_mm_srli_si128(res9, 4), res10, 0xCC /*0b11001100*/);
+
+                __m128i bl11 = _mm_blend_epi16(res11, _mm_slli_si128(res12, 4), 0xCC /*0b11001100*/);
+                __m128i bl12 = _mm_blend_epi16(_mm_srli_si128(res11, 4), res12, 0xCC /*0b11001100*/);
+
+                __m128i bl57 = _mm_blend_epi16(bl5, _mm_slli_si128(bl7, 8), 0xF0 /*0b11110000*/);
+                __m128i bl75 = _mm_blend_epi16(_mm_srli_si128(bl5, 8), bl7, 0xF0 /*0b11110000*/);
+
+                __m128i bl68 = _mm_blend_epi16(bl6, _mm_slli_si128(bl8, 8), 0xF0 /*0b11110000*/);
+                __m128i bl86 = _mm_blend_epi16(_mm_srli_si128(bl6, 8), bl8, 0xF0 /*0b11110000*/);
+
+                __m128i bl911 = _mm_blend_epi16(bl9, _mm_slli_si128(bl11, 8), 0xF0 /*0b11110000*/);
+                __m128i bl119 = _mm_blend_epi16(_mm_srli_si128(bl9, 8), bl11, 0xF0 /*0b11110000*/);
+
+                __m128i bl1012 = _mm_blend_epi16(bl10, _mm_slli_si128(bl12, 8), 0xF0 /*0b11110000*/);
+                __m128i bl1210 = _mm_blend_epi16(_mm_srli_si128(bl10, 8), bl12, 0xF0 /*0b11110000*/);
+
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x]), bl13);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x]), bl24);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x]), bl31);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x]), bl42);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x + 16]), bl57);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x + 16]), bl68);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x + 16]), bl75);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x + 16]), bl86);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[0][3 * x + 32]), bl911);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[1][3 * x + 32]), bl1012);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[2][3 * x + 32]), bl119);
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(&dst[3][3 * x + 32]), bl1210);
+            }
+
+            if (x < outSz.width) {
+                x = outSz.width - nlanes;
+                continue;
+            }
+            break;
+        }
+        }
+        else
+        {  // any other LPI count
+            for (int l = 0; l < lpi; ++l) {
+                short beta0 = beta[l];
+                const uchar* s0 = src0[l];
+                const uchar* s1 = src1[l];
+
+                // vertical pass
+                resize_vertical_anyLPI(s0, s1, tmp, inLength, beta0);
+
+                // horizontal pass
+                resize_horizontal_anyLPI(dst[l], tmp, mapsx, alpha, outSz.width);
+            }
+        }
+    } else if (!xRatioEq) {
+        GAPI_DbgAssert(yRatioEq);
+
+        for (int l = 0; l < lpi; ++l) {
+            const uchar* src = src0[l];
+
+            // horizontal pass
+            resize_horizontal_anyLPI(dst[l], src, mapsx, alpha, outSz.width);
+        }
+    } else if (!yRatioEq) {
+        GAPI_DbgAssert(xRatioEq);
+        int inLength = inSz.width * chanNum;  // == outSz.width
+
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+            const uchar* s0 = src0[l];
+            const uchar* s1 = src1[l];
+
+            // vertical pass
+            resize_vertical_anyLPI(s0, s1, dst[l], inLength, beta0);
+        }
+    } else {
+        GAPI_DbgAssert(xRatioEq && yRatioEq);
+        int length = inSz.width * chanNum;
+
+        for (int l = 0; l < lpi; ++l) {
+            memcpy(dst[l], src0[l], length);
+        }
+    }
+}
+} // namespace sse42
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+#endif // !defined(GAPI_STANDALONE)
diff --git a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
index 4434f6ebe2..d7d4abc46a 100644
--- 
a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
+++ b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
@@ -393,7 +393,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestFluid, ResizeTest,
                                 cv::Size(30, 30)),
                         Values(-1),
                         Values(CORE_FLUID),
-                        Values(AbsExact().to_compare_obj()),
+                        Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_obj()),
                         Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/),
                         Values(cv::Size(1280, 720),
                                cv::Size(640, 480),
@@ -410,7 +410,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestFxFyFluid, ResizeTestFxFy,
                                 cv::Size(30, 30)),
                         Values(-1),
                         Values(CORE_FLUID),
-                        Values(AbsExact().to_compare_obj()),
+                        Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_obj()),
                         Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/),
                         Values(0.5, 1, 2),
                         Values(0.5, 1, 2)));
diff --git a/modules/gapi/test/gapi_fluid_resize_test.cpp b/modules/gapi/test/gapi_fluid_resize_test.cpp
index a0d30c634b..7bac668b7c 100644
--- a/modules/gapi/test/gapi_fluid_resize_test.cpp
+++ b/modules/gapi/test/gapi_fluid_resize_test.cpp
@@ -8,6 +8,7 @@
 #include "test_precomp.hpp"
 #include "gapi_fluid_test_kernels.hpp"
+#include "common/gapi_tests_common.hpp"
 
 namespace opencv_test
 {
@@ -749,8 +750,7 @@ TEST_P(NV12PlusResizeTest, Test)
     cv::Mat rgb_mat;
     cv::cvtColor(in_mat, rgb_mat, cv::COLOR_YUV2RGB_NV12);
     cv::resize(rgb_mat, out_mat_ocv, out_sz, 0, 0, interp);
-
-    EXPECT_EQ(0, cvtest::norm(out_mat(roi), out_mat_ocv(roi), NORM_INF));
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat(roi), out_mat_ocv(roi)));
 }
 
 INSTANTIATE_TEST_CASE_P(Fluid, NV12PlusResizeTest,
diff --git a/modules/gapi/test/internal/gapi_int_recompilation_test.cpp b/modules/gapi/test/internal/gapi_int_recompilation_test.cpp
index 49cedad173..e4171c5df0 100644
--- a/modules/gapi/test/internal/gapi_int_recompilation_test.cpp
+++ b/modules/gapi/test/internal/gapi_int_recompilation_test.cpp
@@ -6,6 +6,7 @@
 
 #include "../test_precomp.hpp"
+#include "../common/gapi_tests_common.hpp"
 #include "api/gcomputation_priv.hpp"
 
 #include
@@ -115,9 +116,10 @@ TEST(GComputationCompile, FluidReshapeResizeDownScale)
     cv::Mat cv_out_mat1, cv_out_mat2;
     cv::resize(in_mat1, cv_out_mat1, szOut);
     cv::resize(in_mat2, cv_out_mat2, szOut);
-
-    EXPECT_EQ(0, cvtest::norm(out_mat1, cv_out_mat1, NORM_INF));
-    EXPECT_EQ(0, cvtest::norm(out_mat2, cv_out_mat2, NORM_INF));
+    // Fluid's and OpenCV's resizes aren't bit-exact, so a tolerance of 1
+    // (the maximum difference between them) is used here.
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat1, cv_out_mat1));
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat2, cv_out_mat2));
 }
 
 TEST(GComputationCompile, FluidReshapeSwitchToUpscaleFromDownscale)
@@ -150,10 +152,11 @@
     cv::resize(in_mat1, cv_out_mat1, szOut);
     cv::resize(in_mat2, cv_out_mat2, szOut);
     cv::resize(in_mat3, cv_out_mat3, szOut);
-
-    EXPECT_EQ(0, cvtest::norm(out_mat1, cv_out_mat1, NORM_INF));
-    EXPECT_EQ(0, cvtest::norm(out_mat2, cv_out_mat2, NORM_INF));
-    EXPECT_EQ(0, cvtest::norm(out_mat3, cv_out_mat3, NORM_INF));
+    // Fluid's and OpenCV's resizes aren't bit-exact, so a tolerance of 1
+    // (the maximum difference between them) is used here.
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat1, cv_out_mat1));
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat2, cv_out_mat2));
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat3, cv_out_mat3));
 }
 
 TEST(GComputationCompile, ReshapeBlur)
@@ -224,8 +227,9 @@ TEST(GComputationCompile, ReshapeRois)
     cv::Mat blur_mat, cv_out_mat;
     cv::blur(in_mat, blur_mat, kernelSize);
     cv::resize(blur_mat, cv_out_mat, szOut);
-
-    EXPECT_EQ(0, cvtest::norm(out_mat(roi), cv_out_mat(roi), NORM_INF));
+    // Fluid's and OpenCV's resizes aren't bit-exact, so a tolerance of 1
+    // (the maximum difference between them) is used here.
+    EXPECT_TRUE(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()(out_mat(roi), cv_out_mat(roi)));
 }
 
 }
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index 57e061861c..9eb9470dd7 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -353,7 +353,9 @@ TEST_P(GAPI_Streaming, SmokeTest_ConstInput_GMat)
         // With constant inputs, the stream is endless so
         // the blocking pull() should never return `false`.
         EXPECT_TRUE(ccomp.pull(cv::gout(out_mat_gapi)));
-        EXPECT_EQ(0, cvtest::norm(out_mat_gapi, out_mat_ocv, NORM_INF));
+        // Fluid's and OpenCV's resizes aren't bit-exact,
+        // so up to 1% of the output pixels are allowed to differ here.
+        EXPECT_TRUE(AbsSimilarPoints(0, 1).to_compare_f()(out_mat_gapi, out_mat_ocv));
     }
 
     EXPECT_TRUE(ccomp.running());
@@ -405,7 +407,9 @@ TEST_P(GAPI_Streaming, SmokeTest_VideoInput_GMat)
         frames++;
         cv::Mat out_mat_ocv;
         opencv_ref(in_mat_gapi, out_mat_ocv);
-        EXPECT_EQ(0, cvtest::norm(out_mat_gapi, out_mat_ocv, NORM_INF));
+        // Fluid's and OpenCV's resizes aren't bit-exact,
+        // so up to 1% of the output pixels are allowed to differ here.
+        EXPECT_TRUE(AbsSimilarPoints(0, 1).to_compare_f()(out_mat_gapi, out_mat_ocv));
     }
     EXPECT_LT(0u, frames);
     EXPECT_FALSE(ccomp.running());
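
Note on the fixed-point scheme the new Fluid resize uses: Mapper::map turns the fractional source coordinate f into two Q15 weights, alpha0 = ONE*(1-f) and alpha1 = ONE*f with ONE = 1<<15, and calc() blends the two taps with rounding. Below is a minimal standalone sketch of that arithmetic; blend_q15 and the sample values are illustrative names/data chosen here, and only the math mirrors calc() and Mapper::map from gfluidcore.cpp above.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Q15 blend with rounding, as in calc(): (s0*a0 + s1*a1 + 2^14) >> 15
static uint8_t blend_q15(short a0, uint8_t s0, short a1, uint8_t s1) {
    const int half = 1 << 14;
    return static_cast<uint8_t>((s0 * a0 + s1 * a1 + half) >> 15);
}

int main() {
    const int ONE = 1 << 15;                        // Mapper::unity
    const double ratio = 4.0 / 3.0;                 // inSz.width / outSz.width
    const int x = 1;                                // output coordinate
    float f = static_cast<float>((x + 0.5) * ratio - 0.5);
    const int s = static_cast<int>(std::floor(f));  // left tap index: s == 1
    f -= s;                                         // fractional part: f == 0.5
    // The kernel uses saturate_cast<short>, which clamps ONE*(1-f) at f == 0:
    const short a0 = static_cast<short>(ONE * (1.0f - f));
    const short a1 = static_cast<short>(ONE * f);
    const uint8_t row[4] = {10, 20, 30, 40};
    // Equal weights blend row[1] and row[2]: prints 25
    std::printf("%d\n", blend_q15(a0, row[s], a1, row[s + 1]));
    return 0;
}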