From 7366eebebb61a879803be293f7a7b63011d1e54a Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 5 Mar 2019 16:25:38 +0300 Subject: [PATCH 01/21] core: fix condition in OutputArray::create(allowTransposed=True) --- modules/core/src/matrix_wrap.cpp | 42 ++++++++++++-------------------- modules/core/test/test_misc.cpp | 7 ++++++ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp index e64d097aad..1f5d861cdd 100644 --- a/modules/core/src/matrix_wrap.cpp +++ b/modules/core/src/matrix_wrap.cpp @@ -1287,17 +1287,12 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i, { CV_Assert( i < 0 ); Mat& m = *(Mat*)obj; - if( allowTransposed ) + if (allowTransposed && !m.empty() && + d == 2 && m.dims == 2 && + m.type() == mtype && m.rows == sizes[1] && m.cols == sizes[0] && + m.isContinuous()) { - if( !m.isContinuous() ) - { - CV_Assert(!fixedType() && !fixedSize()); - m.release(); - } - - if( d == 2 && m.dims == 2 && m.data && - m.type() == mtype && m.rows == sizes[1] && m.cols == sizes[0] ) - return; + return; } if(fixedType()) @@ -1305,13 +1300,13 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i, if(CV_MAT_CN(mtype) == m.channels() && ((1 << CV_MAT_TYPE(flags)) & fixedDepthMask) != 0 ) mtype = m.type(); else - CV_Assert(CV_MAT_TYPE(mtype) == m.type()); + CV_CheckTypeEQ(m.type(), CV_MAT_TYPE(mtype), ""); } if(fixedSize()) { - CV_Assert(m.dims == d); + CV_CheckEQ(m.dims, d, ""); for(int j = 0; j < d; ++j) - CV_Assert(m.size[j] == sizes[j]); + CV_CheckEQ(m.size[j], sizes[j], ""); } m.create(d, sizes, mtype); return; @@ -1321,17 +1316,12 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i, { CV_Assert( i < 0 ); UMat& m = *(UMat*)obj; - if( allowTransposed ) + if (allowTransposed && !m.empty() && + d == 2 && m.dims == 2 && + m.type() == mtype && m.rows == sizes[1] && m.cols == sizes[0] && + m.isContinuous()) { - if( !m.isContinuous() ) 
- { - CV_Assert(!fixedType() && !fixedSize()); - m.release(); - } - - if( d == 2 && m.dims == 2 && !m.empty() && - m.type() == mtype && m.rows == sizes[1] && m.cols == sizes[0] ) - return; + return; } if(fixedType()) @@ -1339,13 +1329,13 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i, if(CV_MAT_CN(mtype) == m.channels() && ((1 << CV_MAT_TYPE(flags)) & fixedDepthMask) != 0 ) mtype = m.type(); else - CV_Assert(CV_MAT_TYPE(mtype) == m.type()); + CV_CheckTypeEQ(m.type(), CV_MAT_TYPE(mtype), ""); } if(fixedSize()) { - CV_Assert(m.dims == d); + CV_CheckEQ(m.dims, d, ""); for(int j = 0; j < d; ++j) - CV_Assert(m.size[j] == sizes[j]); + CV_CheckEQ(m.size[j], sizes[j], ""); } m.create(d, sizes, mtype); return; diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp index b98ec4ed34..46f10b694c 100644 --- a/modules/core/test/test_misc.cpp +++ b/modules/core/test/test_misc.cpp @@ -177,6 +177,13 @@ TEST(Core_OutputArray, FixedType) EXPECT_EQ(2, num_defaultResult); } +TEST(Core_OutputArrayCreate, _13772) +{ + cv::Mat1d mat; + cv::OutputArray o(mat); + ASSERT_NO_THROW(o.create(3, 5, CV_64F, -1, true)); +} + TEST(Core_String, find_last_of__with__empty_string) From 74574dfae47711405a126971c87fb142b042f740 Mon Sep 17 00:00:00 2001 From: Alexander Nesterov Date: Fri, 8 Feb 2019 13:12:33 -0100 Subject: [PATCH 02/21] Added optimization fuse --- modules/dnn/src/layers/convolution_layer.cpp | 90 +++++++++++++++----- modules/dnn/test/test_backends.cpp | 9 +- modules/dnn/test/test_torch_importer.cpp | 8 ++ 3 files changed, 86 insertions(+), 21 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 90645d531f..b872130ccd 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -61,6 +61,8 @@ namespace dnn class BaseConvolutionLayerImpl : public ConvolutionLayer { public: + bool newWeightAndBias; + std::vector 
weightsMultipliers; BaseConvolutionLayerImpl(const LayerParams ¶ms) { setParamsFrom(params); @@ -84,6 +86,8 @@ public: CV_Assert(numOutput % ngroups == 0); CV_Assert(adjustPad.width < stride.width && adjustPad.height < stride.height); + + newWeightAndBias = false; } void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE @@ -134,6 +138,20 @@ public: (dilation.height == 1 && dilation.width == 1); } + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Mat w, b; + top->getScaleShift(w, b); + if (!w.empty() || !b.empty()) + { + fuseWeights(w, b); + return true; + } + return false; + } + + virtual void fuseWeights(const Mat& w_, const Mat& b_) = 0; + virtual void applyHalideScheduler(Ptr& node, const std::vector &inputs, const std::vector &outputs, @@ -184,11 +202,9 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl public: enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F }; Mat weightsMat; - std::vector weightsMultipliers; std::vector biasvec; std::vector reluslope; Ptr activ; - bool newWeightAndBias; bool fusedBias; #ifdef HAVE_OPENCL @@ -200,7 +216,6 @@ public: #endif ConvolutionLayerImpl(const LayerParams ¶ms) : BaseConvolutionLayerImpl(params) { - newWeightAndBias = false; fusedBias = false; #ifdef HAVE_OPENCL newActiv = false; @@ -346,19 +361,7 @@ public: return !activ.empty(); } - virtual bool tryFuse(Ptr& top) CV_OVERRIDE - { - Mat w, b; - top->getScaleShift(w, b); - if (!w.empty() || !b.empty()) - { - fuseWeights(w, b); - return true; - } - return false; - } - - void fuseWeights(const Mat& w_, const Mat& b_) + void fuseWeights(const Mat& w_, const Mat& b_) CV_OVERRIDE { // Convolution weights have OIHW data layout. Parameters fusion in case of // (conv(I) + b1 ) * w + b2 @@ -1238,6 +1241,45 @@ public: pad.width = pad_l; pad.height = pad_t; + + weightsMultipliers.assign(numOutput, 1.0); + if (weightsMat.empty()) + { + transpose(blobs[0].reshape(1, blobs[0].size[0]), weightsMat); + biasesMat = hasBias() ? 
blobs[1].reshape(1, numOutput) + : Mat::zeros(numOutput, 1, CV_32F); + } + } + + void fuseWeights(const Mat& w_, const Mat& b_) CV_OVERRIDE + { + Mat w = w_.total() == 1 ? Mat(1, numOutput, CV_32F, Scalar(w_.at(0))) : w_; + Mat b = b_.total() == 1 ? Mat(1, numOutput, CV_32F, Scalar(b_.at(0))) : b_; + + CV_Assert_N(!weightsMat.empty(), + w.empty() || numOutput == w.total(), + b.empty() || numOutput == b.total()); + + if (!w.empty()) + { + transpose(blobs[0].reshape(1, blobs[0].size[0]), weightsMat); + weightsMat = weightsMat.reshape(1, numOutput); + for (int i = 0; i < numOutput; ++i) + { + double wi = w.at(i); + weightsMultipliers[i] *= wi; + cv::multiply(weightsMat.row(i), weightsMultipliers[i], weightsMat.row(i)); + biasesMat.at(i) *= wi; + } + weightsMat = weightsMat.reshape(1, weightsMat.total() / blobs[0].size[0]); + } + + if (!b.empty()) + { + cv::add(biasesMat, b.reshape(1, numOutput), biasesMat); + } + + newWeightAndBias = !w.empty() || !b.empty(); } class MatMulInvoker : public ParallelLoopBody @@ -1505,11 +1547,19 @@ public: if (umat_weights.empty()) { - transpose(blobs[0].reshape(1, inpCn), umat_weights); - if (hasBias()) - blobs[1].reshape(1, outCn).copyTo(umat_biases); + if (newWeightAndBias) + { + weightsMat.copyTo(umat_weights); + biasesMat.copyTo(umat_biases); + } else - umat_biases = UMat::zeros(outCn, 1, CV_32F); + { + transpose(blobs[0].reshape(1, inpCn), umat_weights); + if (hasBias()) + blobs[1].reshape(1, outCn).copyTo(umat_biases); + else + umat_biases = UMat::zeros(outCn, 1, CV_32F); + } } String buildopt = format("-DT=%s ", ocl::typeToStr(inputs[0].type())); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 8485bbedad..10f7b02e11 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -305,9 +305,16 @@ TEST_P(DNNTestNetwork, DenseNet_121) TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) { if (backend == DNN_BACKEND_HALIDE || - (backend == DNN_BACKEND_OPENCV && target 
== DNN_TARGET_OPENCL_FP16) || (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)) throw SkipTestException(""); + +#if defined(INF_ENGINE_RELEASE) +#if INF_ENGINE_RELEASE <= 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) + throw SkipTestException(""); +#endif +#endif + Mat img = imread(findDataFile("dnn/googlenet_1.png", false)); Mat inp = blobFromImage(img, 1.0, Size(320, 240), Scalar(103.939, 116.779, 123.68), false, false); // Output image has values in range [-143.526, 148.539]. diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index 4e00b10279..11e6ee49e8 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -394,6 +394,14 @@ TEST_P(Test_Torch_nets, ENet_accuracy) TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy) { checkBackend(); + +#if defined(INF_ENGINE_RELEASE) +#if INF_ENGINE_RELEASE <= 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) + throw SkipTestException(""); +#endif +#endif + std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7", "dnn/fast_neural_style_instance_norm_feathers.t7"}; std::string targets[] = {"dnn/lena_starry_night.png", "dnn/lena_feathers.png"}; From d5a2fe51807a5e9343502de71096f5712bc88af8 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 6 Mar 2019 15:52:23 +0300 Subject: [PATCH 03/21] perf: ignore _ovx tests --- modules/imgproc/perf/perf_filter2d.cpp | 2 +- modules/imgproc/perf/perf_pyramids.cpp | 2 +- modules/imgproc/perf/perf_warp.cpp | 4 ++-- modules/video/perf/perf_optflowpyrlk.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/imgproc/perf/perf_filter2d.cpp b/modules/imgproc/perf/perf_filter2d.cpp index 6095eaad2f..249afd94c4 100644 --- a/modules/imgproc/perf/perf_filter2d.cpp +++ b/modules/imgproc/perf/perf_filter2d.cpp @@ -39,7 +39,7 @@ PERF_TEST_P( TestFilter2d, Filter2d, 
SANITY_CHECK(dst, 1); } -PERF_TEST_P(TestFilter2d, Filter2d_ovx, +PERF_TEST_P(TestFilter2d, DISABLED_Filter2d_ovx, Combine( Values(Size(320, 240), sz1080p), Values(3, 5), diff --git a/modules/imgproc/perf/perf_pyramids.cpp b/modules/imgproc/perf/perf_pyramids.cpp index 22c3fa9064..5dd5b9cece 100644 --- a/modules/imgproc/perf/perf_pyramids.cpp +++ b/modules/imgproc/perf/perf_pyramids.cpp @@ -26,7 +26,7 @@ PERF_TEST_P(Size_MatType, pyrDown, testing::Combine( SANITY_CHECK(dst, eps, error_type); } -PERF_TEST_P(Size_MatType, pyrDown_ovx, testing::Combine( +PERF_TEST_P(Size_MatType, DISABLED_pyrDown_ovx, testing::Combine( testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16SC1, CV_16SC3, CV_16SC4, CV_32FC1, CV_32FC3, CV_32FC4) ) diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp index b51e9ae75c..53bacc16d3 100644 --- a/modules/imgproc/perf/perf_warp.cpp +++ b/modules/imgproc/perf/perf_warp.cpp @@ -48,7 +48,7 @@ PERF_TEST_P( TestWarpAffine, WarpAffine, #endif } -PERF_TEST_P(TestWarpAffine, WarpAffine_ovx, +PERF_TEST_P(TestWarpAffine, DISABLED_WarpAffine_ovx, Combine( Values(szVGA, sz720p, sz1080p), InterType::all(), @@ -116,7 +116,7 @@ PERF_TEST_P( TestWarpPerspective, WarpPerspective, #endif } -PERF_TEST_P(TestWarpPerspective, WarpPerspective_ovx, +PERF_TEST_P(TestWarpPerspective, DISABLED_WarpPerspective_ovx, Combine( Values(szVGA, sz720p, sz1080p), InterType::all(), diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp index 3a58ca9067..528512bffc 100644 --- a/modules/video/perf/perf_optflowpyrlk.cpp +++ b/modules/video/perf/perf_optflowpyrlk.cpp @@ -97,7 +97,7 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize, OpticalFlowPyrLK_full, testing::Combine( typedef tuple, int> Path_Idx_NPoints_WSize_t; typedef TestBaseWithParam Path_Idx_NPoints_WSize; -PERF_TEST_P(Path_Idx_NPoints_WSize, OpticalFlowPyrLK_ovx, testing::Combine( 
+PERF_TEST_P(Path_Idx_NPoints_WSize, DISABLED_OpticalFlowPyrLK_ovx, testing::Combine( testing::Values("cv/optflow/frames/VGA_%02d.png", "cv/optflow/frames/720p_%02d.png"), testing::Range(1, 3), testing::Values(make_tuple(9, 9), make_tuple(15, 15)), From 796b0fec7d9005d70c5f5b1b970780c28f83dca8 Mon Sep 17 00:00:00 2001 From: "Christopher N. Hesse" Date: Wed, 6 Mar 2019 17:14:59 +0100 Subject: [PATCH 04/21] videoio: gst: Fix gst assertion on null msg According to the gstreamer docs [1], the GstMessage pointer returned by gst_bus_pop() is nullable, meaning NULL is a valid return value. Previously, gst_is_missing_plugin_message would throw an assert when its message object parameter would fail the GST_IS_MESSAGE macro check, crashing the entire process (unless running in a try-catch block of course). Instead of relying on valid messages, check if the message object itself is valid before passing it to other gstreamer functions. [1] https://gstreamer.freedesktop.org/data/doc/gstreamer/head/gstreamer/html/GstBus.html#gst-bus-pop Signed-off-by: Christopher N. Hesse --- modules/videoio/src/cap_gstreamer.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp index 23d39d247a..95ff159cc3 100644 --- a/modules/videoio/src/cap_gstreamer.cpp +++ b/modules/videoio/src/cap_gstreamer.cpp @@ -1811,6 +1811,10 @@ void handleMessage(GstElement * pipeline) while(gst_bus_have_pending(bus)) { msg = gst_bus_pop(bus); + if (!msg || !GST_IS_MESSAGE(msg)) + { + continue; + } //printf("\t\tGot %s message\n", GST_MESSAGE_TYPE_NAME(msg)); From fcfb29766be71f6339c691eaeb8b55f079bfca7b Mon Sep 17 00:00:00 2001 From: Easton Liu Date: Thu, 7 Mar 2019 09:55:48 +0800 Subject: [PATCH 05/21] Add ability to read thresh and nms_threshold from YOLO layer in YOLOV3 cfg file. Currently the thresh is hard-coded to be 0.2 and nms_threshold as 0.4. 
--- modules/dnn/src/darknet/darknet_io.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index 815b84f651..54a53fd867 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -371,7 +371,7 @@ namespace cv { fused_layer_names.push_back(last_layer); } - void setYolo(int classes, const std::vector& mask, const std::vector& anchors) + void setYolo(int classes, const std::vector& mask, const std::vector& anchors, float thresh, float nms_threshold) { cv::dnn::LayerParams region_param; region_param.name = "Region-name"; @@ -382,6 +382,8 @@ namespace cv { region_param.set("classes", classes); region_param.set("anchors", numAnchors); region_param.set("logistic", true); + region_param.set("thresh", thresh); + region_param.set("nms_threshold", nms_threshold); std::vector usedAnchors(numAnchors * 2); for (int i = 0; i < numAnchors; ++i) @@ -646,6 +648,8 @@ namespace cv { { int classes = getParam(layer_params, "classes", -1); int num_of_anchors = getParam(layer_params, "num", -1); + float thresh = getParam(layer_params, "thresh", 0.2); + float nms_threshold = getParam(layer_params, "nms_threshold", 0.4); std::string anchors_values = getParam(layer_params, "anchors", std::string()); CV_Assert(!anchors_values.empty()); @@ -658,7 +662,7 @@ namespace cv { CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size()); setParams.setPermute(false); - setParams.setYolo(classes, mask_vec, anchors_vec); + setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold); } else { CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type); From db588bb831cb62813a14b81ee0080d63431c4a58 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 6 Mar 2019 17:59:27 +0300 Subject: [PATCH 06/21] imgproc: clone color*.simd.hpp --- modules/imgproc/src/{color.hpp => color.simd_helpers.hpp} | 0 
modules/imgproc/src/{color_hsv.cpp => color_hsv.simd.hpp} | 0 modules/imgproc/src/{color_rgb.cpp => color_rgb.simd.hpp} | 0 modules/imgproc/src/{color_yuv.cpp => color_yuv.simd.hpp} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename modules/imgproc/src/{color.hpp => color.simd_helpers.hpp} (100%) rename modules/imgproc/src/{color_hsv.cpp => color_hsv.simd.hpp} (100%) rename modules/imgproc/src/{color_rgb.cpp => color_rgb.simd.hpp} (100%) rename modules/imgproc/src/{color_yuv.cpp => color_yuv.simd.hpp} (100%) diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.simd_helpers.hpp similarity index 100% rename from modules/imgproc/src/color.hpp rename to modules/imgproc/src/color.simd_helpers.hpp diff --git a/modules/imgproc/src/color_hsv.cpp b/modules/imgproc/src/color_hsv.simd.hpp similarity index 100% rename from modules/imgproc/src/color_hsv.cpp rename to modules/imgproc/src/color_hsv.simd.hpp diff --git a/modules/imgproc/src/color_rgb.cpp b/modules/imgproc/src/color_rgb.simd.hpp similarity index 100% rename from modules/imgproc/src/color_rgb.cpp rename to modules/imgproc/src/color_rgb.simd.hpp diff --git a/modules/imgproc/src/color_yuv.cpp b/modules/imgproc/src/color_yuv.simd.hpp similarity index 100% rename from modules/imgproc/src/color_yuv.cpp rename to modules/imgproc/src/color_yuv.simd.hpp From f26912960f1a5293f267d75080706d8b520f51f4 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 6 Mar 2019 18:00:43 +0300 Subject: [PATCH 07/21] imgproc: clone color*.dispatch.cpp --- modules/imgproc/src/{color_hsv.cpp => color_hsv.dispatch.cpp} | 0 modules/imgproc/src/{color_rgb.cpp => color_rgb.dispatch.cpp} | 0 modules/imgproc/src/{color_yuv.cpp => color_yuv.dispatch.cpp} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename modules/imgproc/src/{color_hsv.cpp => color_hsv.dispatch.cpp} (100%) rename modules/imgproc/src/{color_rgb.cpp => color_rgb.dispatch.cpp} (100%) rename modules/imgproc/src/{color_yuv.cpp => 
color_yuv.dispatch.cpp} (100%) diff --git a/modules/imgproc/src/color_hsv.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp similarity index 100% rename from modules/imgproc/src/color_hsv.cpp rename to modules/imgproc/src/color_hsv.dispatch.cpp diff --git a/modules/imgproc/src/color_rgb.cpp b/modules/imgproc/src/color_rgb.dispatch.cpp similarity index 100% rename from modules/imgproc/src/color_rgb.cpp rename to modules/imgproc/src/color_rgb.dispatch.cpp diff --git a/modules/imgproc/src/color_yuv.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp similarity index 100% rename from modules/imgproc/src/color_yuv.cpp rename to modules/imgproc/src/color_yuv.dispatch.cpp From 8b541e450b511fde9dd363fa55a30fbb6fc0ace6 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 7 Mar 2019 13:25:37 +0300 Subject: [PATCH 08/21] imgproc: dispatch color* Lab/XYZ modes have been postponed (color_lab.cpp): - need to split code for tables initialization and for pixels processing first - no significant performance improvements for switching between SSE42 / AVX2 code generation --- .../include/opencv2/core/cv_cpu_dispatch.h | 4 + modules/core/include/opencv2/core/private.hpp | 4 +- modules/imgproc/CMakeLists.txt | 3 + modules/imgproc/src/color.cpp | 1 + modules/imgproc/src/color.hpp | 129 +- modules/imgproc/src/color.simd_helpers.hpp | 538 +---- modules/imgproc/src/color_hsv.dispatch.cpp | 1233 +---------- modules/imgproc/src/color_hsv.simd.hpp | 352 +-- modules/imgproc/src/color_lab.cpp | 4 + modules/imgproc/src/color_rgb.dispatch.cpp | 1101 +--------- modules/imgproc/src/color_rgb.simd.hpp | 552 +---- modules/imgproc/src/color_yuv.dispatch.cpp | 1878 +---------------- modules/imgproc/src/color_yuv.simd.hpp | 364 +--- 13 files changed, 238 insertions(+), 5925 deletions(-) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 08909f8b28..7f6d6b0fb9 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ 
b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -124,6 +124,10 @@ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX struct VZeroUpperGuard { +#ifdef __GNUC__ + __attribute__((always_inline)) +#endif + inline VZeroUpperGuard() { _mm256_zeroupper(); } #ifdef __GNUC__ __attribute__((always_inline)) #endif diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 1ea8c28643..c3f5b87267 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -796,9 +796,9 @@ CV_EXPORTS InstrNode* getCurrentNode(); #endif #ifdef __CV_AVX_GUARD -#define CV_INSTRUMENT_REGION(); __CV_AVX_GUARD CV_INSTRUMENT_REGION_(); +#define CV_INSTRUMENT_REGION() __CV_AVX_GUARD CV_INSTRUMENT_REGION_(); #else -#define CV_INSTRUMENT_REGION(); CV_INSTRUMENT_REGION_(); +#define CV_INSTRUMENT_REGION() CV_INSTRUMENT_REGION_(); #endif namespace cv { diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 1caadbbbad..6232aa5fab 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,3 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 38d35c014d..8f268e07e0 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3,6 +3,7 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" #include "color.hpp" namespace cv diff --git a/modules/imgproc/src/color.hpp b/modules/imgproc/src/color.hpp index 70e7844277..8c1f19fa8a 100644 --- a/modules/imgproc/src/color.hpp +++ 
b/modules/imgproc/src/color.hpp @@ -3,59 +3,17 @@ // of this distribution and at http://opencv.org/license.html #include "opencv2/imgproc.hpp" -#include "opencv2/core/utility.hpp" -#include -#include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" -#include "opencv2/core/hal/intrin.hpp" -#include "opencv2/core/softfloat.hpp" -#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) - -namespace cv -{ - -//constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601 -const float B2YF = 0.114f; -const float G2YF = 0.587f; -const float R2YF = 0.299f; - -enum -{ - yuv_shift = 14, - xyz_shift = 12, - R2Y = 4899, // == R2YF*16384 - G2Y = 9617, // == G2YF*16384 - B2Y = 1868, // == B2YF*16384 - BLOCK_SIZE = 256 -}; - -template struct ColorChannel -{ - typedef float worktype_f; - static _Tp max() { return std::numeric_limits<_Tp>::max(); } - static _Tp half() { return (_Tp)(max()/2 + 1); } -}; - -template<> struct ColorChannel -{ - typedef float worktype_f; - static float max() { return 1.f; } - static float half() { return 0.5f; } -}; - -/*template<> struct ColorChannel -{ - typedef double worktype_f; - static double max() { return 1.; } - static double half() { return 0.5; } -};*/ +namespace cv { // // Helper functions // -namespace { +namespace impl { + +#include "color.simd_helpers.hpp" inline bool isHSV(int code) { @@ -209,40 +167,9 @@ inline int uIndex(int code) } } // namespace:: +using namespace impl; -template -struct Set -{ - static bool contains(int i) - { - return (i == i0 || i == i1 || i == i2); - } -}; - -template -struct Set -{ - static bool contains(int i) - { - return (i == i0 || i == i1); - } -}; - -template -struct Set -{ - static bool contains(int i) - { - return (i == i0); - } -}; - -enum SizePolicy -{ - TO_YUV, FROM_YUV, NONE -}; - -template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > +/*template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > struct CvtHelper 
{ CvtHelper(InputArray _src, OutputArray _dst, int dcn) @@ -282,7 +209,7 @@ struct CvtHelper Mat src, dst; int depth, scn; Size dstSz; -}; +};*/ #ifdef HAVE_OPENCL @@ -380,49 +307,7 @@ struct OclHelper #endif -///////////////////////////// Top-level template function //////////////////////////////// -template -class CvtColorLoop_Invoker : public ParallelLoopBody -{ - typedef typename Cvt::channel_type _Tp; -public: - - CvtColorLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt) : - ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), - width(width_), cvt(_cvt) - { - } - - virtual void operator()(const Range& range) const CV_OVERRIDE - { - CV_TRACE_FUNCTION(); - - const uchar* yS = src_data + static_cast(range.start) * src_step; - uchar* yD = dst_data + static_cast(range.start) * dst_step; - - for( int i = range.start; i < range.end; ++i, yS += src_step, yD += dst_step ) - cvt(reinterpret_cast(yS), reinterpret_cast<_Tp*>(yD), width); - } - -private: - const uchar * src_data; - const size_t src_step; - uchar * dst_data; - const size_t dst_step; - const int width; - const Cvt& cvt; - - const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); -}; - -template -void CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - parallel_for_(Range(0, height), - CvtColorLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt), - (width * height) / static_cast(1<<16)); -} #if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) # define NEED_IPP 1 diff --git a/modules/imgproc/src/color.simd_helpers.hpp b/modules/imgproc/src/color.simd_helpers.hpp index 70e7844277..343491f2c6 100644 --- a/modules/imgproc/src/color.simd_helpers.hpp +++ b/modules/imgproc/src/color.simd_helpers.hpp @@ -2,23 +2,14 @@ // It is subject to the license terms in the LICENSE file found in the 
top-level directory // of this distribution and at http://opencv.org/license.html -#include "opencv2/imgproc.hpp" -#include "opencv2/core/utility.hpp" -#include -#include "opencl_kernels_imgproc.hpp" -#include "hal_replacement.hpp" -#include "opencv2/core/hal/intrin.hpp" -#include "opencv2/core/softfloat.hpp" - #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) -namespace cv -{ +namespace { //constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601 -const float B2YF = 0.114f; -const float G2YF = 0.587f; -const float R2YF = 0.299f; +static const float B2YF = 0.114f; +static const float G2YF = 0.587f; +static const float R2YF = 0.299f; enum { @@ -33,15 +24,15 @@ enum template struct ColorChannel { typedef float worktype_f; - static _Tp max() { return std::numeric_limits<_Tp>::max(); } - static _Tp half() { return (_Tp)(max()/2 + 1); } + static inline _Tp max() { return std::numeric_limits<_Tp>::max(); } + static inline _Tp half() { return (_Tp)(max()/2 + 1); } }; template<> struct ColorChannel { typedef float worktype_f; - static float max() { return 1.f; } - static float half() { return 0.5f; } + static inline float max() { return 1.f; } + static inline float half() { return 0.5f; } }; /*template<> struct ColorChannel @@ -51,169 +42,11 @@ template<> struct ColorChannel static double half() { return 0.5; } };*/ -// -// Helper functions -// - -namespace { - -inline bool isHSV(int code) -{ - switch(code) - { - case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: - case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: - return true; - default: - return false; - } -} - -inline bool isLab(int code) -{ - switch (code) - { - case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Lab2LBGR: case COLOR_Lab2LRGB: - case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_LBGR2Lab: case COLOR_LRGB2Lab: - return true; - default: - return false; - } -} - -inline bool is_sRGB(int code) -{ - 
switch (code) - { - case COLOR_BGR2Lab: case COLOR_RGB2Lab: case COLOR_BGR2Luv: case COLOR_RGB2Luv: - case COLOR_Lab2BGR: case COLOR_Lab2RGB: case COLOR_Luv2BGR: case COLOR_Luv2RGB: - return true; - default: - return false; - } -} - -inline bool swapBlue(int code) -{ - switch (code) - { - case COLOR_BGR2BGRA: case COLOR_BGRA2BGR: - case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: - case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: - case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY: - case COLOR_BGR2YCrCb: case COLOR_BGR2YUV: - case COLOR_YCrCb2BGR: case COLOR_YUV2BGR: - case COLOR_BGR2XYZ: case COLOR_XYZ2BGR: - case COLOR_BGR2HSV: case COLOR_BGR2HLS: case COLOR_BGR2HSV_FULL: case COLOR_BGR2HLS_FULL: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2BGRA_IYUV: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2BGRA_NV12: - case COLOR_Lab2BGR: case COLOR_Luv2BGR: case COLOR_Lab2LBGR: case COLOR_Luv2LBGR: - case COLOR_BGR2Lab: case COLOR_BGR2Luv: case COLOR_LBGR2Lab: case COLOR_LBGR2Luv: - case COLOR_HSV2BGR: case COLOR_HLS2BGR: case COLOR_HSV2BGR_FULL: case COLOR_HLS2BGR_FULL: - case COLOR_YUV2BGR_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2BGR_YUY2: - case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2BGRA_YVYU: - case COLOR_BGR2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: case COLOR_BGR2YUV_YV12: case COLOR_BGRA2YUV_YV12: - return false; - default: - return true; - } -} - -inline bool isFullRangeHSV(int code) -{ - switch (code) - { - case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL: - case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL: - return true; - default: - return false; - } -} - -inline int dstChannels(int code) -{ - switch( code ) - { - case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case 
COLOR_BGRA2RGBA: - case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA: - case COLOR_GRAY2BGRA: - case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: - case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: - case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: - case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: - - return 4; - - case COLOR_BGRA2BGR: case COLOR_RGBA2BGR: case COLOR_RGB2BGR: - case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB: - case COLOR_GRAY2BGR: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: - case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: - case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: - - return 3; - - default: - return 0; - } -} - -inline int greenBits(int code) -{ - switch( code ) - { - case COLOR_BGR2BGR565: case COLOR_RGB2BGR565: case COLOR_BGRA2BGR565: case COLOR_RGBA2BGR565: - case COLOR_BGR5652BGR: case COLOR_BGR5652RGB: case COLOR_BGR5652BGRA: case COLOR_BGR5652RGBA: - case COLOR_BGR5652GRAY: case COLOR_GRAY2BGR565: - - return 6; - - case COLOR_BGR2BGR555: case COLOR_RGB2BGR555: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR555: - case COLOR_BGR5552BGR: case COLOR_BGR5552RGB: case COLOR_BGR5552BGRA: case COLOR_BGR5552RGBA: - case COLOR_BGR5552GRAY: case COLOR_GRAY2BGR555: - - return 5; - - default: - return 0; - } -} - -inline int uIndex(int code) -{ - switch( code ) - { - case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12: - - return 2; - - case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU: - 
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV: - case COLOR_YUV2BGR_NV21: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGRA_NV21: case COLOR_YUV2RGBA_NV21: - case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12: - - return 1; - - case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: - case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV: - case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY: - case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: - - return 0; - - default: - return -1; - } -} - -} // namespace:: template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0 || i == i1 || i == i2); } @@ -222,7 +55,7 @@ struct Set template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0 || i == i1); } @@ -231,7 +64,7 @@ struct Set template struct Set { - static bool contains(int i) + static inline bool contains(int i) { return (i == i0); } @@ -284,101 +117,6 @@ struct CvtHelper Size dstSz; }; -#ifdef HAVE_OPENCL - -template< typename VScn, typename VDcn, typename VDepth, SizePolicy sizePolicy = NONE > -struct OclHelper -{ - OclHelper( InputArray _src, OutputArray _dst, int dcn) : - nArgs(0) - { - src = _src.getUMat(); - Size sz = src.size(), dstSz; - int scn = src.channels(); - int depth = src.depth(); - - CV_Assert( VScn::contains(scn) && VDcn::contains(dcn) && VDepth::contains(depth) ); - switch (sizePolicy) - { - case TO_YUV: - CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 ); - dstSz = Size(sz.width, sz.height / 2 * 3); - break; - case FROM_YUV: - CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 ); - dstSz = Size(sz.width, sz.height * 2 / 3); - break; - case NONE: - default: - 
dstSz = sz; - break; - } - - _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - dst = _dst.getUMat(); - } - - bool createKernel(cv::String name, ocl::ProgramSource& source, cv::String options) - { - ocl::Device dev = ocl::Device::getDefault(); - int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; - int pxPerWIx = 1; - - cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", - src.depth(), src.channels(), pxPerWIy); - - switch (sizePolicy) - { - case TO_YUV: - if (dev.isIntel() && - src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 && - dst.step % 4 == 0 && dst.offset % 4 == 0) - { - pxPerWIx = 2; - } - globalSize[0] = (size_t)dst.cols/(2*pxPerWIx); - globalSize[1] = ((size_t)dst.rows/3 + pxPerWIy - 1) / pxPerWIy; - baseOptions += format("-D PIX_PER_WI_X=%d ", pxPerWIx); - break; - case FROM_YUV: - globalSize[0] = (size_t)dst.cols/2; - globalSize[1] = ((size_t)dst.rows/2 + pxPerWIy - 1) / pxPerWIy; - break; - case NONE: - default: - globalSize[0] = (size_t)src.cols; - globalSize[1] = ((size_t)src.rows + pxPerWIy - 1) / pxPerWIy; - break; - } - - k.create(name.c_str(), source, baseOptions + options); - - if(k.empty()) - return false; - - nArgs = k.set(0, ocl::KernelArg::ReadOnlyNoSize(src)); - nArgs = k.set(nArgs, ocl::KernelArg::WriteOnly(dst)); - return true; - } - - bool run() - { - return k.run(2, globalSize, NULL, false); - } - - template - void setArg(const T& arg) - { - nArgs = k.set(nArgs, arg); - } - - UMat src, dst; - ocl::Kernel k; - size_t globalSize[2]; - int nArgs; -}; - -#endif ///////////////////////////// Top-level template function //////////////////////////////// @@ -413,261 +151,17 @@ private: const int width; const Cvt& cvt; - const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); + CvtColorLoop_Invoker(const CvtColorLoop_Invoker&); // = delete; + const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&); // = delete; }; -template +template static inline void 
CvtColorLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) { + CV_AVX_GUARD parallel_for_(Range(0, height), CvtColorLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt), (width * height) / static_cast(1<<16)); } -#if defined (HAVE_IPP) && (IPP_VERSION_X100 >= 700) -# define NEED_IPP 1 -#else -# define NEED_IPP 0 -#endif - -#if NEED_IPP - -#define MAX_IPP8u 255 -#define MAX_IPP16u 65535 -#define MAX_IPP32f 1.0 - -typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *); -typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize); -typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *); - -template -class CvtColorIPPLoop_Invoker : - public ParallelLoopBody -{ -public: - - CvtColorIPPLoop_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, const Cvt& _cvt, bool *_ok) : - ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), cvt(_cvt), ok(_ok) - { - *ok = true; - } - - virtual void operator()(const Range& range) const CV_OVERRIDE - { - const void *yS = src_data + src_step * range.start; - void *yD = dst_data + dst_step * range.start; - if( !cvt(yS, static_cast(src_step), yD, static_cast(dst_step), width, range.end - range.start) ) - *ok = false; - else - { - CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); - } - } - -private: - const uchar * src_data; - const size_t src_step; - uchar * dst_data; - const size_t dst_step; - const int width; - const Cvt& cvt; - bool *ok; - - const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&); -}; - - -template -bool CvtColorIPPLoop(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - bool ok; - parallel_for_(Range(0, height), 
CvtColorIPPLoop_Invoker(src_data, src_step, dst_data, dst_step, width, cvt, &ok), (width * height)/(double)(1<<16) ); - return ok; -} - - -template -bool CvtColorIPPLoopCopy(const uchar * src_data, size_t src_step, int src_type, uchar * dst_data, size_t dst_step, int width, int height, const Cvt& cvt) -{ - Mat temp; - Mat src(Size(width, height), src_type, const_cast(src_data), src_step); - Mat source = src; - if( src_data == dst_data ) - { - src.copyTo(temp); - source = temp; - } - bool ok; - parallel_for_(Range(0, source.rows), - CvtColorIPPLoop_Invoker(source.data, source.step, dst_data, dst_step, - source.cols, cvt, &ok), - source.total()/(double)(1<<16) ); - return ok; -} - - -struct IPPGeneralFunctor -{ - IPPGeneralFunctor(ippiGeneralFunc _func) : ippiColorConvertGeneral(_func){} - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorConvertGeneral ? CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false; - } -private: - ippiGeneralFunc ippiColorConvertGeneral; -}; - - -struct IPPReorderFunctor -{ - IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : ippiColorConvertReorder(_func) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorConvertReorder ? 
CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false; - } -private: - ippiReorderFunc ippiColorConvertReorder; - int order[4]; -}; - - -struct IPPReorderGeneralFunctor -{ - IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) : - ippiColorConvertReorder(_func1), ippiColorConvertGeneral(_func2), depth(_depth) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - if (ippiColorConvertReorder == 0 || ippiColorConvertGeneral == 0) - return false; - - Mat temp; - temp.create(rows, cols, CV_MAKETYPE(depth, 3)); - if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0) - return false; - return CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0; - } -private: - ippiReorderFunc ippiColorConvertReorder; - ippiGeneralFunc ippiColorConvertGeneral; - int order[4]; - int depth; -}; - - -struct IPPGeneralReorderFunctor -{ - IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) : - ippiColorConvertGeneral(_func1), ippiColorConvertReorder(_func2), depth(_depth) - { - order[0] = _order0; - order[1] = _order1; - order[2] = _order2; - order[3] = 3; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - if (ippiColorConvertGeneral == 0 || ippiColorConvertReorder == 0) - return false; - - Mat temp; - temp.create(rows, cols, CV_MAKETYPE(depth, 3)); - if(CV_INSTRUMENT_FUN_IPP(ippiColorConvertGeneral, src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0) - return false; - return CV_INSTRUMENT_FUN_IPP(ippiColorConvertReorder, temp.ptr(), 
(int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0; - } -private: - ippiGeneralFunc ippiColorConvertGeneral; - ippiReorderFunc ippiColorConvertReorder; - int order[4]; - int depth; -}; - -extern ippiReorderFunc ippiSwapChannelsC3C4RTab[8]; -extern ippiReorderFunc ippiSwapChannelsC4C3RTab[8]; -extern ippiReorderFunc ippiSwapChannelsC3RTab[8]; - -#endif - -#ifdef HAVE_OPENCL - -bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb ); -bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb ); -bool oclCvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); -bool oclCvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool srgb); -bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); - -bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); -bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ); -bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ); -bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ); - -bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ); -bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ); -bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits ); -bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits ); -bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits ); -bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn ); -bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst ); -bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst ); - -bool oclCvtColorBGR2YCrCb( InputArray _src, 
OutputArray _dst, int bidx); -bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx); -bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ); -bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ); - -bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ); -bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); -bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ); -bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ); -bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ); - -#endif - -void cvtColorBGR2Lab( InputArray _src, OutputArray _dst, bool swapb, bool srgb); -void cvtColorBGR2Luv( InputArray _src, OutputArray _dst, bool swapb, bool srgb); -void cvtColorLab2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); -void cvtColorLuv2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool srgb ); -void cvtColorBGR2XYZ( InputArray _src, OutputArray _dst, bool swapb ); -void cvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb ); - -void cvtColorBGR2YUV( InputArray _src, OutputArray _dst, bool swapb, bool crcb); -void cvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb); - -void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn); -void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ); -void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx); -void cvtColorYUV2Gray_420( InputArray _src, 
OutputArray _dst ); -void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ); - -void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); -void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ); -void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); -void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange); - -void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb); -void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits); -void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits); -void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb); -void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn); -void cvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits); -void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits); -void cvtColorRGBA2mRGBA(InputArray _src, OutputArray _dst); -void cvtColormRGBA2RGBA(InputArray _src, OutputArray _dst); - -} //namespace cv +} //namespace diff --git a/modules/imgproc/src/color_hsv.dispatch.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp index f0a4c87558..f1678f5deb 100644 --- a/modules/imgproc/src/color_hsv.dispatch.cpp +++ b/modules/imgproc/src/color_hsv.dispatch.cpp @@ -3,1194 +3,15 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" + +#include "opencl_kernels_imgproc.hpp" + #include "color.hpp" -namespace cv -{ +#include "color_hsv.simd.hpp" +#include "color_hsv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// - - -struct RGB2HSV_b -{ - typedef uchar channel_type; - - RGB2HSV_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) - { 
- CV_Assert( hrange == 180 || hrange == 256 ); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, bidx = blueIdx, scn = srccn; - const int hsv_shift = 12; - - static int sdiv_table[256]; - static int hdiv_table180[256]; - static int hdiv_table256[256]; - static volatile bool initialized = false; - - int hr = hrange; - const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256; - n *= 3; - - if( !initialized ) - { - sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; - for( i = 1; i < 256; i++ ) - { - sdiv_table[i] = saturate_cast((255 << hsv_shift)/(1.*i)); - hdiv_table180[i] = saturate_cast((180 << hsv_shift)/(6.*i)); - hdiv_table256[i] = saturate_cast((256 << hsv_shift)/(6.*i)); - } - initialized = true; - } - - for( i = 0; i < n; i += 3, src += scn ) - { - int b = src[bidx], g = src[1], r = src[bidx^2]; - int h, s, v = b; - int vmin = b; - int vr, vg; - - CV_CALC_MAX_8U( v, g ); - CV_CALC_MAX_8U( v, r ); - CV_CALC_MIN_8U( vmin, g ); - CV_CALC_MIN_8U( vmin, r ); - - uchar diff = saturate_cast(v - vmin); - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; - h = (vr & (g - b)) + - (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; - h += h < 0 ? 
hr : 0; - - dst[i] = saturate_cast(h); - dst[i+1] = (uchar)s; - dst[i+2] = (uchar)v; - } - } - - int srccn, blueIdx, hrange; -}; - - -struct RGB2HSV_f -{ - typedef float channel_type; - - RGB2HSV_f(int _srccn, int _blueIdx, float _hrange) - : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_r, v_float32x4& v_g, - v_float32x4& v_b, float hscale) const - { - v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); - v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); - - v_float32x4 v_eps = v_setall_f32(FLT_EPSILON); - v_float32x4 v_diff = v_max_rgb - v_min_rgb; - v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps); - - v_float32x4 v_r_eq_max = v_r == v_max_rgb; - v_float32x4 v_g_eq_max = v_g == v_max_rgb; - v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, - v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); - v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), - v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); - v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps); - v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale); - - v_g = v_s; - v_b = v_max_rgb; - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, scn = srccn; - float hscale = hrange*(1.f/360.f); - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - if (scn == 3) { - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_r, v_g, v_b, hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_b, v_g, v_r, hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } else { // scn == 4 - if 
(bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_r, v_g, v_b, hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_b, v_g, v_r, hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } - } - #endif - - for( ; i < n; i += 3, src += scn ) - { - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h, s, v; - - float vmin, diff; - - v = vmin = r; - if( v < g ) v = g; - if( v < b ) v = b; - if( vmin > g ) vmin = g; - if( vmin > b ) vmin = b; - - diff = v - vmin; - s = diff/(float)(fabs(v) + FLT_EPSILON); - diff = (float)(60./(diff + FLT_EPSILON)); - if( v == r ) - h = (g - b)*diff; - else if( v == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0 ) h += 360.f; - - dst[i] = h*hscale; - dst[i+1] = s; - dst[i+2] = v; - } - } - - int srccn, blueIdx; - float hrange; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -#if CV_SIMD128 -inline void HSV2RGB_simd(v_float32x4& v_h, v_float32x4& v_s, v_float32x4& v_v, float hscale) -{ - v_h = v_h * v_setall_f32(hscale); - v_float32x4 v_pre_sector = v_cvt_f32(v_trunc(v_h)); - v_h = v_h - v_pre_sector; - v_float32x4 v_tab0 = v_v; - v_float32x4 v_one = v_setall_f32(1.0f); - v_float32x4 v_tab1 = v_v * (v_one - v_s); - v_float32x4 v_tab2 = v_v * (v_one - (v_s * v_h)); - v_float32x4 v_tab3 = v_v * (v_one - (v_s * (v_one - v_h))); - - v_float32x4 v_one_sixth = v_setall_f32(1.0f / 6.0f); - v_float32x4 v_sector = v_pre_sector * v_one_sixth; - v_sector = v_cvt_f32(v_trunc(v_sector)); - v_float32x4 v_six = v_setall_f32(6.0f); - v_sector = v_pre_sector - (v_sector * v_six); - - v_float32x4 v_two = v_setall_f32(2.0f); - v_h = v_tab1 & (v_sector < v_two); 
- v_h = v_h | (v_tab3 & (v_sector == v_two)); - v_float32x4 v_three = v_setall_f32(3.0f); - v_h = v_h | (v_tab0 & (v_sector == v_three)); - v_float32x4 v_four = v_setall_f32(4.0f); - v_h = v_h | (v_tab0 & (v_sector == v_four)); - v_h = v_h | (v_tab2 & (v_sector > v_four)); - - v_s = v_tab3 & (v_sector < v_one); - v_s = v_s | (v_tab0 & (v_sector == v_one)); - v_s = v_s | (v_tab0 & (v_sector == v_two)); - v_s = v_s | (v_tab2 & (v_sector == v_three)); - v_s = v_s | (v_tab1 & (v_sector > v_three)); - - v_v = v_tab0 & (v_sector < v_one); - v_v = v_v | (v_tab2 & (v_sector == v_one)); - v_v = v_v | (v_tab1 & (v_sector == v_two)); - v_v = v_v | (v_tab1 & (v_sector == v_three)); - v_v = v_v | (v_tab3 & (v_sector == v_four)); - v_v = v_v | (v_tab0 & (v_sector > v_four)); -} -#endif - - -inline void HSV2RGB_native(const float* src, float* dst, const float hscale, const int bidx) -{ - float h = src[0], s = src[1], v = src[2]; - float b, g, r; - - if( s == 0 ) - b = g = r = v; - else - { - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - float tab[4]; - int sector; - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - sector = cvFloor(h); - h -= sector; - if( (unsigned)sector >= 6u ) - { - sector = 0; - h = 0.f; - } - - tab[0] = v; - tab[1] = v*(1.f - s); - tab[2] = v*(1.f - s*h); - tab[3] = v*(1.f - s*(1.f - h)); - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; -} - - -struct HSV2RGB_f -{ - typedef float channel_type; - - HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, dcn = dstcn; - n *= 3; - - if (dcn == 3) - { - #if CV_SIMD128 - if (hasSIMD) - { - for 
(; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_src[3]; - v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2]); - } - } - #endif - for( ; i < n; i += 3, dst += dcn ) - { - HSV2RGB_native(src + i, dst, hscale, bidx); - } - } else { // dcn == 4 - float alpha = ColorChannel::max(); - #if CV_SIMD128 - if (hasSIMD) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_src[3]; - v_load_deinterleave(src + i, v_src[0], v_src[1], v_src[2]); - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_src[bidx], v_src[1], v_src[bidx^2], v_a); - } - } - #endif - for( ; i < n; i += 3, dst += dcn ) - { - HSV2RGB_native(src + i, dst, hscale, bidx); - dst[3] = alpha; - } - } - } - - int dstcn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct HSV2RGB_b -{ - typedef uchar channel_type; - - HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.0f / _hrange) - { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int j = 0, dcn = dstcn; - uchar alpha = ColorChannel::max(); - - #if CV_SIMD128 - if (hasSIMD) - { - for (j = 0; j <= (n - 16) * 3; j += 48, dst += dcn * 16) - { - v_uint8x16 h_b, s_b, v_b; - v_uint16x8 h_w[2], s_w[2], v_w[2]; - v_uint32x4 h_u[4], s_u[4], v_u[4]; - v_load_deinterleave(src + j, h_b, s_b, v_b); - v_expand(h_b, h_w[0], h_w[1]); - v_expand(s_b, s_w[0], s_w[1]); - v_expand(v_b, v_w[0], v_w[1]); - v_expand(h_w[0], h_u[0], h_u[1]); - v_expand(h_w[1], h_u[2], h_u[3]); - v_expand(s_w[0], s_u[0], s_u[1]); - v_expand(s_w[1], s_u[2], s_u[3]); - v_expand(v_w[0], v_u[0], v_u[1]); - v_expand(v_w[1], v_u[2], v_u[3]); - - v_int32x4 b_i[4], g_i[4], r_i[4]; - v_float32x4 v_coeff0 = v_setall_f32(1.0f / 255.0f); - 
v_float32x4 v_coeff1 = v_setall_f32(255.0f); - - for( int k = 0; k < 4; k++ ) - { - v_float32x4 v_src[3]; - v_src[0] = v_cvt_f32(v_reinterpret_as_s32(h_u[k])); - v_src[1] = v_cvt_f32(v_reinterpret_as_s32(s_u[k])); - v_src[2] = v_cvt_f32(v_reinterpret_as_s32(v_u[k])); - - v_src[1] *= v_coeff0; - v_src[2] *= v_coeff0; - HSV2RGB_simd(v_src[0], v_src[1], v_src[2], hscale); - - v_src[0] *= v_coeff1; - v_src[1] *= v_coeff1; - v_src[2] *= v_coeff1; - b_i[k] = v_trunc(v_src[0]); - g_i[k] = v_trunc(v_src[1]); - r_i[k] = v_trunc(v_src[2]); - } - - v_uint16x8 r_w[2], g_w[2], b_w[2]; - v_uint8x16 r_b, g_b, b_b; - - r_w[0] = v_pack_u(r_i[0], r_i[1]); - r_w[1] = v_pack_u(r_i[2], r_i[3]); - r_b = v_pack(r_w[0], r_w[1]); - g_w[0] = v_pack_u(g_i[0], g_i[1]); - g_w[1] = v_pack_u(g_i[2], g_i[3]); - g_b = v_pack(g_w[0], g_w[1]); - b_w[0] = v_pack_u(b_i[0], b_i[1]); - b_w[1] = v_pack_u(b_i[2], b_i[3]); - b_b = v_pack(b_w[0], b_w[1]); - - if( dcn == 3 ) - { - if( blueIdx == 0 ) - v_store_interleave(dst, b_b, g_b, r_b); - else - v_store_interleave(dst, r_b, g_b, b_b); - } - else - { - v_uint8x16 alpha_b = v_setall_u8(alpha); - if( blueIdx == 0 ) - v_store_interleave(dst, b_b, g_b, r_b, alpha_b); - else - v_store_interleave(dst, r_b, g_b, b_b, alpha_b); - } - } - } - #endif - for( ; j < n * 3; j += 3, dst += dcn ) - { - float buf[6]; - buf[0] = src[j]; - buf[1] = src[j+1] * (1.0f / 255.0f); - buf[2] = src[j+2] * (1.0f / 255.0f); - HSV2RGB_native(buf, buf + 3, hscale, blueIdx); - dst[0] = saturate_cast(buf[3] * 255.0f); - dst[1] = saturate_cast(buf[4] * 255.0f); - dst[2] = saturate_cast(buf[5] * 255.0f); - if( dcn == 4 ) - dst[3] = alpha; - } - } - - int dstcn; - int blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -///////////////////////////////////// RGB <-> HLS //////////////////////////////////////// - -struct RGB2HLS_f -{ - typedef float channel_type; - - RGB2HLS_f(int _srccn, int _blueIdx, float _hrange) - : srccn(_srccn), blueIdx(_blueIdx), 
hscale(_hrange/360.f) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_r, v_float32x4& v_g, - v_float32x4& v_b, v_float32x4& v_hscale) const - { - v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b); - v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b); - - v_float32x4 v_diff = v_max_rgb - v_min_rgb; - v_float32x4 v_sum = v_max_rgb + v_min_rgb; - v_float32x4 v_half = v_setall_f32(0.5f); - v_float32x4 v_l = v_sum * v_half; - - v_float32x4 v_s = v_diff / v_select(v_l < v_half, v_sum, v_setall_f32(2.0f) - v_sum); - - v_float32x4 v_r_eq_max = v_max_rgb == v_r; - v_float32x4 v_g_eq_max = v_max_rgb == v_g; - v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b, - v_select(v_g_eq_max, v_b - v_r, v_r - v_g)); - v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f), - v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f))); - v_float32x4 v_rev_diff = v_setall_f32(60.0f) / v_diff; - v_h = v_muladd(v_h, v_rev_diff, v_res) * v_hscale; - - v_float32x4 v_diff_gt_eps = v_diff > v_setall_f32(FLT_EPSILON); - v_r = v_diff_gt_eps & v_h; - v_g = v_l; - v_b = v_diff_gt_eps & v_s; - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, scn = srccn; - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - v_float32x4 v_hscale = v_setall_f32(hscale); - if (scn == 3) { - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_r, v_g, v_b, v_hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_load_deinterleave(src, v_r, v_g, v_b); - process(v_b, v_g, v_r, v_hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } else { // scn == 4 - if (bidx) { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { 
- v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_r, v_g, v_b, v_hscale); - v_store_interleave(dst + i, v_r, v_g, v_b); - } - } else { - for ( ; i <= n - 12; i += 12, src += scn * 4) - { - v_float32x4 v_r; - v_float32x4 v_g; - v_float32x4 v_b; - v_float32x4 v_a; - v_load_deinterleave(src, v_r, v_g, v_b, v_a); - process(v_b, v_g, v_r, v_hscale); - v_store_interleave(dst + i, v_b, v_g, v_r); - } - } - } - } - #endif - - for( ; i < n; i += 3, src += scn ) - { - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h = 0.f, s = 0.f, l; - float vmin, vmax, diff; - - vmax = vmin = r; - if( vmax < g ) vmax = g; - if( vmax < b ) vmax = b; - if( vmin > g ) vmin = g; - if( vmin > b ) vmin = b; - - diff = vmax - vmin; - l = (vmax + vmin)*0.5f; - - if( diff > FLT_EPSILON ) - { - s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); - diff = 60.f/diff; - - if( vmax == r ) - h = (g - b)*diff; - else if( vmax == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0.f ) h += 360.f; - } - - dst[i] = h*hscale; - dst[i+1] = l; - dst[i+2] = s; - } - } - - int srccn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct RGB2HLS_b -{ - typedef uchar channel_type; - - RGB2HLS_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) - { - #if CV_NEON - v_scale_inv = vdupq_n_f32(1.f/255.f); - v_scale = vdupq_n_f32(255.f); - v_alpha = vdup_n_u8(ColorChannel::max()); - #elif CV_SSE2 - v_scale_inv = _mm_set1_ps(1.f/255.f); - v_zero = _mm_setzero_si128(); - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif - } - - #if CV_SSE2 - void process(const float * buf, - __m128 & v_coeffs, uchar * dst) const - { - __m128 v_l0f = _mm_load_ps(buf); - __m128 v_l1f = _mm_load_ps(buf + 4); - __m128 v_u0f = _mm_load_ps(buf + 8); - __m128 v_u1f = _mm_load_ps(buf + 12); - - v_l0f = _mm_mul_ps(v_l0f, v_coeffs); - v_u1f = 
_mm_mul_ps(v_u1f, v_coeffs); - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); - v_u0f = _mm_mul_ps(v_u0f, v_coeffs); - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x92)); - v_l1f = _mm_mul_ps(v_l1f, v_coeffs); - - __m128i v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f)); - __m128i v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f)); - __m128i v_l0 = _mm_packus_epi16(v_l, v_u); - - _mm_storeu_si128((__m128i *)(dst), v_l0); - } - #endif - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, j, scn = srccn; - float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; - #if CV_SSE2 - __m128 v_coeffs = _mm_set_ps(1.f, 255.f, 255.f, 1.f); - #endif - - for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) - { - int dn = std::min(n - i, (int)BLOCK_SIZE); - j = 0; - - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn) - { - uint16x8_t v_t0, v_t1, v_t2; - - if (scn == 3) - { - uint8x8x3_t v_src = vld3_u8(src); - v_t0 = vmovl_u8(v_src.val[0]); - v_t1 = vmovl_u8(v_src.val[1]); - v_t2 = vmovl_u8(v_src.val[2]); - } - else - { - uint8x8x4_t v_src = vld4_u8(src); - v_t0 = vmovl_u8(v_src.val[0]); - v_t1 = vmovl_u8(v_src.val[1]); - v_t2 = vmovl_u8(v_src.val[2]); - } - - float32x4x3_t v_dst; - v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j, v_dst); - - v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j + 12, v_dst); - } - #elif CV_SSE2 - if (scn == 3 && haveSIMD) - { - for ( ; j <= (dn * 3 - 16); j += 
16, src += 16) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)src); - - __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero); - _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); - _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); - - v_src_p = _mm_unpackhi_epi8(v_src, v_zero); - _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv)); - _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv)); - } - - int jr = j % 3; - if (jr) - src -= jr, j -= jr; - } - else if (scn == 4 && haveSIMD) - { - for ( ; j <= (dn * 3 - 12); j += 12, src += 16) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)src); - - __m128i v_src_lo = _mm_unpacklo_epi8(v_src, v_zero); - __m128i v_src_hi = _mm_unpackhi_epi8(v_src, v_zero); - _mm_storeu_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_lo, v_zero)), v_scale_inv)); - _mm_storeu_ps(buf + j + 3, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_lo, v_zero)), v_scale_inv)); - _mm_storeu_ps(buf + j + 6, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_hi, v_zero)), v_scale_inv)); - float tmp = buf[j + 8]; - _mm_storeu_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_unpackhi_epi16(v_src_hi, v_zero), 0x90)), v_scale_inv)); - buf[j + 8] = tmp; - } - - int jr = j % 3; - if (jr) - src -= jr, j -= jr; - } - #endif - for( ; j < dn*3; j += 3, src += scn ) - { - buf[j] = src[0]*(1.f/255.f); - buf[j+1] = src[1]*(1.f/255.f); - buf[j+2] = src[2]*(1.f/255.f); - } - cvt(buf, buf, dn); - - j = 0; - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24) - { - float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); - - uint8x8x3_t v_dst; - v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])), - vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0])))); - v_dst.val[1] = 
vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); - v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); - vst3_u8(dst + j, v_dst); - } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; j <= (dn - 16) * 3; j += 48) - { - process(buf + j, - v_coeffs, dst + j); - - process(buf + j + 16, - v_coeffs, dst + j + 16); - - process(buf + j + 32, - v_coeffs, dst + j + 32); - } - } - #endif - for( ; j < dn*3; j += 3 ) - { - dst[j] = saturate_cast(buf[j]); - dst[j+1] = saturate_cast(buf[j+1]*255.f); - dst[j+2] = saturate_cast(buf[j+2]*255.f); - } - } - } - - int srccn; - RGB2HLS_f cvt; - #if CV_NEON - float32x4_t v_scale, v_scale_inv; - uint8x8_t v_alpha; - #elif CV_SSE2 - __m128 v_scale_inv; - __m128i v_zero; - bool haveSIMD; - #endif -}; - - -struct HLS2RGB_f -{ - typedef float channel_type; - - HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange) - : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) { - #if CV_SIMD128 - hasSIMD = hasSIMD128(); - #endif - } - - #if CV_SIMD128 - inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const - { - v_float32x4 v_one = v_setall_f32(1.0f); - - v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f); - v_float32x4 v_ls = v_l * v_s; - v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls); - - v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale); - v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw)); - v_float32x4 v_hs = v_hs_raw - v_pre_hs; - v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f))); - v_float32x4 v_elem1 = v_hs + v_hs; - - v_float32x4 v_tab0 = v_l + v_elem0; - v_float32x4 v_tab1 = v_l - v_elem0; - v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1; - v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1; - - 
v_float32x4 v_two = v_setall_f32(2.0f); - v_float32x4 v_four = v_setall_f32(4.0f); - - v_h = v_select(v_sector < v_two , v_tab1, - v_select(v_sector <= v_two , v_tab3, - v_select(v_sector <= v_four, v_tab0, v_tab2))); - - v_l = v_select(v_sector < v_one , v_tab3, - v_select(v_sector <= v_two , v_tab0, - v_select(v_sector < v_four, v_tab2, v_tab1))); - - v_s = v_select(v_sector < v_one , v_tab0, - v_select(v_sector < v_two , v_tab2, - v_select(v_sector < v_four, v_tab1, - v_select(v_sector <= v_four, v_tab3, v_tab0)))); - } - #endif - - void operator()(const float* src, float* dst, int n) const - { - int i = 0, bidx = blueIdx, dcn = dstcn; - float alpha = ColorChannel::max(); - n *= 3; - - #if CV_SIMD128 - if (hasSIMD) - { - if (dcn == 3) - { - if (bidx) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_store_interleave(dst, v_s, v_l, v_h); - } - } else { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_store_interleave(dst, v_h, v_l, v_s); - } - } - } else { // dcn == 4 - if (bidx) - { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_s, v_l, v_h, v_a); - } - } else { - for (; i <= n - 12; i += 12, dst += dcn * 4) - { - v_float32x4 v_h; - v_float32x4 v_l; - v_float32x4 v_s; - v_load_deinterleave(src + i, v_h, v_l, v_s); - process(v_h, v_l, v_s); - v_float32x4 v_a = v_setall_f32(alpha); - v_store_interleave(dst, v_h, v_l, v_s, v_a); - } - } - } - } - #endif - - for( ; i < n; i += 3, dst += dcn ) - { - float h = src[i], l = src[i+1], s = src[i+2]; - float b, g, r; - - if( s == 0 ) - b = g = r = l; - 
else - { - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - float tab[4]; - int sector; - - float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; - float p1 = 2*l - p2; - - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - - assert( 0 <= h && h < 6 ); - sector = cvFloor(h); - h -= sector; - - tab[0] = p2; - tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - - int dstcn, blueIdx; - float hscale; - #if CV_SIMD128 - bool hasSIMD; - #endif -}; - - -struct HLS2RGB_b -{ - typedef uchar channel_type; - - HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange) - : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - { - #if CV_NEON - v_scale_inv = vdupq_n_f32(1.f/255.f); - v_scale = vdupq_n_f32(255.f); - v_alpha = vdup_n_u8(ColorChannel::max()); - #elif CV_SSE2 - v_scale = _mm_set1_ps(255.f); - v_alpha = _mm_set1_ps(ColorChannel::max()); - v_zero = _mm_setzero_si128(); - haveSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif - } - - #if CV_SSE2 - void process(__m128i v_r, __m128i v_g, __m128i v_b, - const __m128& v_coeffs_, - float * buf) const - { - __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero)); - __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero)); - __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero)); - - __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero)); - __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero)); - __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero)); - - __m128 v_coeffs = v_coeffs_; - - v_r0 = _mm_mul_ps(v_r0, v_coeffs); - v_g1 = _mm_mul_ps(v_g1, v_coeffs); - - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); - - v_r1 = _mm_mul_ps(v_r1, 
v_coeffs); - v_b0 = _mm_mul_ps(v_b0, v_coeffs); - - v_coeffs = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v_coeffs), 0x49)); - - v_g0 = _mm_mul_ps(v_g0, v_coeffs); - v_b1 = _mm_mul_ps(v_b1, v_coeffs); - - _mm_store_ps(buf, v_r0); - _mm_store_ps(buf + 4, v_r1); - _mm_store_ps(buf + 8, v_g0); - _mm_store_ps(buf + 12, v_g1); - _mm_store_ps(buf + 16, v_b0); - _mm_store_ps(buf + 20, v_b1); - } - #endif - - void operator()(const uchar* src, uchar* dst, int n) const - { - int i, j, dcn = dstcn; - uchar alpha = ColorChannel::max(); - float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; - #if CV_SSE2 - __m128 v_coeffs = _mm_set_ps(1.f, 1.f/255.f, 1.f/255.f, 1.f); - #endif - - for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) - { - int dn = std::min(n - i, (int)BLOCK_SIZE); - j = 0; - - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24) - { - uint8x8x3_t v_src = vld3_u8(src + j); - uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), - v_t1 = vmovl_u8(v_src.val[1]), - v_t2 = vmovl_u8(v_src.val[2]); - - float32x4x3_t v_dst; - v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j, v_dst); - - v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); - v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); - v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); - vst3q_f32(buf + j + 12, v_dst); - } - #elif CV_SSE2 - if (haveSIMD) - { - for ( ; j <= (dn - 8) * 3; j += 24) - { - __m128i v_src0 = _mm_loadu_si128((__m128i const *)(src + j)); - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src + j + 16)); - - process(_mm_unpacklo_epi8(v_src0, v_zero), - _mm_unpackhi_epi8(v_src0, v_zero), - _mm_unpacklo_epi8(v_src1, v_zero), - v_coeffs, - buf + j); - } - } - #endif - for( ; j < dn*3; j += 3 ) - { - buf[j] = src[j]; - 
buf[j+1] = src[j+1]*(1.f/255.f); - buf[j+2] = src[j+2]*(1.f/255.f); - } - cvt(buf, buf, dn); - - j = 0; - #if CV_NEON - for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) - { - float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); - uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); - uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); - uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), - vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); - - if (dcn == 4) - { - uint8x8x4_t v_dst; - v_dst.val[0] = v_dst0; - v_dst.val[1] = v_dst1; - v_dst.val[2] = v_dst2; - v_dst.val[3] = v_alpha; - vst4_u8(dst, v_dst); - } - else - { - uint8x8x3_t v_dst; - v_dst.val[0] = v_dst0; - v_dst.val[1] = v_dst1; - v_dst.val[2] = v_dst2; - vst3_u8(dst, v_dst); - } - } - #elif CV_SSE2 - if (dcn == 3 && haveSIMD) - { - for ( ; j <= (dn * 3 - 16); j += 16, dst += 16) - { - __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); - __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); - __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); - __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale); - - __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0), - _mm_cvtps_epi32(v_src1)); - __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2), - _mm_cvtps_epi32(v_src3)); - - _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); - } - - int jr = j % 3; - if (jr) - dst -= jr, j -= jr; - } - else if (dcn == 4 && haveSIMD) - { - for ( ; j <= (dn * 3 - 12); j += 12, dst += 16) - { - __m128 v_buf0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale); - __m128 v_buf1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale); - __m128 
v_buf2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale); - - __m128 v_ba0 = _mm_unpackhi_ps(v_buf0, v_alpha); - __m128 v_ba1 = _mm_unpacklo_ps(v_buf2, v_alpha); - - __m128i v_src0 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf0, v_ba0, 0x44)); - __m128i v_src1 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba0, v_buf1, 0x4e)), 0x78); - __m128i v_src2 = _mm_cvtps_epi32(_mm_shuffle_ps(v_buf1, v_ba1, 0x4e)); - __m128i v_src3 = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_shuffle_ps(v_ba1, v_buf2, 0xee)), 0x78); - - __m128i v_dst0 = _mm_packs_epi32(v_src0, v_src1); - __m128i v_dst1 = _mm_packs_epi32(v_src2, v_src3); - - _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1)); - } - - int jr = j % 3; - if (jr) - dst -= jr, j -= jr; - } - #endif - - for( ; j < dn*3; j += 3, dst += dcn ) - { - dst[0] = saturate_cast(buf[j]*255.f); - dst[1] = saturate_cast(buf[j+1]*255.f); - dst[2] = saturate_cast(buf[j+2]*255.f); - if( dcn == 4 ) - dst[3] = alpha; - } - } - } - - int dstcn; - HLS2RGB_f cvt; - #if CV_NEON - float32x4_t v_scale, v_scale_inv; - uint8x8_t v_alpha; - #elif CV_SSE2 - __m128 v_scale; - __m128 v_alpha; - __m128i v_zero; - bool haveSIMD; - #endif -}; +namespace cv { // // IPP functions @@ -1302,29 +123,15 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, } #endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180; - int blueIdx = swapBlue ? 
2 : 0; - if(isHSV) - { - if(depth == CV_8U) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_b(scn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HSV_f(scn, blueIdx, static_cast(hrange))); - } - else - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_b(scn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2HLS_f(scn, blueIdx, static_cast(hrange))); - } + CV_CPU_DISPATCH(cvtBGRtoHSV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 32f void cvtHSVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { CV_INSTRUMENT_REGION(); @@ -1393,22 +200,8 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step, } #endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180; - int blueIdx = swapBlue ? 
2 : 0; - if(isHSV) - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_b(dcn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HSV2RGB_f(dcn, blueIdx, static_cast(hrange))); - } - else - { - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_b(dcn, blueIdx, hrange)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, HLS2RGB_f(dcn, blueIdx, static_cast(hrange))); - } + CV_CPU_DISPATCH(cvtHSVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp index f0a4c87558..30ae7064bc 100644 --- a/modules/imgproc/src/color_hsv.simd.hpp +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -3,11 +3,31 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -namespace cv -{ +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void cvtBGRtoHSV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV); +void cvtHSVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif + +namespace { ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// @@ -1192,46 +1212,7 @@ struct HLS2RGB_b #endif }; -// -// IPP functions -// - -#if NEED_IPP - -#if !IPP_DISABLE_RGB_HSV -static 
ippiGeneralFunc ippiRGB2HSVTab[] = -{ - (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0, - 0, 0, 0, 0 -}; -#endif - -static ippiGeneralFunc ippiHSV2RGBTab[] = -{ - (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0, - 0, 0, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2HLSTab[] = -{ - (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0, - 0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0 -}; - -static ippiGeneralFunc ippiHLS2RGBTab[] = -{ - (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0, - 0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0 -}; - -#endif - -// -// HAL functions -// - -namespace hal -{ +} // namespace anon // 8u, 32f void cvtBGRtoHSV(const uchar * src_data, size_t src_step, @@ -1241,67 +1222,6 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoHSV, cv_hal_cvtBGRtoHSV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(depth == CV_8U && isFullRange) - { - if (isHSV) - { -#if !IPP_DISABLE_RGB_HSV // breaks OCL accuracy tests - if(scn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) ) - return; - } -#endif - } - else - { - if(scn == 
3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(scn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKE_TYPE(depth, scn), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2HLSTab[depth])) ) - return; - } - else if(scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) ) - return; - } - } - } - } -#endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 256 : 180; int blueIdx = swapBlue ? 
2 : 0; if(isHSV) @@ -1322,77 +1242,12 @@ void cvtBGRtoHSV(const uchar * src_data, size_t src_step, // 8u, 32f void cvtHSVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtHSVtoBGR, cv_hal_cvtHSVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isFullRange, isHSV); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if (depth == CV_8U && isFullRange) - { - if (isHSV) - { - if(dcn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiHSV2RGBTab[depth])) ) - return; - } - else if(dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) - return; - } - } - else - { - if(dcn == 3 && !swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, 
width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) - return; - } - else if(dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, 3), dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiHLS2RGBTab[depth])) ) - return; - } - else if(dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) - return; - } - } - } - } -#endif - int hrange = depth == CV_32F ? 360 : isFullRange ? 255 : 180; int blueIdx = swapBlue ? 2 : 0; if(isHSV) @@ -1411,155 +1266,6 @@ void cvtHSVtoBGR(const uchar * src_data, size_t src_step, } } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); - - if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc, - format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, bool full ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255); - - if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc, - format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 
180.f : 256.f))/360.f; - - if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc, - format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256); - - cv::String options = (_src.depth() == CV_8U ? - format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) : - format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx)); - - if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options)) - { - return false; - } - - if(_src.depth() == CV_8U) - { - static UMat sdiv_data; - static UMat hdiv_data180; - static UMat hdiv_data256; - static int sdiv_table[256]; - static int hdiv_table180[256]; - static int hdiv_table256[256]; - static volatile bool initialized180 = false, initialized256 = false; - volatile bool & initialized = hrange == 180 ? initialized180 : initialized256; - - if (!initialized) - { - int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12; - UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256; - - sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; - - int v = 255 << hsv_shift; - if (!initialized180 && !initialized256) - { - for(int i = 1; i < 256; i++ ) - sdiv_table[i] = saturate_cast(v/(1.*i)); - Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data); - } - - v = hrange << hsv_shift; - for (int i = 1; i < 256; i++ ) - hdiv_table[i] = saturate_cast(v/(6.*i)); - - Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data); - initialized = true; - } - - h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data)); - h.setArg(hrange == 256 ? 
ocl::KernelArg::PtrReadOnly(hdiv_data256) : - ocl::KernelArg::PtrReadOnly(hdiv_data180)); - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2HLS( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, fullRange, false); -} - -void cvtColorBGR2HSV( InputArray _src, OutputArray _dst, bool swapb, bool fullRange ) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoHSV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, fullRange, true); -} - -void cvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, fullRange, false); -} - -void cvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, bool fullRange) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtHSVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, fullRange, true); -} - - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index 0fff89358c..cb5c0fdf53 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -9,6 +9,10 @@ \**********************************************************************************/ #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/softfloat.hpp" + #include "color.hpp" using cv::softfloat; diff --git a/modules/imgproc/src/color_rgb.dispatch.cpp 
b/modules/imgproc/src/color_rgb.dispatch.cpp index 9245f26d05..ed2961f0fb 100644 --- a/modules/imgproc/src/color_rgb.dispatch.cpp +++ b/modules/imgproc/src/color_rgb.dispatch.cpp @@ -3,1047 +3,16 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" + #include "color.hpp" +#include "color_rgb.simd.hpp" +#include "color_rgb.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + #define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 -namespace cv -{ - -////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// - -template struct v_type; - -template<> -struct v_type{ - typedef v_uint8 t; -}; - -template<> -struct v_type{ - typedef v_uint16 t; -}; - -template<> -struct v_type{ - typedef v_float32 t; -}; - -template struct v_set; - -template<> -struct v_set -{ - static inline v_type::t set(uchar x) - { - return vx_setall_u8(x); - } -}; - -template<> -struct v_set -{ - static inline v_type::t set(ushort x) - { - return vx_setall_u16(x); - } -}; - -template<> -struct v_set -{ - static inline v_type::t set(float x) - { - return vx_setall_f32(x); - } -}; - -template -struct RGB2RGB -{ - typedef _Tp channel_type; - typedef typename v_type<_Tp>::t vt; - - RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : - srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) - { - CV_Assert(srccn == 3 || srccn == 4); - CV_Assert(dstcn == 3 || dstcn == 4); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, dcn = dstcn, bi = blueIdx; - int i = 0; - _Tp alphav = ColorChannel<_Tp>::max(); - -#if CV_SIMD - const int vsize = vt::nlanes; - - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*dcn) - { - vt a, b, c, d; - if(scn == 4) - { - v_load_deinterleave(src, a, b, c, d); - } - else - { - v_load_deinterleave(src, a, b, c); - d = v_set<_Tp>::set(alphav); - } - if(bi == 2) - swap(a, c); - - if(dcn == 4) - { 
- v_store_interleave(dst, a, b, c, d); - } - else - { - v_store_interleave(dst, a, b, c); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src += scn, dst += dcn ) - { - _Tp t0 = src[0], t1 = src[1], t2 = src[2]; - dst[bi ] = t0; - dst[1] = t1; - dst[bi^2] = t2; - if(dcn == 4) - { - _Tp d = scn == 4 ? src[3] : alphav; - dst[3] = d; - } - } - } - - int srccn, dstcn, blueIdx; -}; - - -/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// - -struct RGB5x52RGB -{ - typedef uchar channel_type; - - RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) - : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, gb = greenBits; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 vz = vx_setzero_u8(), vn0 = vx_setall_u8(255); - for(; i <= n-vsize; - i += vsize, src += vsize*sizeof(ushort), dst += vsize*dcn) - { - v_uint16 t0 = v_reinterpret_as_u16(vx_load(src)); - v_uint16 t1 = v_reinterpret_as_u16(vx_load(src + - sizeof(ushort)*v_uint16::nlanes)); - - //TODO: shorten registers use when v_interleave is available - v_uint8 r, g, b, a; - v_uint16 b0 = (t0 << 11) >> 8; - v_uint16 b1 = (t1 << 11) >> 8; - b = v_pack(b0, b1); - - v_uint16 g0, g1, r0, r1, a0, a1; - - if( gb == 6 ) - { - g0 = ((t0 >> 5) << 10) >> 8; - g1 = ((t1 >> 5) << 10) >> 8; - - r0 = (t0 >> 11) << 3; - r1 = (t1 >> 11) << 3; - - a = vn0; - } - else - { - g0 = ((t0 >> 5) << 11) >> 8; - g1 = ((t1 >> 5) << 11) >> 8; - - r0 = ((t0 >> 10) << 11) >> 8; - r1 = ((t1 >> 10) << 11) >> 8; - - a0 = t0 >> 15; - a1 = t1 >> 15; - a = v_pack(a0, a1); - a = a != vz; - } - g = v_pack(g0, g1); - r = v_pack(r0, r1); - - if(bidx == 2) - swap(b, r); - - if(dcn == 4) - { - v_store_interleave(dst, b, g, r, a); - } - else - { - v_store_interleave(dst, b, g, r); - } - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src += sizeof(ushort), dst += dcn ) - { - unsigned t 
= ((const ushort*)src)[0]; - uchar b, g, r, a; - - b = (uchar)(t << 3); - - if( gb == 6 ) - { - g = (uchar)((t >> 3) & ~3); - r = (uchar)((t >> 8) & ~7); - a = 255; - } - else - { - g = (uchar)((t >> 2) & ~7); - r = (uchar)((t >> 7) & ~7); - a = (uchar)(((t & 0x8000) >> 15) * 255); - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx ^ 2] = r; - if( dcn == 4 ) - dst[3] = a; - } - } - - int dstcn, blueIdx, greenBits; -}; - - -struct RGB2RGB5x5 -{ - typedef uchar channel_type; - - RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) - : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int scn = srccn, bidx = blueIdx, gb = greenBits; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint16 vn3 = vx_setall_u16((ushort)(~3)); - v_uint16 vn7 = vx_setall_u16((ushort)(~7)); - v_uint16 vz = vx_setzero_u16(); - v_uint8 v7 = vx_setall_u8((uchar)(~7)); - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*sizeof(ushort)) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - a = vx_setzero_u8(); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - if(bidx == 2) - swap(b, r); - - r = r & v7; - - //TODO: shorten registers use when v_deinterleave is available - v_uint16 r0, r1, g0, g1, b0, b1, a0, a1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - v_expand(a, a0, a1); - - v_uint16 d0, d1; - - b0 = b0 >> 3; - b1 = b1 >> 3; - a0 = (a0 != vz) << 15; - a1 = (a1 != vz) << 15; - - if(gb == 6) - { - d0 = b0 | ((g0 & vn3) << 3) | (r0 << 8); - d1 = b1 | ((g1 & vn3) << 3) | (r1 << 8); - } - else - { - d0 = b0 | ((g0 & vn7) << 2) | (r0 << 7) | a0; - d1 = b1 | ((g1 & vn7) << 2) | (r1 << 7) | a1; - } - - v_store((ushort*)dst, d0); - v_store(((ushort*)dst) + vsize/2, d1); - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src += scn, dst += sizeof(ushort) ) - { - uchar r = src[bidx^2]; - uchar g = src[1]; - uchar b = src[bidx]; 
- uchar a = scn == 4 ? src[3] : 0; - - ushort d; - if (gb == 6) - { - d = (ushort)((b >> 3)|((g & ~3) << 3)|((r & ~7) << 8)); - } - else - { - d = (ushort)((b >> 3)|((g & ~7) << 2)|((r & ~7) << 7)|(a ? 0x8000 : 0)); - } - ((ushort*)dst)[0] = d; - } - } - - int srccn, blueIdx, greenBits; -}; - - -///////////////////////////////// Color to/from Grayscale //////////////////////////////// - -template -struct Gray2RGB -{ - typedef _Tp channel_type; - typedef typename v_type<_Tp>::t vt; - - Gray2RGB(int _dstcn) : dstcn(_dstcn) {} - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn; - int i = 0; - _Tp alpha = ColorChannel<_Tp>::max(); - -#if CV_SIMD - const int vsize = vt::nlanes; - vt valpha = v_set<_Tp>::set(alpha); - for(; i <= n-vsize; - i += vsize, src += vsize, dst += vsize*dcn) - { - vt g = vx_load(src); - - if(dcn == 3) - { - v_store_interleave(dst, g, g, g); - } - else - { - v_store_interleave(dst, g, g, g, valpha); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i++, src++, dst += dcn ) - { - dst[0] = dst[1] = dst[2] = src[0]; - if(dcn == 4) - dst[3] = alpha; - } - } - - int dstcn; -}; - - -struct Gray2RGB5x5 -{ - typedef uchar channel_type; - - Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) - { } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int gb = greenBits; - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - v_uint16 v3 = vx_setall_u16((ushort)(~3)); - for(; i <= n-vsize; - i += vsize, src += vsize, dst += vsize*sizeof(ushort)) - { - v_uint8 t8 = vx_load_low(src); - v_uint16 t = v_expand_low(t8); - - v_uint16 t3 = t >> 3; - - v_uint16 d = t3; - if(gb == 6) - { - d |= ((t & v3) << 3) | (t3 << 11); - } - else - { - d |= (t3 << 5) | (t3 << 10); - } - - v_store((ushort*)dst, d); - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src++, dst += sizeof(ushort)) - { - int t = src[0]; - int t3 = t >> 3; - ushort d; - if( gb == 6 ) - { - d = (ushort)(t3 |((t & ~3) << 3)|(t3 << 11)); - } - else - { 
- d = (ushort)(t3 |(t3 << 5)|(t3 << 10)); - } - ((ushort*)dst)[0] = d; - } - } - int greenBits; -}; - - -struct RGB5x52Gray -{ - typedef uchar channel_type; - - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - - RGB5x52Gray(int _greenBits) : greenBits(_greenBits) - { - CV_Assert(BY + GY + RY == (1 << shift)); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int gb = greenBits; - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16(BY), vx_setall_s16(GY), bg2y, dummy); - v_zip(vx_setall_s16(RY), vx_setall_s16( 1), r12y, dummy); - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for(; i <= n-vsize; - i += vsize, src += vsize*sizeof(ushort), dst += vsize) - { - v_uint16 t = vx_load((ushort*)src); - - v_uint16 r, g, b; - b = (t << 11) >> 8; - - if(gb == 5) - { - g = ((t >> 5) << 11) >> 8; - r = ((t >> 10) << 11) >> 8; - } - else - { - g = ((t >> 5) << 10) >> 8; - r = (t >> 11) << 3; - } - - v_uint8 d; - v_uint16 dx; - - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - v_int16 sb = v_reinterpret_as_s16(b); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, delta, rd0, rd1); - - v_uint32 d0, d1; - d0 = v_reinterpret_as_u32(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)); - d1 = v_reinterpret_as_u32(v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)); - - d0 = d0 >> shift; - d1 = d1 >> shift; - - dx = v_pack(d0, d1); - // high part isn't used - d = v_pack(dx, dx); - - v_store_low(dst, d); - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += sizeof(ushort), dst++) - { - int t = ((ushort*)src)[0]; - uchar r, g, b; - b = (t << 3) & 0xf8; - if( gb == 6 ) - { - g = (t >> 3) & 0xfc; - r = (t >> 8) & 0xf8; - } - else - { - g = (t >> 2) & 0xf8; - r = (t >> 7) & 0xf8; - } - dst[0] = (uchar)CV_DESCALE(b*BY + 
g*GY + r*RY, shift); - } - } - int greenBits; -}; - - -template struct RGB2Gray -{ - typedef _Tp channel_type; - - RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) - { - static const float coeffs0[] = { R2YF, G2YF, B2YF }; - memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn; - float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - for(int i = 0; i < n; i++, src += scn) - dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr); - } - int srccn; - float coeffs[3]; -}; - - -template <> -struct RGB2Gray -{ - typedef float channel_type; - - RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) - { - static const float coeffs0[] = { R2YF, G2YF, B2YF }; - for(int i = 0; i < 3; i++) - { - coeffs[i] = _coeffs ? _coeffs[i] : coeffs0[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const float * src, float * dst, int n) const - { - int scn = srccn, i = 0; - float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - -#if CV_SIMD - const int vsize = v_float32::nlanes; - v_float32 rv = vx_setall_f32(cr), gv = vx_setall_f32(cg), bv = vx_setall_f32(cb); - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize) - { - v_float32 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_float32 d = v_fma(r, rv, v_fma(g, gv, b*bv)); - - v_store(dst, d); - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += scn, dst++) - dst[0] = src[0]*cb + src[1]*cg + src[2]*cr; - } - - int srccn; - float coeffs[3]; -}; - -template<> -struct RGB2Gray -{ - typedef uchar channel_type; - - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - - RGB2Gray(int _srccn, int 
blueIdx, const int* _coeffs) : srccn(_srccn) - { - const int coeffs0[] = { RY, GY, BY }; - for(int i = 0; i < 3; i++) - coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - - CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int scn = srccn; - short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16(cb), vx_setall_s16(cg), bg2y, dummy); - v_zip(vx_setall_s16(cr), vx_setall_s16( 1), r12y, dummy); - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += vsize) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - //TODO: shorten registers use when v_deinterleave is available - - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_int16 bg00, bg01, bg10, bg11; - v_int16 rd00, rd01, rd10, rd11; - v_zip(v_reinterpret_as_s16(b0), v_reinterpret_as_s16(g0), bg00, bg01); - v_zip(v_reinterpret_as_s16(b1), v_reinterpret_as_s16(g1), bg10, bg11); - v_zip(v_reinterpret_as_s16(r0), delta, rd00, rd01); - v_zip(v_reinterpret_as_s16(r1), delta, rd10, rd11); - - v_uint32 y00, y01, y10, y11; - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; - - v_uint16 y0, y1; - y0 = v_pack(y00, y01); - y1 = v_pack(y10, y11); - - v_uint8 y = v_pack(y0, y1); - v_store(dst, y); - } - vx_cleanup(); -#endif - - for( ; i < n; i++, src += scn, 
dst++) - { - int b = src[0], g = src[1], r = src[2]; - uchar y = (uchar)CV_DESCALE(b*cb + g*cg + r*cr, shift); - dst[0] = y; - } - } - - int srccn; - short coeffs[3]; -}; - - -template<> -struct RGB2Gray -{ - typedef ushort channel_type; - - // can be changed to 15-shift coeffs - static const int BY = B2Y; - static const int GY = G2Y; - static const int RY = R2Y; - static const int shift = yuv_shift; - static const int fix_shift = (int)(sizeof(short)*8 - shift); - - RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn) - { - const int coeffs0[] = { RY, GY, BY }; - for(int i = 0; i < 3; i++) - coeffs[i] = (short)(_coeffs ? _coeffs[i] : coeffs0[i]); - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - - CV_Assert(coeffs[0] + coeffs[1] + coeffs[2] == (1 << shift)); - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn; - short cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint16::nlanes; - - v_int16 b2y = vx_setall_s16(cb); - v_int16 g2y = vx_setall_s16(cg); - v_int16 r2y = vx_setall_s16(cr); - v_int16 one = vx_setall_s16(1); - v_int16 z = vx_setzero_s16(); - - v_int16 bg2y, r12y; - v_int16 dummy; - v_zip(b2y, g2y, bg2y, dummy); - v_zip(r2y, one, r12y, dummy); - - v_int16 delta = vx_setall_s16(1 << (shift-1)); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += vsize) - { - v_uint16 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_int16 sb = v_reinterpret_as_s16(b); - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, delta, rd0, rd1); - - // fixing 16bit signed multiplication - v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; - - v_int32 sy0 = 
(v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 sy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; - - v_int16 y = v_add_wrap(v_pack(sy0, sy1), fixmul); - - v_store((short*)dst, y); - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += scn, dst++) - { - int b = src[0], g = src[1], r = src[2]; - ushort d = (ushort)CV_DESCALE((unsigned)(b*cb + g*cg + r*cr), shift); - dst[0] = d; - } - } - - int srccn; - short coeffs[3]; -}; - - -/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) ////////////// - -template -struct RGBA2mRGBA -{ - typedef _Tp channel_type; - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - _Tp max_val = ColorChannel<_Tp>::max(); - _Tp half_val = ColorChannel<_Tp>::half(); - for( int i = 0; i < n; i++ ) - { - _Tp v0 = *src++; - _Tp v1 = *src++; - _Tp v2 = *src++; - _Tp v3 = *src++; - - *dst++ = (v0 * v3 + half_val) / max_val; - *dst++ = (v1 * v3 + half_val) / max_val; - *dst++ = (v2 * v3 + half_val) / max_val; - *dst++ = v3; - } - } -}; - - -template<> -struct RGBA2mRGBA -{ - typedef uchar channel_type; - - void operator()(const uchar* src, uchar* dst, int n) const - { - const uchar max_val = 255; - const uchar half_val = 128; - - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); - v_uint16 vh = vx_setall_u16(half_val+1); - - // processing 4 registers per loop cycle is about 10% faster - // than processing 1 register - for( ; i <= n-vsize; - i += vsize, src += 4*vsize, dst += 4*vsize) - { - v_uint8 v[4]; - for(int j = 0; j < 4; j++) - v[j] = vx_load(src + j*vsize); - - // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => - // => 00,00,a0,a0,00,00,a1,a1 - // => a0,a0,a0,a0,a1,a1,a1,a1 - - v_uint16 a16[4]; - for(int j = 0; j < 4; j++) - a16[j] = v_reinterpret_as_u16(v[j] & amask); - - v_uint32 a32[4]; - for(int j = 0; j < 4; j++) - a32[j] = v_reinterpret_as_u32(a16[j] | (a16[j] >> 8)); - - v_uint8 a[4]; - for(int j = 
0; j < 4; j++) - a[j] = v_reinterpret_as_u8(a32[j] | (a32[j] >> 16)); - - v_uint16 m[8]; - for(int j = 0; j < 4; j++) - v_mul_expand(v[j], a[j], m[j], m[j+4]); - - for(int j = 0; j < 8; j++) - m[j] += vh; - - // div 255: (v+1+(v>>8))>8 - // +1 is in vh, has no effect on (v>>8) - for(int j = 0; j < 8; j++) - m[j] = (m[j] + (m[j] >> 8)) >> 8; - - v_uint8 d[4]; - for(int j = 0; j < 4; j++) - d[j] = v_pack(m[j], m[j+4]); - - for(int j = 0; j < 4; j++) - d[j] = v_select(amask, a[j], d[j]); - - for(int j = 0; j < 4; j++) - v_store(dst + j*vsize, d[j]); - } - - vx_cleanup(); -#endif - for(; i < n; i++, src += 4, dst += 4 ) - { - uchar v0 = src[0]; - uchar v1 = src[1]; - uchar v2 = src[2]; - uchar v3 = src[3]; - - dst[0] = (v0 * v3 + half_val) / max_val; - dst[1] = (v1 * v3 + half_val) / max_val; - dst[2] = (v2 * v3 + half_val) / max_val; - dst[3] = v3; - } - } -}; - - -template -struct mRGBA2RGBA -{ - typedef _Tp channel_type; - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - _Tp max_val = ColorChannel<_Tp>::max(); - for( int i = 0; i < n; i++ ) - { - _Tp v0 = *src++; - _Tp v1 = *src++; - _Tp v2 = *src++; - _Tp v3 = *src++; - _Tp v3_half = v3 / 2; - - *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v0 * max_val + v3_half) / v3); - *dst++ = (v3==0)? 0 : saturate_cast<_Tp>((v1 * max_val + v3_half) / v3); - *dst++ = (v3==0)? 
0 : saturate_cast<_Tp>((v2 * max_val + v3_half) / v3); - *dst++ = v3; - } - } -}; - - -template<> -struct mRGBA2RGBA -{ - typedef uchar channel_type; - - void operator()(const uchar* src, uchar* dst, int n) const - { - uchar max_val = ColorChannel::max(); - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 amask = v_reinterpret_as_u8(vx_setall_u32(0xFF000000)); - v_uint8 vmax = vx_setall_u8(max_val); - - for( ; i <= n-vsize/4; - i += vsize/4, src += vsize, dst += vsize) - { - v_uint8 s = vx_load(src + 0*vsize); - - // r0,g0,b0,a0,r1,g1,b1,a1 => 00,00,00,a0,00,00,00,a1 => - // => 00,00,a0,a0,00,00,a1,a1 - // => a0,a0,a0,a0,a1,a1,a1,a1 - v_uint8 a; - v_uint16 a16; - v_uint32 a32; - a16 = v_reinterpret_as_u16(s & amask); - a32 = v_reinterpret_as_u32(a16 | (a16 >> 8)); - a = v_reinterpret_as_u8(a32 | (a32 >> 16)); - - // s *= max_val - v_uint16 s0, s1; - v_mul_expand(s, vmax, s0, s1); - - // s += a/2 - v_uint16 ae0, ae1; - v_expand(a, ae0, ae1); - s0 += ae0 >> 1; s1 += ae1 >> 1; - - // s, a -> u32 -> float - v_uint32 u00, u01, u10, u11; - v_int32 s00, s01, s10, s11; - v_expand(s0, u00, u01); - v_expand(s1, u10, u11); - s00 = v_reinterpret_as_s32(u00); - s01 = v_reinterpret_as_s32(u01); - s10 = v_reinterpret_as_s32(u10); - s11 = v_reinterpret_as_s32(u11); - - v_uint32 ua00, ua01, ua10, ua11; - v_int32 a00, a01, a10, a11; - v_expand(ae0, ua00, ua01); - v_expand(ae1, ua10, ua11); - a00 = v_reinterpret_as_s32(ua00); - a01 = v_reinterpret_as_s32(ua01); - a10 = v_reinterpret_as_s32(ua10); - a11 = v_reinterpret_as_s32(ua11); - - v_float32 fs00, fs01, fs10, fs11; - fs00 = v_cvt_f32(s00); - fs01 = v_cvt_f32(s01); - fs10 = v_cvt_f32(s10); - fs11 = v_cvt_f32(s11); - - v_float32 fa00, fa01, fa10, fa11; - fa00 = v_cvt_f32(a00); - fa01 = v_cvt_f32(a01); - fa10 = v_cvt_f32(a10); - fa11 = v_cvt_f32(a11); - - // float d = (float)s/(float)a - v_float32 fd00, fd01, fd10, fd11; - fd00 = fs00/fa00; - fd01 = fs01/fa01; - fd10 = fs10/fa10; - fd11 = fs11/fa11; - - // d 
-> u32 -> u8 - v_uint32 ud00, ud01, ud10, ud11; - ud00 = v_reinterpret_as_u32(v_trunc(fd00)); - ud01 = v_reinterpret_as_u32(v_trunc(fd01)); - ud10 = v_reinterpret_as_u32(v_trunc(fd10)); - ud11 = v_reinterpret_as_u32(v_trunc(fd11)); - v_uint16 ud0, ud1; - ud0 = v_pack(ud00, ud01); - ud1 = v_pack(ud10, ud11); - v_uint8 d; - d = v_pack(ud0, ud1); - - // if a == 0 then d = 0 - v_uint8 am; - am = a != vx_setzero_u8(); - d = d & am; - - // put alpha values - d = v_select(amask, a, d); - - v_store(dst, d); - } - - vx_cleanup(); -#endif - for(; i < n; i++, src += 4, dst += 4 ) - { - uchar v0 = src[0]; - uchar v1 = src[1]; - uchar v2 = src[2]; - uchar v3 = src[3]; - - uchar v3_half = v3 / 2; - - dst[0] = (v3==0)? 0 : (v0 * max_val + v3_half) / v3; - dst[1] = (v3==0)? 0 : (v1 * max_val + v3_half) / v3; - dst[2] = (v3==0)? 0 : (v2 * max_val + v3_half) / v3; - dst[3] = v3; - - dst[0] = (v3==0)? 0 : saturate_cast((v0 * max_val + v3_half) / v3); - dst[1] = (v3==0)? 0 : saturate_cast((v1 * max_val + v3_half) / v3); - dst[2] = (v3==0)? 
0 : saturate_cast((v2 * max_val + v3_half) / v3); - dst[3] = v3; - } - } -}; +namespace cv { // // IPP functions @@ -1051,25 +20,25 @@ struct mRGBA2RGBA #if NEED_IPP -static ippiColor2GrayFunc ippiColor2GrayC3Tab[] = +static const ippiColor2GrayFunc ippiColor2GrayC3Tab[] = { (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0, 0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0 }; -static ippiColor2GrayFunc ippiColor2GrayC4Tab[] = +static const ippiColor2GrayFunc ippiColor2GrayC4Tab[] = { (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0, 0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0 }; -static ippiGeneralFunc ippiRGB2GrayC3Tab[] = +static const ippiGeneralFunc ippiRGB2GrayC3Tab[] = { (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0, 0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0 }; -static ippiGeneralFunc ippiRGB2GrayC4Tab[] = +static const ippiGeneralFunc ippiRGB2GrayC4Tab[] = { (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0, 0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0 @@ -1208,8 +177,7 @@ static ippiReorderFunc ippiSwapChannelsC4RTab[] = // HAL functions // -namespace hal -{ +namespace hal { // 8u, 16u, 32f void cvtBGRtoBGR(const uchar * src_data, size_t src_step, @@ -1265,13 +233,8 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 
2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); + CV_CPU_DISPATCH(cvtBGRtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1284,7 +247,8 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits)); + CV_CPU_DISPATCH(cvtBGRtoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1297,7 +261,8 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 2 : 0, greenBits)); + CV_CPU_DISPATCH(cvtBGR5x5toBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 16u, 32f @@ -1340,13 +305,8 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step, } #endif - int blueIdx = swapBlue ? 
2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); + CV_CPU_DISPATCH(cvtBGRtoGray, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue), + CV_CPU_DISPATCH_MODES_ALL); } // 8u, 16u, 32f @@ -1390,12 +350,8 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step, } #endif - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); + CV_CPU_DISPATCH(cvtGraytoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1407,7 +363,9 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits)); + + CV_CPU_DISPATCH(cvtBGR5x5toGray, (src_data, src_step, dst_data, dst_step, width, height, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } // only 8u @@ -1419,7 +377,9 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits)); + + CV_CPU_DISPATCH(cvtGraytoBGR5x5, (src_data, src_step, dst_data, dst_step, width, height, greenBits), + CV_CPU_DISPATCH_MODES_ALL); } void 
cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, @@ -1439,7 +399,8 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, } #endif - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA()); + CV_CPU_DISPATCH(cvtRGBAtoMultipliedRGBA, (src_data, src_step, dst_data, dst_step, width, height), + CV_CPU_DISPATCH_MODES_ALL); } void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, @@ -1449,7 +410,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA()); + + CV_CPU_DISPATCH(cvtMultipliedRGBAtoRGBA, (src_data, src_step, dst_data, dst_step, width, height), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff --git a/modules/imgproc/src/color_rgb.simd.hpp b/modules/imgproc/src/color_rgb.simd.hpp index 9245f26d05..76dc4e5e1e 100644 --- a/modules/imgproc/src/color_rgb.simd.hpp +++ b/modules/imgproc/src/color_rgb.simd.hpp @@ -3,13 +3,58 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -#define IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 1 +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations -namespace cv -{ +void cvtBGRtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, int dcn, bool swapBlue); +void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int greenBits); +void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int greenBits); +void cvtBGRtoGray(const uchar 
* src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue); +void cvtGraytoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn); +void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits); +void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int greenBits); +void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height); +void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif + +namespace { ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// template struct v_type; @@ -1044,172 +1089,7 @@ struct mRGBA2RGBA } } }; - -// -// IPP functions -// - -#if NEED_IPP - -static ippiColor2GrayFunc ippiColor2GrayC3Tab[] = -{ - (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0, - 0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0 -}; - -static ippiColor2GrayFunc ippiColor2GrayC4Tab[] = -{ - (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0, - 0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2GrayC3Tab[] = -{ - (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0, - 0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0 -}; - -static ippiGeneralFunc ippiRGB2GrayC4Tab[] = -{ - (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0, - 0, 
(ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0 -}; - - -#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 -static IppStatus ippiGrayToRGB_C1C3R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} -#endif -static IppStatus ippiGrayToRGB_C1C3R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} -static IppStatus ippiGrayToRGB_C1C3R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C3R, pSrc, srcStep, pDst, dstStep, roiSize); -} - -static IppStatus ippiGrayToRGB_C1C4R(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, IppiSize roiSize, Ipp8u aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_8u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} -static IppStatus ippiGrayToRGB_C1C4R(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, IppiSize roiSize, Ipp16u aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_16u_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} -static IppStatus ippiGrayToRGB_C1C4R(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, IppiSize roiSize, Ipp32f aval) -{ - return CV_INSTRUMENT_FUN_IPP(ippiGrayToRGB_32f_C1C4R, pSrc, srcStep, pDst, dstStep, roiSize, aval); -} - -struct IPPColor2GrayFunctor -{ - IPPColor2GrayFunctor(ippiColor2GrayFunc _func) : - ippiColorToGray(_func) - { - coeffs[0] = B2YF; - coeffs[1] = G2YF; - coeffs[2] = R2YF; - } - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiColorToGray ? 
CV_INSTRUMENT_FUN_IPP(ippiColorToGray, src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false; - } -private: - ippiColor2GrayFunc ippiColorToGray; - Ipp32f coeffs[3]; -}; - -template -struct IPPGray2BGRFunctor -{ - IPPGray2BGRFunctor(){} - - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiGrayToRGB_C1C3R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows)) >= 0; - } -}; - -template -struct IPPGray2BGRAFunctor -{ - IPPGray2BGRAFunctor() - { - alpha = ColorChannel::max(); - } - - bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const - { - return ippiGrayToRGB_C1C4R((T*)src, srcStep, (T*)dst, dstStep, ippiSize(cols, rows), alpha) >= 0; - } - - T alpha; -}; - -static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_8u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u); -} - -static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_16u_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u); -} - -static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep, - IppiSize roiSize, const int *dstOrder) -{ - return CV_INSTRUMENT_FUN_IPP(ippiSwapChannels_32f_C3C4R, pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f); -} - -// shared -ippiReorderFunc ippiSwapChannelsC3C4RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0 -}; - -static ippiGeneralFunc ippiCopyAC4C3RTab[] = -{ - (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 
0, - 0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0 -}; - -// shared -ippiReorderFunc ippiSwapChannelsC4C3RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0 -}; - -// shared -ippiReorderFunc ippiSwapChannelsC3RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0 -}; - -#if IPP_VERSION_X100 >= 810 -static ippiReorderFunc ippiSwapChannelsC4RTab[] = -{ - (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0, - 0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0 -}; -#endif - -#endif - -// -// HAL functions -// - -namespace hal -{ +} // namespace anon // 8u, 16u, 32f void cvtBGRtoBGR(const uchar * src_data, size_t src_step, @@ -1219,52 +1099,6 @@ void cvtBGRtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoBGR, cv_hal_cvtBGRtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(scn == 3 && dcn == 4 && !swapBlue) - { - if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) ) - return; - } - else if(scn == 4 && dcn == 3 && !swapBlue) - { - if ( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) ) - return; - } - else if(scn == 3 && dcn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) ) - return; - } - else if(scn == 4 && dcn == 3 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) ) - return; - } - else if(scn 
== 3 && dcn == 3 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) ) - return; - } -#if IPP_VERSION_X100 >= 810 - else if(scn == 4 && dcn == 4 && swapBlue) - { - if( CvtColorIPPLoopCopy(src_data, src_step, CV_MAKETYPE(depth, scn), dst_data, dst_step, width, height, - IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) ) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB(scn, dcn, blueIdx)); @@ -1282,8 +1116,6 @@ void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoBGR5x5, cv_hal_cvtBGRtoBGR5x5, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2RGB5x5(scn, swapBlue ? 2 : 0, greenBits)); } @@ -1295,8 +1127,6 @@ void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGR5x5toBGR, cv_hal_cvtBGR5x5toBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, greenBits); - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52RGB(dcn, swapBlue ? 
2 : 0, greenBits)); } @@ -1308,38 +1138,6 @@ void cvtBGRtoGray(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoGray, cv_hal_cvtBGRtoGray, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - if(depth == CV_32F && scn == 3 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 3 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 4 && !swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) ) - return; - } - else if(depth == CV_32F && scn == 4 && swapBlue) - { - if( CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) ) - return; - } - } -#endif - int blueIdx = swapBlue ? 
2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2Gray(scn, blueIdx, 0)); @@ -1357,39 +1155,6 @@ void cvtGraytoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtGraytoBGR, cv_hal_cvtGraytoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn); - -#if defined(HAVE_IPP) && IPP_VERSION_X100 >= 700 - CV_IPP_CHECK() - { - bool ippres = false; - if(dcn == 3) - { - if( depth == CV_8U ) - { -#if !IPP_DISABLE_CVTCOLOR_GRAY2BGR_8UC3 - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); -#endif - } - else if( depth == CV_16U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); - else - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRFunctor()); - } - else if(dcn == 4) - { - if( depth == CV_8U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - else if( depth == CV_16U ) - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - else - ippres = CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, IPPGray2BGRAFunctor()); - } - if(ippres) - return; - } -#endif - if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB(dcn)); else if( depth == CV_16U ) @@ -1406,7 +1171,6 @@ void cvtBGR5x5toGray(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGR5x5toGray, cv_hal_cvtBGR5x5toGray, src_data, src_step, dst_data, dst_step, width, height, greenBits); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB5x52Gray(greenBits)); } @@ -1418,7 +1182,6 @@ void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtGraytoBGR5x5, cv_hal_cvtGraytoBGR5x5, src_data, src_step, dst_data, dst_step, 
width, height, greenBits); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, Gray2RGB5x5(greenBits)); } @@ -1428,17 +1191,6 @@ void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtRGBAtoMultipliedRGBA, cv_hal_cvtRGBAtoMultipliedRGBA, src_data, src_step, dst_data, dst_step, width, height); - -#ifdef HAVE_IPP - CV_IPP_CHECK() - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R))) - return; - } -#endif - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGBA2mRGBA()); } @@ -1448,209 +1200,9 @@ void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtMultipliedRGBAtoRGBA, cv_hal_cvtMultipliedRGBAtoRGBA, src_data, src_step, dst_data, dst_step, width, height); CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, mRGBA2RGBA()); } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool reverse ) -{ - OclHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? 
"REVERSE" : "ORDER"))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits ) -{ - OclHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); - - if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int gbits) -{ - OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits) -{ - OclHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); - - if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) -{ - OclHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); - - if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx) -{ - OclHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); - - int stripeSize = 1; - if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc, - format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize))) - { - return false; - } - - h.globalSize[0] = (h.src.cols + stripeSize - 1)/stripeSize; - return h.run(); -} - -bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) -{ - OclHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); - if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc, - format("-D bidx=0 -D dcn=%d", dcn))) - { - return false; 
- } - - return h.run(); -} - -bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) -{ - OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc, - "-D dcn=4 -D bidx=3")) - { - return false; - } - - return h.run(); -} - -bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) -{ - OclHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc, - "-D dcn=4 -D bidx=3")) - { - return false; - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb) -{ - CvtHelper< Set<3, 4>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtBGRtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, dcn, swapb); -} - -void cvtColorBGR25x5( InputArray _src, OutputArray _dst, bool swapb, int gbits) -{ - CvtHelper< Set<3, 4>, Set<2>, Set > h(_src, _dst, 2); - - hal::cvtBGRtoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.scn, swapb, gbits); -} - -void cvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int gbits) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtBGR5x5toBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - dcn, swapb, gbits); -} - -void cvtColorBGR2Gray( InputArray _src, OutputArray _dst, bool swapb) -{ - CvtHelper< Set<3, 4>, Set<1>, Set > h(_src, _dst, 1); - - hal::cvtBGRtoGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb); -} - -void cvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtGraytoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, h.depth, dcn); -} - -void cvtColor5x52Gray( InputArray _src, OutputArray 
_dst, int gbits) -{ - CvtHelper< Set<2>, Set<1>, Set > h(_src, _dst, 1); - - hal::cvtBGR5x5toGray(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); -} - -void cvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits) -{ - CvtHelper< Set<1>, Set<2>, Set > h(_src, _dst, 2); - - hal::cvtGraytoBGR5x5(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, gbits); -} - -void cvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst) -{ - CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - hal::cvtRGBAtoMultipliedRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); -} - -void cvtColormRGBA2RGBA( InputArray _src, OutputArray _dst) -{ - CvtHelper< Set<4>, Set<4>, Set > h(_src, _dst, 4); - - hal::cvtMultipliedRGBAtoRGBA(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows); -} - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/imgproc/src/color_yuv.dispatch.cpp b/modules/imgproc/src/color_yuv.dispatch.cpp index 7d731378e2..6cb508f980 100644 --- a/modules/imgproc/src/color_yuv.dispatch.cpp +++ b/modules/imgproc/src/color_yuv.dispatch.cpp @@ -3,1747 +3,19 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" +#include "opencl_kernels_imgproc.hpp" + #include "color.hpp" -namespace cv -{ +#include "color_yuv.simd.hpp" +#include "color_yuv.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -//constants for conversion from/to RGB and YUV, YCrCb according to BT.601 - -//to YCbCr -static const float YCBF = 0.564f; // == 1/2/(1-B2YF) -static const float YCRF = 0.713f; // == 1/2/(1-R2YF) -static const int YCBI = 9241; // == YCBF*16384 -static const int YCRI = 11682; // == YCRF*16384 -//to YUV -static const float B2UF = 0.492f; -static const float R2VF = 0.877f; -static const int B2UI = 8061; // == B2UF*16384 -static const int R2VI = 14369; // == R2VF*16384 
-//from YUV -static const float U2BF = 2.032f; -static const float U2GF = -0.395f; -static const float V2GF = -0.581f; -static const float V2RF = 1.140f; -static const int U2BI = 33292; -static const int U2GI = -6472; -static const int V2GI = -9519; -static const int V2RI = 18678; -//from YCrCb -static const float CB2BF = 1.773f; -static const float CB2GF = -0.344f; -static const float CR2GF = -0.714f; -static const float CR2RF = 1.403f; -static const int CB2BI = 29049; -static const int CB2GI = -5636; -static const int CR2GI = -11698; -static const int CR2RI = 22987; - -///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// - -template struct RGB2YCrCb_f -{ - typedef _Tp channel_type; - - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : - srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; - static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - n *= 3; - for(int i = 0; i < n; i += 3, src += scn) - { - _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2); - _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta); - _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta); - dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb; - } - } - int srccn, blueIdx; - bool isCrCb; - float coeffs[5]; -}; - -template <> -struct RGB2YCrCb_f -{ - typedef float channel_type; - - RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) : - srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF }; - static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx == 0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const float * src, float * dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - - int i = 0; -#if CV_SIMD - v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1), vc2 = vx_setall_f32(C2); - v_float32 vc3 = vx_setall_f32(C3), vc4 = vx_setall_f32(C4); - v_float32 vdelta = vx_setall_f32(delta); - const int vsize = v_float32::nlanes; - for( ; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*3) - { - v_float32 b, g, r, dummy; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, dummy); - } - - v_float32 y, cr, cb; - y = v_fma(b, vc0, v_fma(g, vc1, r*vc2)); - - if(bidx) - std::swap(r, b); - - cr = v_fma(r - y, vc3, vdelta); - cb = v_fma(b - y, vc4, vdelta); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - for ( ; i < n; i ++, src += scn, dst += 3) - { - float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; - float Cr = (src[bidx^2] - Y)*C3 + delta; - float Cb = (src[bidx] - Y)*C4 + delta; - dst[0 ] = Y; - dst[1+yuvOrder] = Cr; - dst[2-yuvOrder] = Cb; - } - } - - int srccn, blueIdx; - bool isCrCb; - float coeffs[5]; -}; - - -template struct RGB2YCrCb_i -{ - typedef _Tp channel_type; - static const int shift = yuv_shift; - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx==0) std::swap(coeffs[0], coeffs[2]); - } - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel<_Tp>::half()*(1 << shift); - n *= 3; - for(int i = 0; i < n; i += 3, src += scn) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); - dst[i] = saturate_cast<_Tp>(Y); - dst[i+1+yuvOrder] = saturate_cast<_Tp>(Cr); - dst[i+2-yuvOrder] = saturate_cast<_Tp>(Cb); - } - } - int srccn, blueIdx; - bool isCrCb; - int coeffs[5]; -}; - - -template<> -struct RGB2YCrCb_i -{ - typedef ushort channel_type; - static const int shift = yuv_shift; - static const int fix_shift = (int)(sizeof(short)*8 - shift); - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - if(blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int scn = srccn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int sdelta = ColorChannel::half()*(1 << shift); - int i = 0; -#if CV_SIMD - const int vsize = v_uint16::nlanes; - const int descale = 1 << (shift-1); - - v_int16 b2y = vx_setall_s16((short)C0); - v_int16 g2y = vx_setall_s16((short)C1); - v_int16 r2y = vx_setall_s16((short)C2); - v_int16 one = vx_setall_s16(1); - v_int16 z = vx_setzero_s16(); - - v_int16 bg2y, r12y; - v_int16 dummy; - v_zip(b2y, g2y, bg2y, dummy); - v_zip(r2y, one, r12y, dummy); - - v_int16 vdescale = vx_setall_s16(1 << (shift-1)); - v_int32 vc3 = vx_setall_s32(C3); - v_int32 vc4 = vx_setall_s32(C4); - v_int32 vdd = vx_setall_s32(sdelta + descale); - - for(; i <= n-vsize; - i += vsize, src += vsize*scn, dst += vsize*3) - { - v_uint16 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_uint16 y, cr, cb; - - v_int16 sb = v_reinterpret_as_s16(b); - v_int16 sr = v_reinterpret_as_s16(r); - v_int16 sg = v_reinterpret_as_s16(g); - - v_int16 bg0, bg1; - v_int16 rd0, rd1; - v_zip(sb, sg, bg0, bg1); - v_zip(sr, vdescale, rd0, rd1); - - // fixing 16bit signed multiplication - v_int16 mr, mg, mb; - mr = (sr < z) & r2y; - mg = (sg < z) & g2y; - mb = (sb < z) & b2y; - v_int16 fixmul = v_add_wrap(mr, v_add_wrap(mg, mb)) << fix_shift; - - v_int32 ssy0 = (v_dotprod(bg0, bg2y) + v_dotprod(rd0, r12y)) >> shift; - v_int32 ssy1 = (v_dotprod(bg1, bg2y) + v_dotprod(rd1, r12y)) >> shift; - - y = v_reinterpret_as_u16(v_add_wrap(v_pack(ssy0, ssy1), fixmul)); - - if(bidx) - swap(r, b); - - // (r-Y) and (b-Y) don't fit into int16 or uint16 range - v_uint32 r0, r1, b0, b1; - v_expand(r, r0, r1); - v_expand(b, b0, b1); - - 
v_uint32 uy0, uy1; - v_expand(y, uy0, uy1); - - v_int32 sr0 = v_reinterpret_as_s32(r0); - v_int32 sr1 = v_reinterpret_as_s32(r1); - v_int32 sb0 = v_reinterpret_as_s32(b0); - v_int32 sb1 = v_reinterpret_as_s32(b1); - v_int32 sy0 = v_reinterpret_as_s32(uy0); - v_int32 sy1 = v_reinterpret_as_s32(uy1); - - sr0 = sr0 - sy0; sr1 = sr1 - sy1; - sb0 = sb0 - sy0; sb1 = sb1 - sy1; - - v_int32 scr0, scr1, scb0, scb1; - - scr0 = (sr0*vc3 + vdd) >> shift; - scr1 = (sr1*vc3 + vdd) >> shift; - scb0 = (sb0*vc4 + vdd) >> shift; - scb1 = (sb1*vc4 + vdd) >> shift; - - // saturate and pack - cr = v_pack_u(scr0, scr1); - cb = v_pack_u(scb0, scb1); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - for( ; i < n; i++, src += scn, dst += 3) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + sdelta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + sdelta, shift); - dst[0] = saturate_cast(Y); - dst[1+yuvOrder] = saturate_cast(Cr); - dst[2-yuvOrder] = saturate_cast(Cb); - } - } - int srccn, blueIdx; - bool isCrCb; - int coeffs[5]; -}; - - -template <> -struct RGB2YCrCb_i -{ - typedef uchar channel_type; - static const int shift = yuv_shift; - - RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb) - : srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI }; - static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI }; - for(int i = 0; i < 5; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - if (blueIdx==0) - std::swap(coeffs[0], coeffs[2]); - } - - void operator()(const uchar * src, uchar * dst, int n) const - { - int scn = srccn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; - int delta = ColorChannel::half()*(1 << shift); - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - const int descaleShift = 1 << (shift-1); - v_int16 bg2y; - v_int16 r12y; - v_int16 dummy; - v_zip(vx_setall_s16((short)C0), vx_setall_s16((short)C1), bg2y, dummy); - v_zip(vx_setall_s16((short)C2), vx_setall_s16( 1), r12y, dummy); - - // delta + descaleShift == descaleShift*(half*2+1) - v_int16 c3h, c4h; - const short h21 = (short)(ColorChannel::half()*2+1); - v_zip(vx_setall_s16((short)C3), vx_setall_s16(h21), c3h, dummy); - v_zip(vx_setall_s16((short)C4), vx_setall_s16(h21), c4h, dummy); - - v_int16 vdescale = vx_setall_s16(descaleShift); - - for( ; i <= n-vsize; - i += vsize, src += scn*vsize, dst += 3*vsize) - { - v_uint8 r, g, b, a; - if(scn == 3) - { - v_load_deinterleave(src, b, g, r); - } - else - { - v_load_deinterleave(src, b, g, r, a); - } - - v_uint8 y; - - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_int16 sr0, sr1, sg0, sg1, sb0, sb1; - sr0 = v_reinterpret_as_s16(r0); sr1 = v_reinterpret_as_s16(r1); - sg0 = v_reinterpret_as_s16(g0); sg1 = v_reinterpret_as_s16(g1); - sb0 = v_reinterpret_as_s16(b0); sb1 = v_reinterpret_as_s16(b1); - - v_uint32 y00, y01, y10, y11; - { - v_int16 bg00, bg01, bg10, bg11; - v_int16 rd00, rd01, rd10, rd11; - v_zip(sb0, sg0, bg00, bg01); - v_zip(sb1, sg1, bg10, bg11); - v_zip(sr0, vdescale, rd00, rd01); - v_zip(sr1, vdescale, rd10, rd11); - - y00 = v_reinterpret_as_u32(v_dotprod(bg00, bg2y) + v_dotprod(rd00, r12y)) >> shift; - y01 = v_reinterpret_as_u32(v_dotprod(bg01, bg2y) + v_dotprod(rd01, r12y)) >> shift; - y10 = 
v_reinterpret_as_u32(v_dotprod(bg10, bg2y) + v_dotprod(rd10, r12y)) >> shift; - y11 = v_reinterpret_as_u32(v_dotprod(bg11, bg2y) + v_dotprod(rd11, r12y)) >> shift; - } - - v_uint16 y0, y1; - y0 = v_pack(y00, y01); - y1 = v_pack(y10, y11); - - y = v_pack(y0, y1); - - v_int16 sy0, sy1; - sy0 = v_reinterpret_as_s16(y0); - sy1 = v_reinterpret_as_s16(y1); - - // (r-Y) and (b-Y) don't fit into 8 bit, use 16 bits instead - sr0 = v_sub_wrap(sr0, sy0); - sr1 = v_sub_wrap(sr1, sy1); - sb0 = v_sub_wrap(sb0, sy0); - sb1 = v_sub_wrap(sb1, sy1); - - if(bidx) - { - swap(sr0, sb0); swap(sr1, sb1); - } - - v_int32 cr00, cr01, cr10, cr11; - v_int32 cb00, cb01, cb10, cb11; - - // delta + descaleShift == descaleShift*(half*2+1) - { - v_int16 rd00, rd01, rd10, rd11; - v_int16 bd00, bd01, bd10, bd11; - - v_zip(sr0, vdescale, rd00, rd01); - v_zip(sr1, vdescale, rd10, rd11); - - v_zip(sb0, vdescale, bd00, bd01); - v_zip(sb1, vdescale, bd10, bd11); - - cr00 = v_dotprod(rd00, c3h); - cr01 = v_dotprod(rd01, c3h); - cr10 = v_dotprod(rd10, c3h); - cr11 = v_dotprod(rd11, c3h); - - cb00 = v_dotprod(bd00, c4h); - cb01 = v_dotprod(bd01, c4h); - cb10 = v_dotprod(bd10, c4h); - cb11 = v_dotprod(bd11, c4h); - } - - v_uint8 cr, cb; - - cr00 = cr00 >> shift; - cr01 = cr01 >> shift; - cr10 = cr10 >> shift; - cr11 = cr11 >> shift; - - cb00 = cb00 >> shift; - cb01 = cb01 >> shift; - cb10 = cb10 >> shift; - cb11 = cb11 >> shift; - - v_int16 cr0, cr1, cb0, cb1; - cr0 = v_pack(cr00, cr01); cr1 = v_pack(cr10, cr11); - cb0 = v_pack(cb00, cb01); cb1 = v_pack(cb10, cb11); - - cr = v_pack_u(cr0, cr1); - cb = v_pack_u(cb0, cb1); - - if(yuvOrder) - { - v_store_interleave(dst, y, cb, cr); - } - else - { - v_store_interleave(dst, y, cr, cb); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += scn, dst += 3) - { - int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, shift); - int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, shift); - int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, shift); - dst[0] = 
saturate_cast(Y); - dst[1+yuvOrder] = saturate_cast(Cr); - dst[2-yuvOrder] = saturate_cast(Cb); - } - } - - int srccn, blueIdx, coeffs[5]; - bool isCrCb; -}; - - -template struct YCrCb2RGB_f -{ - typedef _Tp channel_type; - - YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; - static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_cbr[i] : coeffs_yuv[i]; - } - } - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - for(int i = 0; i < n; i += 3, dst += dcn) - { - _Tp Y = src[i]; - _Tp Cr = src[i+1+yuvOrder]; - _Tp Cb = src[i+2-yuvOrder]; - - _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3); - _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1); - _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0); - - dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - float coeffs[4]; -}; - - -template<> -struct YCrCb2RGB_f -{ - typedef float channel_type; - - YCrCb2RGB_f(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const float coeffs_cbr[] = {CR2RF, CR2GF, CB2GF, CB2BF}; - static const float coeffs_yuv[] = { V2RF, V2GF, U2GF, U2BF}; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? 
coeffs_cbr[i] : coeffs_yuv[i]; - } - } - - void operator()(const float* src, float* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const float delta = ColorChannel::half(), alpha = ColorChannel::max(); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - - int i = 0; -#if CV_SIMD - v_float32 vc0 = vx_setall_f32(C0), vc1 = vx_setall_f32(C1); - v_float32 vc2 = vx_setall_f32(C2), vc3 = vx_setall_f32(C3); - v_float32 vdelta = vx_setall_f32(delta); - v_float32 valpha = vx_setall_f32(alpha); - const int vsize = v_float32::nlanes; - for( ; i <= n-vsize; - i += vsize, src += vsize*3, dst += vsize*dcn) - { - v_float32 y, cr, cb; - if(yuvOrder) - v_load_deinterleave(src, y, cb, cr); - else - v_load_deinterleave(src, y, cr, cb); - - v_float32 b, g, r; - - cb -= vdelta; cr -= vdelta; - b = v_fma(cb, vc3, y); - g = v_fma(cr, vc1, v_fma(cb, vc2, y)); - r = v_fma(cr, vc0, y); - - if(bidx) - swap(r, b); - - if(dcn == 3) - v_store_interleave(dst, b, g, r); - else - v_store_interleave(dst, b, g, r, valpha); - } - vx_cleanup(); -#endif - for(; i < n; i++, src += 3, dst += dcn) - { - float Y = src[0]; - float Cr = src[1+yuvOrder]; - float Cb = src[2-yuvOrder]; - - float b = Y + (Cb - delta)*C3; - float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; - float r = Y + (Cr - delta)*C0; - - dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - float coeffs[4]; -}; - - -template struct YCrCb2RGB_i -{ - typedef _Tp channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const _Tp* src, _Tp* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - n *= 3; - for(int i = 0; i < n; i += 3, dst += dcn) - { - _Tp Y = src[i]; - _Tp Cr = src[i+1+yuvOrder]; - _Tp Cb = src[i+2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast<_Tp>(b); - dst[1] = saturate_cast<_Tp>(g); - dst[bidx^2] = saturate_cast<_Tp>(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; - - -template <> -struct YCrCb2RGB_i -{ - typedef uchar channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? 
coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const uchar* src, uchar* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 valpha = vx_setall_u8(alpha); - v_uint8 vdelta = vx_setall_u8(delta); - const int descaleShift = 1 << (shift - 1); - v_int32 vdescale = vx_setall_s32(descaleShift); - - v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); - // if YUV then C3 > 2^15, need to subtract it - // to fit in short by short multiplication - v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); - - for( ; i <= n-vsize; - i += vsize, src += 3*vsize, dst += dcn*vsize) - { - v_uint8 y, cr, cb; - if(yuvOrder) - { - v_load_deinterleave(src, y, cb, cr); - } - else - { - v_load_deinterleave(src, y, cr, cb); - } - - cr = v_sub_wrap(cr, vdelta); - cb = v_sub_wrap(cb, vdelta); - - v_int8 scr = v_reinterpret_as_s8(cr); - v_int8 scb = v_reinterpret_as_s8(cb); - - v_int16 scr0, scr1, scb0, scb1; - v_expand(scr, scr0, scr1); - v_expand(scb, scb0, scb1); - - v_int32 b00, b01, b10, b11; - v_int32 g00, g01, g10, g11; - v_int32 r00, r01, r10, r11; - - v_mul_expand(scb0, vc3, b00, b01); - v_mul_expand(scb1, vc3, b10, b11); - if(yuvOrder) - { - // if YUV then C3 > 2^15 - // so we fix the multiplication - v_int32 cb00, cb01, cb10, cb11; - v_expand(scb0, cb00, cb01); - v_expand(scb1, cb10, cb11); - b00 += cb00 << 15; b01 += cb01 << 15; - b10 += cb10 << 15; b11 += cb11 << 15; - } - - v_int32 t00, t01, t10, t11; - v_mul_expand(scb0, vc2, t00, t01); - v_mul_expand(scb1, vc2, t10, t11); - v_mul_expand(scr0, vc1, g00, g01); - v_mul_expand(scr1, vc1, g10, g11); - g00 += t00; g01 += t01; - g10 += t10; g11 += t11; - v_mul_expand(scr0, vc0, r00, r01); - 
v_mul_expand(scr1, vc0, r10, r11); - - b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift; - b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift; - g00 = (g00 + vdescale) >> shift; g01 = (g01 + vdescale) >> shift; - g10 = (g10 + vdescale) >> shift; g11 = (g11 + vdescale) >> shift; - r00 = (r00 + vdescale) >> shift; r01 = (r01 + vdescale) >> shift; - r10 = (r10 + vdescale) >> shift; r11 = (r11 + vdescale) >> shift; - - v_int16 b0, b1, g0, g1, r0, r1; - b0 = v_pack(b00, b01); b1 = v_pack(b10, b11); - g0 = v_pack(g00, g01); g1 = v_pack(g10, g11); - r0 = v_pack(r00, r01); r1 = v_pack(r10, r11); - - v_uint16 y0, y1; - v_expand(y, y0, y1); - v_int16 sy0, sy1; - sy0 = v_reinterpret_as_s16(y0); - sy1 = v_reinterpret_as_s16(y1); - - b0 = v_add_wrap(b0, sy0); b1 = v_add_wrap(b1, sy1); - g0 = v_add_wrap(g0, sy0); g1 = v_add_wrap(g1, sy1); - r0 = v_add_wrap(r0, sy0); r1 = v_add_wrap(r1, sy1); - - v_uint8 b, g, r; - b = v_pack_u(b0, b1); - g = v_pack_u(g0, g1); - r = v_pack_u(r0, r1); - - if(bidx) - swap(r, b); - - if(dcn == 3) - { - v_store_interleave(dst, b, g, r); - } - else - { - v_store_interleave(dst, b, g, r, valpha); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += 3, dst += dcn) - { - uchar Y = src[0]; - uchar Cr = src[1+yuvOrder]; - uchar Cb = src[2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; - - -template <> -struct YCrCb2RGB_i -{ - typedef ushort channel_type; - static const int shift = yuv_shift; - - YCrCb2RGB_i(int _dstcn, int _blueIdx, bool _isCrCb) - : dstcn(_dstcn), blueIdx(_blueIdx), isCrCb(_isCrCb) - { - static const int coeffs_crb[] = { CR2RI, CR2GI, CB2GI, CB2BI}; - static const int 
coeffs_yuv[] = { V2RI, V2GI, U2GI, U2BI }; - for(int i = 0; i < 4; i++) - { - coeffs[i] = isCrCb ? coeffs_crb[i] : coeffs_yuv[i]; - } - } - - void operator()(const ushort* src, ushort* dst, int n) const - { - int dcn = dstcn, bidx = blueIdx, i = 0; - int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb - const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; - -#if CV_SIMD - const int vsize = v_uint16::nlanes; - const int descaleShift = 1 << (shift-1); - v_uint16 valpha = vx_setall_u16(alpha); - v_uint16 vdelta = vx_setall_u16(delta); - v_int16 vc0 = vx_setall_s16((short)C0), vc1 = vx_setall_s16((short)C1), vc2 = vx_setall_s16((short)C2); - // if YUV then C3 > 2^15, need to subtract it - // to fit in short by short multiplication - v_int16 vc3 = vx_setall_s16(yuvOrder ? (short)(C3-(1 << 15)) : (short)C3); - v_int32 vdescale = vx_setall_s32(descaleShift); - for(; i <= n-vsize; - i += vsize, src += vsize*3, dst += vsize*dcn) - { - v_uint16 y, cr, cb; - if(yuvOrder) - { - v_load_deinterleave(src, y, cb, cr); - } - else - { - v_load_deinterleave(src, y, cr, cb); - } - - v_uint32 uy0, uy1; - v_expand(y, uy0, uy1); - v_int32 y0 = v_reinterpret_as_s32(uy0); - v_int32 y1 = v_reinterpret_as_s32(uy1); - - cr = v_sub_wrap(cr, vdelta); - cb = v_sub_wrap(cb, vdelta); - - v_int32 b0, b1, g0, g1, r0, r1; - - v_int16 scb = v_reinterpret_as_s16(cb); - v_int16 scr = v_reinterpret_as_s16(cr); - v_mul_expand(scb, vc3, b0, b1); - if(yuvOrder) - { - // if YUV then C3 > 2^15 - // so we fix the multiplication - v_int32 cb0, cb1; - v_expand(scb, cb0, cb1); - b0 += cb0 << 15; - b1 += cb1 << 15; - } - v_int32 t0, t1; - v_mul_expand(scb, vc2, t0, t1); - v_mul_expand(scr, vc1, g0, g1); - g0 += t0; g1 += t1; - v_mul_expand(scr, vc0, r0, r1); - - // shifted term doesn't fit into 16 bits, addition is to be done in 32 bits - b0 = ((b0 + vdescale) >> shift) + y0; - b1 = ((b1 + vdescale) >> shift) + y1; - g0 = ((g0 + 
vdescale) >> shift) + y0; - g1 = ((g1 + vdescale) >> shift) + y1; - r0 = ((r0 + vdescale) >> shift) + y0; - r1 = ((r1 + vdescale) >> shift) + y1; - - // saturate and pack - v_uint16 b, g, r; - b = v_pack_u(b0, b1); - g = v_pack_u(g0, g1); - r = v_pack_u(r0, r1); - - if(bidx) - swap(r, b); - - if(dcn == 3) - { - v_store_interleave(dst, b, g, r); - } - else - { - v_store_interleave(dst, b, g, r, valpha); - } - } - vx_cleanup(); -#endif - - for ( ; i < n; i++, src += 3, dst += dcn) - { - ushort Y = src[0]; - ushort Cr = src[1+yuvOrder]; - ushort Cb = src[2-yuvOrder]; - - int b = Y + CV_DESCALE((Cb - delta)*C3, shift); - int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, shift); - int r = Y + CV_DESCALE((Cr - delta)*C0, shift); - - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); - if( dcn == 4 ) - dst[3] = alpha; - } - } - int dstcn, blueIdx; - bool isCrCb; - int coeffs[4]; -}; - - -///////////////////////////////////// YUV420 -> RGB ///////////////////////////////////// - -static const int ITUR_BT_601_CY = 1220542; -static const int ITUR_BT_601_CUB = 2116026; -static const int ITUR_BT_601_CUG = -409993; -static const int ITUR_BT_601_CVG = -852492; -static const int ITUR_BT_601_CVR = 1673527; -static const int ITUR_BT_601_SHIFT = 20; - -// Coefficients for RGB to YUV420p conversion -static const int ITUR_BT_601_CRY = 269484; -static const int ITUR_BT_601_CGY = 528482; -static const int ITUR_BT_601_CBY = 102760; -static const int ITUR_BT_601_CRU = -155188; -static const int ITUR_BT_601_CGU = -305135; -static const int ITUR_BT_601_CBU = 460324; -static const int ITUR_BT_601_CGV = -385875; -static const int ITUR_BT_601_CBV = -74448; - -//R = 1.164(Y - 16) + 1.596(V - 128) -//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) -//B = 1.164(Y - 16) + 2.018(U - 128) - -//R = (1220542(Y - 16) + 1673527(V - 128) + (1 << 19)) >> 20 -//G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20 -//B = (1220542(Y 
- 16) + 2116026(U - 128) + (1 << 19)) >> 20 - -static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) -{ - int uu, vv; - uu = int(u) - 128; - vv = int(v) - 128; - - ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv; - guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu; - buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; -} - -static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], - v_int32 (&guv)[4], - v_int32 (&buv)[4]) -{ - v_uint8 v128 = vx_setall_u8(128); - v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); - v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); - - v_int16 uu0, uu1, vv0, vv1; - v_expand(su, uu0, uu1); - v_expand(sv, vv0, vv1); - v_int32 uu[4], vv[4]; - v_expand(uu0, uu[0], uu[1]); v_expand(uu1, uu[2], uu[3]); - v_expand(vv0, vv[0], vv[1]); v_expand(vv1, vv[2], vv[3]); - - v_int32 vshift = vx_setall_s32(1 << (ITUR_BT_601_SHIFT - 1)); - v_int32 vr = vx_setall_s32(ITUR_BT_601_CVR); - v_int32 vg = vx_setall_s32(ITUR_BT_601_CVG); - v_int32 ug = vx_setall_s32(ITUR_BT_601_CUG); - v_int32 ub = vx_setall_s32(ITUR_BT_601_CUB); - - for (int k = 0; k < 4; k++) - { - ruv[k] = vshift + vr * vv[k]; - guv[k] = vshift + vg * vv[k] + ug * uu[k]; - buv[k] = vshift + ub * uu[k]; - } -} - -static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv, - uchar& r, uchar& g, uchar& b, uchar& a) -{ - int yy = int(vy); - int y = std::max(0, yy - 16) * ITUR_BT_601_CY; - r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); - g = saturate_cast((y + guv) >> ITUR_BT_601_SHIFT); - b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); - a = uchar(0xff); -} - -static inline void yRGBuvToRGBA(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], - v_uint8& rr, v_uint8& gg, v_uint8& bb) -{ - v_uint8 v16 = vx_setall_u8(16); - v_uint8 posY = vy - v16; - v_uint16 yy0, yy1; - v_expand(posY, 
yy0, yy1); - v_int32 yy[4]; - v_int32 yy00, yy01, yy10, yy11; - v_expand(v_reinterpret_as_s16(yy0), yy[0], yy[1]); - v_expand(v_reinterpret_as_s16(yy1), yy[2], yy[3]); - - v_int32 vcy = vx_setall_s32(ITUR_BT_601_CY); - - v_int32 y[4], r[4], g[4], b[4]; - for(int k = 0; k < 4; k++) - { - y[k] = yy[k]*vcy; - r[k] = (y[k] + ruv[k]) >> ITUR_BT_601_SHIFT; - g[k] = (y[k] + guv[k]) >> ITUR_BT_601_SHIFT; - b[k] = (y[k] + buv[k]) >> ITUR_BT_601_SHIFT; - } - - v_int16 r0, r1, g0, g1, b0, b1; - r0 = v_pack(r[0], r[1]); - r1 = v_pack(r[2], r[3]); - g0 = v_pack(g[0], g[1]); - g1 = v_pack(g[2], g[3]); - b0 = v_pack(b[0], b[1]); - b1 = v_pack(b[2], b[3]); - - rr = v_pack_u(r0, r1); - gg = v_pack_u(g0, g1); - bb = v_pack_u(b0, b1); -} - -template -static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v, - const uchar vy01, const uchar vy11, const uchar vy02, const uchar vy12, - uchar* row1, uchar* row2) -{ - int ruv, guv, buv; - uvToRGBuv(u, v, ruv, guv, buv); - - uchar r00, g00, b00, a00; - uchar r01, g01, b01, a01; - - yRGBuvToRGBA(vy01, ruv, guv, buv, r00, g00, b00, a00); - yRGBuvToRGBA(vy11, ruv, guv, buv, r01, g01, b01, a01); - - row1[2-bIdx] = r00; - row1[1] = g00; - row1[bIdx] = b00; - if(dcn == 4) - row1[3] = a00; - - row1[dcn+2-bIdx] = r01; - row1[dcn+1] = g01; - row1[dcn+0+bIdx] = b01; - if(dcn == 4) - row1[7] = a01; - - if(is420) - { - uchar r10, g10, b10, a10; - uchar r11, g11, b11, a11; - - yRGBuvToRGBA(vy02, ruv, guv, buv, r10, g10, b10, a10); - yRGBuvToRGBA(vy12, ruv, guv, buv, r11, g11, b11, a11); - - row2[2-bIdx] = r10; - row2[1] = g10; - row2[bIdx] = b10; - if(dcn == 4) - row2[3] = a10; - - row2[dcn+2-bIdx] = r11; - row2[dcn+1] = g11; - row2[dcn+0+bIdx] = b11; - if(dcn == 4) - row2[7] = a11; - } -} - -// bIdx is 0 or 2, uIdx is 0 or 1, dcn is 3 or 4 -template -struct YUV420sp2RGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *muv; - size_t stride; - - YUV420sp2RGB8Invoker(uchar * _dst_data, size_t 
_dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _uv) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), muv(_uv), stride(_stride) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - const int rangeBegin = range.start * 2; - const int rangeEnd = range.end * 2; - - const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2; - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; - - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for( ; i <= width - 2*vsize; - i += 2*vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) - { - v_uint8 u, v; - v_load_deinterleave(uv + i, u, v); - - if(uIdx) - { - swap(u, v); - } - - v_uint8 vy[4]; - v_load_deinterleave(y1 + i, vy[0], vy[1]); - v_load_deinterleave(y2 + i, vy[2], vy[3]); - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[4], g[4], b[4]; - - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } - - if(bIdx) - { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
- v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); - v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); - v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); - - if(dcn == 4) - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); - v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); - } - else //dcn == 3 - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); - v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); - } - } - vx_cleanup(); -#endif - for ( ; i < width; i += 2, row1 += dcn*2, row2 += dcn*2) - { - uchar u = uv[i + 0 + uIdx]; - uchar v = uv[i + 1 - uIdx]; - - uchar vy01 = y1[i]; - uchar vy11 = y1[i + 1]; - uchar vy02 = y2[i]; - uchar vy12 = y2[i + 1]; - - cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); - } - } - } -}; - -template -struct YUV420p2RGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - int width; - const uchar* my1, *mu, *mv; - size_t stride; - int ustepIdx, vstepIdx; - - YUV420p2RGB8Invoker(uchar * _dst_data, size_t _dst_step, int _dst_width, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx) - : dst_data(_dst_data), dst_step(_dst_step), width(_dst_width), my1(_y1), mu(_u), mv(_v), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - const int rangeBegin = range.start * 2; - const int rangeEnd = range.end * 2; - - int uvsteps[2] = {width/2, static_cast(stride) - width/2}; - int usIdx = ustepIdx, vsIdx = vstepIdx; - - const uchar* y1 = my1 + rangeBegin * stride; - const uchar* u1 = mu + (range.start / 2) * 
stride; - const uchar* v1 = mv + (range.start / 2) * stride; - - if(range.start % 2 == 1) - { - u1 += uvsteps[(usIdx++) & 1]; - v1 += uvsteps[(vsIdx++) & 1]; - } - - for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1]) - { - uchar* row1 = dst_data + dst_step * j; - uchar* row2 = dst_data + dst_step * (j + 1); - const uchar* y2 = y1 + stride; - int i = 0; - -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for( ; i <= width/2 - vsize; - i += vsize, row1 += vsize*dcn*2, row2 += vsize*dcn*2) - { - v_uint8 u, v; - u = vx_load(u1 + i); - v = vx_load(v1 + i); - - v_uint8 vy[4]; - v_load_deinterleave(y1 + 2*i, vy[0], vy[1]); - v_load_deinterleave(y2 + 2*i, vy[2], vy[3]); - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[4], g[4], b[4]; - - for(int k = 0; k < 4; k++) - { - yRGBuvToRGBA(vy[k], ruv, guv, buv, r[k], g[k], b[k]); - } - - if(bIdx) - { - for(int k = 0; k < 4; k++) - swap(r[k], b[k]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] 
- v_uint8 r0_0, r0_1, r1_0, r1_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_zip(r[2], r[3], r1_0, r1_1); - v_uint8 g0_0, g0_1, g1_0, g1_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_zip(g[2], g[3], g1_0, g1_1); - v_uint8 b0_0, b0_1, b1_0, b1_1; - v_zip(b[0], b[1], b0_0, b0_1); - v_zip(b[2], b[3], b1_0, b1_1); - - if(dcn == 4) - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row1 + 4*vsize, b0_1, g0_1, r0_1, a); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0, a); - v_store_interleave(row2 + 4*vsize, b1_1, g1_1, r1_1, a); - } - else //dcn == 3 - { - v_store_interleave(row1 + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row1 + 3*vsize, b0_1, g0_1, r0_1); - - v_store_interleave(row2 + 0*vsize, b1_0, g1_0, r1_0); - v_store_interleave(row2 + 3*vsize, b1_1, g1_1, r1_1); - } - } - vx_cleanup(); -#endif - for (; i < width / 2; i += 1, row1 += dcn*2, row2 += dcn*2) - { - uchar u = u1[i]; - uchar v = v1[i]; - - uchar vy01 = y1[2 * i]; - uchar vy11 = y1[2 * i + 1]; - uchar vy02 = y2[2 * i]; - uchar vy12 = y2[2 * i + 1]; - - cvtYuv42xxp2RGB8(u, v, vy01, vy11, vy02, vy12, row1, row2); - } - } - } -}; - - -#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240) - -template -inline void cvtYUV420sp2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _uv) -{ - YUV420sp2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _uv); - if (dst_width * dst_height >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -template -inline void cvtYUV420p2RGB(uchar * dst_data, size_t dst_step, int dst_width, int dst_height, size_t _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx) -{ - YUV420p2RGB8Invoker converter(dst_data, dst_step, dst_width, _stride, _y1, _u, _v, ustepIdx, vstepIdx); - if (dst_width * dst_height >= 
MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION) - parallel_for_(Range(0, dst_height/2), converter); - else - converter(Range(0, dst_height/2)); -} - -///////////////////////////////////// RGB -> YUV420p ///////////////////////////////////// - -static inline uchar rgbToY42x(uchar r, uchar g, uchar b) -{ - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - int yy = ITUR_BT_601_CRY * r + ITUR_BT_601_CGY * g + ITUR_BT_601_CBY * b + halfShift + shifted16; - - return saturate_cast(yy >> ITUR_BT_601_SHIFT); -} - -static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) -{ - const int shifted16 = (16 << ITUR_BT_601_SHIFT); - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - v_uint16 r0, r1, g0, g1, b0, b1; - v_expand(r, r0, r1); - v_expand(g, g0, g1); - v_expand(b, b0, b1); - - v_uint32 rq[4], gq[4], bq[4]; - v_expand(r0, rq[0], rq[1]); v_expand(r1, rq[2], rq[3]); - v_expand(g0, gq[0], gq[1]); v_expand(g1, gq[2], gq[3]); - v_expand(b0, bq[0], bq[1]); v_expand(b1, bq[2], bq[3]); - - v_uint32 ry = vx_setall_u32(ITUR_BT_601_CRY), gy = vx_setall_u32(ITUR_BT_601_CGY); - v_uint32 by = vx_setall_u32(ITUR_BT_601_CBY), shift = vx_setall_u32(halfShift + shifted16); - - v_uint32 y[4]; - for(int k = 0; k < 4; k++) - { - y[k] = (rq[k]*ry + gq[k]*gy + bq[k]*by + shift) >> ITUR_BT_601_SHIFT; - } - - v_uint16 y0, y1; - y0 = v_pack(y[0], y[1]); - y1 = v_pack(y[2], y[3]); - - return v_pack(y0, y1); -} - -static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) -{ - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - int uu = ITUR_BT_601_CRU * r + ITUR_BT_601_CGU * g + ITUR_BT_601_CBU * b + halfShift + shifted128; - int vv = ITUR_BT_601_CBU * r + ITUR_BT_601_CGV * g + ITUR_BT_601_CBV * b + halfShift + shifted128; - - u = saturate_cast(uu >> ITUR_BT_601_SHIFT); - v = saturate_cast(vv >> ITUR_BT_601_SHIFT); -} - -static inline void 
rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, - const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) -{ - // [r0, r1, r2, r3,..] => [r0, 0, r2, 0,..] - v_int16 vlowByte = vx_setall_s16(0x00ff); - v_int16 rd0, rd1, gd0, gd1, bd0, bd1; - rd0 = v_reinterpret_as_s16(r0) & vlowByte; - rd1 = v_reinterpret_as_s16(r1) & vlowByte; - gd0 = v_reinterpret_as_s16(g0) & vlowByte; - gd1 = v_reinterpret_as_s16(g1) & vlowByte; - bd0 = v_reinterpret_as_s16(b0) & vlowByte; - bd1 = v_reinterpret_as_s16(b1) & vlowByte; - - v_int32 rq[4], gq[4], bq[4]; - v_expand(rd0, rq[0], rq[1]); - v_expand(rd1, rq[2], rq[3]); - v_expand(gd0, gq[0], gq[1]); - v_expand(gd1, gq[2], gq[3]); - v_expand(bd0, bq[0], bq[1]); - v_expand(bd1, bq[2], bq[3]); - - const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1)); - const int shifted128 = (128 << ITUR_BT_601_SHIFT); - v_int32 shift = vx_setall_s32(halfShift + shifted128); - v_int32 ru, gu, bu, gv, bv; - ru = vx_setall_s32(ITUR_BT_601_CRU); - gu = vx_setall_s32(ITUR_BT_601_CGU); - gv = vx_setall_s32(ITUR_BT_601_CGV); - bu = vx_setall_s32(ITUR_BT_601_CBU); - bv = vx_setall_s32(ITUR_BT_601_CBV); - - v_int32 uq[4], vq[4]; - for(int k = 0; k < 4; k++) - { - uq[k] = (ru*rq[k] + gu*gq[k] + bu*bq[k] + shift) >> ITUR_BT_601_SHIFT; - vq[k] = (bu*rq[k] + gv*gq[k] + bv*bq[k] + shift) >> ITUR_BT_601_SHIFT; - } - - v_int16 u0, u1, v0, v1; - u0 = v_pack(uq[0], uq[1]); - u1 = v_pack(uq[2], uq[3]); - v0 = v_pack(vq[0], vq[1]); - v1 = v_pack(vq[2], vq[3]); - - u = v_pack_u(u0, u1); - v = v_pack_u(v0, v1); -} - - -struct RGB8toYUV420pInvoker: public ParallelLoopBody -{ - RGB8toYUV420pInvoker(const uchar * _srcData, size_t _srcStep, - uchar * _yData, uchar * _uvData, size_t _dstStep, - int _srcWidth, int _srcHeight, int _scn, bool _swapBlue, bool _swapUV, bool _interleave) - : srcData(_srcData), srcStep(_srcStep), - yData(_yData), uvData(_uvData), dstStep(_dstStep), - srcWidth(_srcWidth), srcHeight(_srcHeight), - srcCn(_scn), 
swapBlue(_swapBlue), swapUV(_swapUV), interleave(_interleave) { } - - void operator()(const Range& rowRange) const CV_OVERRIDE - { - const int w = srcWidth; - const int h = srcHeight; - const int scn = srcCn; - const uchar* srcRow = (uchar*)0; - uchar* yRow = (uchar*)0, *uRow = (uchar*)0, *vRow = (uchar*)0, *uvRow = (uchar*)0; - for( int sRow = rowRange.start*2; sRow < rowRange.end*2; sRow++) - { - srcRow = srcData + srcStep*sRow; - yRow = yData + dstStep * sRow; - bool evenRow = (sRow % 2) == 0; - if(evenRow) - { - if (interleave) - { - uvRow = uvData + dstStep*(sRow/2); - } - else - { - uRow = uvData + dstStep * (sRow/4) + ((sRow/2) % 2) * (w/2); - vRow = uvData + dstStep * ((sRow + h)/4) + (((sRow + h)/2) % 2) * (w/2); - } - } - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - - for( ; i <= w/2 - vsize; - i += vsize) - { - // processing (2*vsize) pixels at once - v_uint8 b0, b1, g0, g1, r0, r1, a0, a1; - if(scn == 4) - { - v_load_deinterleave(srcRow + 2*4*i + 0*vsize, b0, g0, r0, a0); - v_load_deinterleave(srcRow + 2*4*i + 4*vsize, b1, g1, r1, a1); - } - else // scn == 3 - { - v_load_deinterleave(srcRow + 2*3*i + 0*vsize, b0, g0, r0); - v_load_deinterleave(srcRow + 2*3*i + 3*vsize, b1, g1, r1); - } - - if(swapBlue) - { - swap(b0, r0); swap(b1, r1); - } - - v_uint8 y0, y1; - - y0 = rgbToY42x(r0, g0, b0); - y1 = rgbToY42x(r1, g1, b1); - - v_store(yRow + 2*i + 0*vsize, y0); - v_store(yRow + 2*i + 1*vsize, y1); - - if(evenRow) - { - v_uint8 u, v; - rgbToUV42x(r0, r1, g0, g1, b0, b1, u, v); - - if(swapUV) - { - swap(u, v); - } - - if(interleave) - { - v_store_interleave(uvRow + 2*i, u, v); - } - else - { - v_store(uRow + i, u); - v_store(vRow + i, v); - } - } - } - vx_cleanup(); -#endif - // processing two pixels at once - for( ; i < w/2; i++) - { - uchar b0, g0, r0; - uchar b1, g1, r1; - b0 = srcRow[(2*i+0)*scn + 0]; - g0 = srcRow[(2*i+0)*scn + 1]; - r0 = srcRow[(2*i+0)*scn + 2]; - b1 = srcRow[(2*i+1)*scn + 0]; - g1 = srcRow[(2*i+1)*scn + 1]; - r1 = 
srcRow[(2*i+1)*scn + 2]; - - if(swapBlue) - { - swap(b0, r0); swap(b1, r1); - } - - uchar y0 = rgbToY42x(r0, g0, b0); - uchar y1 = rgbToY42x(r1, g1, b1); - - yRow[2*i+0] = y0; - yRow[2*i+1] = y1; - - if(evenRow) - { - uchar uu, vv; - rgbToUV42x(r0, g0, b0, uu, vv); - if(swapUV) - { - swap(uu, vv); - } - - if(interleave) - { - uvRow[2*i+0] = uu; - uvRow[2*i+1] = vv; - } - else - { - uRow[i] = uu; - vRow[i] = vv; - } - } - } - } - } - - const uchar * srcData; - size_t srcStep; - uchar *yData, *uvData; - size_t dstStep; - int srcWidth; - int srcHeight; - const int srcCn; - bool swapBlue; - bool swapUV; - bool interleave; -}; - - -///////////////////////////////////// YUV422 -> RGB ///////////////////////////////////// - -// bIdx is 0 or 2; [uIdx, yIdx] is [0, 0], [0, 1], [1, 0]; dcn is 3 or 4 -template -struct YUV422toRGB8Invoker : ParallelLoopBody -{ - uchar * dst_data; - size_t dst_step; - const uchar * src_data; - size_t src_step; - int width; - - YUV422toRGB8Invoker(uchar * _dst_data, size_t _dst_step, - const uchar * _src_data, size_t _src_step, - int _width) - : dst_data(_dst_data), dst_step(_dst_step), src_data(_src_data), src_step(_src_step), width(_width) {} - - void operator()(const Range& range) const CV_OVERRIDE - { - int rangeBegin = range.start; - int rangeEnd = range.end; - - // [yIdx, uIdx] | [uidx, vidx]: - // 0, 0 | 1, 3 - // 0, 1 | 3, 1 - // 1, 0 | 0, 2 - const int uidx = 1 - yIdx + uIdx * 2; - const int vidx = (2 + uidx) % 4; - const uchar* yuv_src = src_data + rangeBegin * src_step; - - for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += src_step) - { - uchar* row = dst_data + dst_step * j; - int i = 0; -#if CV_SIMD - const int vsize = v_uint8::nlanes; - v_uint8 a = vx_setall_u8(uchar(0xff)); - for(; i <= 2*width - 4*vsize; - i += 4*vsize, row += vsize*dcn*2) - { - v_uint8 u, v, vy[2]; - if(yIdx == 1) // UYVY - { - v_load_deinterleave(yuv_src + i, u, vy[0], v, vy[1]); - } - else // YUYV or YVYU - { - v_load_deinterleave(yuv_src + i, vy[0], u, 
vy[1], v); - if(uIdx == 1) // YVYU - { - swap(u, v); - } - } - - v_int32 ruv[4], guv[4], buv[4]; - uvToRGBuv(u, v, ruv, guv, buv); - - v_uint8 r[2], g[2], b[2]; - - yRGBuvToRGBA(vy[0], ruv, guv, buv, r[0], g[0], b[0]); - yRGBuvToRGBA(vy[1], ruv, guv, buv, r[1], g[1], b[1]); - - if(bIdx) - { - swap(r[0], b[0]); - swap(r[1], b[1]); - } - - // [r0...], [r1...] => [r0, r1, r0, r1...], [r0, r1, r0, r1...] - v_uint8 r0_0, r0_1; - v_zip(r[0], r[1], r0_0, r0_1); - v_uint8 g0_0, g0_1; - v_zip(g[0], g[1], g0_0, g0_1); - v_uint8 b0_0, b0_1; - v_zip(b[0], b[1], b0_0, b0_1); - - if(dcn == 4) - { - v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0, a); - v_store_interleave(row + 4*vsize, b0_1, g0_1, r0_1, a); - } - else //dcn == 3 - { - v_store_interleave(row + 0*vsize, b0_0, g0_0, r0_0); - v_store_interleave(row + 3*vsize, b0_1, g0_1, r0_1); - } - } - vx_cleanup(); -#endif - for (; i < 2 * width; i += 4, row += dcn*2) - { - uchar u = yuv_src[i + uidx]; - uchar v = yuv_src[i + vidx]; - - uchar vy0 = yuv_src[i + yIdx]; - uchar vy1 = yuv_src[i + yIdx + 2]; - - cvtYuv42xxp2RGB8(u, v, vy0, vy1, 0, 0, row, (uchar*)(0)); - } - } - } -}; - -#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240) - -template -inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_data, size_t src_step, - int width, int height) -{ - YUV422toRGB8Invoker converter(dst_data, dst_step, src_data, src_step, width); - if (width * height >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION) - parallel_for_(Range(0, height), converter); - else - converter(Range(0, height)); -} +namespace cv { // // HAL functions // - -namespace hal -{ +namespace hal { // 8u, 16u, 32f void cvtBGRtoYUV(const uchar * src_data, size_t src_step, @@ -1790,13 +62,8 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 
2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_f(scn, blueIdx, isCbCr)); + CV_CPU_DISPATCH(cvtBGRtoYUV, (src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr), + CV_CPU_DISPATCH_MODES_ALL); } void cvtYUVtoBGR(const uchar * src_data, size_t src_step, @@ -1844,13 +111,8 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step, #endif #endif - int blueIdx = swapBlue ? 2 : 0; - if( depth == CV_8U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); - else if( depth == CV_16U ) - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); - else - CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_f(dcn, blueIdx, isCbCr)); + CV_CPU_DISPATCH(cvtYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr), + CV_CPU_DISPATCH_MODES_ALL); } void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, @@ -1861,17 +123,10 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); - const uchar* uv = src_data + src_step * static_cast(dst_height); - cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); -} -typedef void (*cvt_2plane_yuv_ptr_t)(uchar * /* dst_data*/, - size_t /* dst_step */, - int /* dst_width */, - int /* dst_height */, - size_t /* _stride */, - const uchar* /* _y1 */, - const uchar* /* _uv */); + CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, 
(src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); +} void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, uchar * dst_data, size_t dst_step, @@ -1880,66 +135,21 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - - int blueIdx = swapBlue ? 2 : 0; - - cvt_2plane_yuv_ptr_t cvtPtr; - switch(dcn*100 + blueIdx * 10 + uIdx) - { - case 300: cvtPtr = cvtYUV420sp2RGB<0, 0, 3>; break; - case 301: cvtPtr = cvtYUV420sp2RGB<0, 1, 3>; break; - case 320: cvtPtr = cvtYUV420sp2RGB<2, 0, 3>; break; - case 321: cvtPtr = cvtYUV420sp2RGB<2, 1, 3>; break; - case 400: cvtPtr = cvtYUV420sp2RGB<0, 0, 4>; break; - case 401: cvtPtr = cvtYUV420sp2RGB<0, 1, 4>; break; - case 420: cvtPtr = cvtYUV420sp2RGB<2, 0, 4>; break; - case 421: cvtPtr = cvtYUV420sp2RGB<2, 1, 4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, y_data, uv_data); + CV_CPU_DISPATCH(cvtTwoPlaneYUVtoBGR, (y_data, uv_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } -typedef void (*cvt_3plane_yuv_ptr_t)(uchar * /* dst_data */, - size_t /* dst_step */, - int /* dst_width */, - int /* dst_height */, - size_t /* _stride */, - const uchar* /* _y1 */, - const uchar* /* _u */, - const uchar* /* _v */, - int /* ustepIdx */, - int /* vstepIdx */); - void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx) + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx) { CV_INSTRUMENT_REGION(); CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, 
dst_height, dcn, swapBlue, uIdx); - const uchar* u = src_data + src_step * static_cast(dst_height); - const uchar* v = src_data + src_step * static_cast(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2); - int ustepIdx = 0; - int vstepIdx = dst_height % 4 == 2 ? 1 : 0; - - if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } - int blueIdx = swapBlue ? 2 : 0; - - cvt_3plane_yuv_ptr_t cvtPtr; - switch(dcn*10 + blueIdx) - { - case 30: cvtPtr = cvtYUV420p2RGB<0, 3>; break; - case 32: cvtPtr = cvtYUV420p2RGB<2, 3>; break; - case 40: cvtPtr = cvtYUV420p2RGB<0, 4>; break; - case 42: cvtPtr = cvtYUV420p2RGB<2, 4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, dst_width, dst_height, src_step, src_data, u, v, ustepIdx, vstepIdx); + CV_CPU_DISPATCH(cvtThreePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, @@ -1950,15 +160,9 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, CV_INSTRUMENT_REGION(); CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); - uchar * uv_data = dst_data + dst_step * height; - RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, - scn, swapBlue, uIdx == 2, false); - - if( width * height >= 320*240 ) - parallel_for_(Range(0, height/2), cvt); - else - cvt(Range(0, height/2)); + CV_CPU_DISPATCH(cvtBGRtoThreePlaneYUV, (src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, @@ -1970,22 +174,10 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, // TODO: add hal replacement method - RGB8toYUV420pInvoker cvt(src_data, 
src_step, y_data, uv_data, dst_step, width, height, - scn, swapBlue, uIdx == 2, true); - - if( width * height >= 320*240 ) - parallel_for_(Range(0, height/2), cvt); - else - cvt(Range(0, height/2)); + CV_CPU_DISPATCH(cvtBGRtoTwoPlaneYUV, (src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx), + CV_CPU_DISPATCH_MODES_ALL); } -typedef void (*cvt_1plane_yuv_ptr_t)(uchar * /* dst_data */, - size_t /* dst_step */, - const uchar * /* src_data */, - size_t /* src_step */, - int /* width */, - int /* height */); - void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, @@ -1995,26 +187,8 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); - cvt_1plane_yuv_ptr_t cvtPtr; - int blueIdx = swapBlue ? 2 : 0; - switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) - { - case 3000: cvtPtr = cvtYUV422toRGB<0,0,0,3>; break; - case 3001: cvtPtr = cvtYUV422toRGB<0,0,1,3>; break; - case 3010: cvtPtr = cvtYUV422toRGB<0,1,0,3>; break; - case 3200: cvtPtr = cvtYUV422toRGB<2,0,0,3>; break; - case 3201: cvtPtr = cvtYUV422toRGB<2,0,1,3>; break; - case 3210: cvtPtr = cvtYUV422toRGB<2,1,0,3>; break; - case 4000: cvtPtr = cvtYUV422toRGB<0,0,0,4>; break; - case 4001: cvtPtr = cvtYUV422toRGB<0,0,1,4>; break; - case 4010: cvtPtr = cvtYUV422toRGB<0,1,0,4>; break; - case 4200: cvtPtr = cvtYUV422toRGB<2,0,0,4>; break; - case 4201: cvtPtr = cvtYUV422toRGB<2,0,1,4>; break; - case 4210: cvtPtr = cvtYUV422toRGB<2,1,0,4>; break; - default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break; - }; - - cvtPtr(dst_data, dst_step, src_data, src_step, width, height); + CV_CPU_DISPATCH(cvtOnePlaneYUVtoBGR, (src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn), + CV_CPU_DISPATCH_MODES_ALL); } } // namespace hal diff 
--git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index 7d731378e2..8bbd78b244 100644 --- a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -3,11 +3,54 @@ // of this distribution and at http://opencv.org/license.html #include "precomp.hpp" -#include "color.hpp" +#include "opencv2/core/hal/intrin.hpp" -namespace cv -{ +namespace cv { +namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void cvtBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int scn, bool swapBlue, bool isCbCr); +void cvtYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int depth, int dcn, bool swapBlue, bool isCbCr); +void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int dst_width, int dst_height, + int dcn, bool swapBlue, int uIdx); +void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx); +void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, + uchar * y_data, uchar * uv_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx); +void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int dcn, bool swapBlue, int uIdx, int ycn); +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if defined(CV_CPU_BASELINE_MODE) +// 
included in color.hpp +#else +#include "color.simd_helpers.hpp" +#endif + +namespace { //constants for conversion from/to RGB and YUV, YCrCb according to BT.601 //to YCbCr @@ -1738,12 +1781,8 @@ inline void cvtYUV422toRGB(uchar * dst_data, size_t dst_step, const uchar * src_ converter(Range(0, height)); } -// -// HAL functions -// +} // namespace anon -namespace hal -{ // 8u, 16u, 32f void cvtBGRtoYUV(const uchar * src_data, size_t src_step, @@ -1753,43 +1792,6 @@ void cvtBGRtoYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoYUV, cv_hal_cvtBGRtoYUV, src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr); - -#if defined(HAVE_IPP) -#if !IPP_DISABLE_RGB_YUV - CV_IPP_CHECK() - { - if (scn == 3 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R))) - return; - } - else if (scn == 3 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) - return; - } - else if (scn == 4 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth))) - return; - } - else if (scn == 4 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], - (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 
2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, RGB2YCrCb_i(scn, blueIdx, isCbCr)); @@ -1806,44 +1808,6 @@ void cvtYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtYUVtoBGR, cv_hal_cvtYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, depth, dcn, swapBlue, isCbCr); - - -#if defined(HAVE_IPP) -#if !IPP_DISABLE_YUV_RGB - CV_IPP_CHECK() - { - if (dcn == 3 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R))) - return; - } - else if (dcn == 3 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth))) - return; - } - else if (dcn == 4 && depth == CV_8U && swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth))) - return; - } - else if (dcn == 4 && depth == CV_8U && !swapBlue && !isCbCr) - { - if (CvtColorIPPLoop(src_data, src_step, dst_data, dst_step, width, height, - IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, - ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth))) - return; - } - } -#endif -#endif - int blueIdx = swapBlue ? 
2 : 0; if( depth == CV_8U ) CvtColorLoop(src_data, src_step, dst_data, dst_step, width, height, YCrCb2RGB_i(dcn, blueIdx, isCbCr)); @@ -1860,7 +1824,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtTwoPlaneYUVtoBGR, cv_hal_cvtTwoPlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); const uchar* uv = src_data + src_step * static_cast(dst_height); cvtTwoPlaneYUVtoBGR(src_data, uv, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); } @@ -1880,8 +1843,6 @@ void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - int blueIdx = swapBlue ? 2 : 0; cvt_2plane_yuv_ptr_t cvtPtr; @@ -1919,7 +1880,6 @@ void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtThreePlaneYUVtoBGR, cv_hal_cvtThreePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx); const uchar* u = src_data + src_step * static_cast(dst_height); const uchar* v = src_data + src_step * static_cast(dst_height + dst_height/4) + (dst_width/2) * ((dst_height % 4)/2); @@ -1949,7 +1909,6 @@ void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - CALL_HAL(cvtBGRtoThreePlaneYUV, cv_hal_cvtBGRtoThreePlaneYUV, src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, uIdx); uchar * uv_data = dst_data + dst_step * height; RGB8toYUV420pInvoker cvt(src_data, src_step, dst_data, uv_data, dst_step, width, height, @@ -1968,8 +1927,6 @@ void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, { CV_INSTRUMENT_REGION(); - // TODO: add hal replacement method - RGB8toYUV420pInvoker cvt(src_data, src_step, y_data, uv_data, dst_step, width, height, scn, swapBlue, uIdx == 2, true); @@ -1993,8 +1950,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, 
{ CV_INSTRUMENT_REGION(); - CALL_HAL(cvtOnePlaneYUVtoBGR, cv_hal_cvtOnePlaneYUVtoBGR, src_data, src_step, dst_data, dst_step, width, height, dcn, swapBlue, uIdx, ycn); - cvt_1plane_yuv_ptr_t cvtPtr; int blueIdx = swapBlue ? 2 : 0; switch(dcn*1000 + blueIdx*100 + uIdx*10 + ycn) @@ -2017,227 +1972,6 @@ void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, cvtPtr(dst_data, dst_step, src_data, src_step, width, height); } -} // namespace hal - -// -// OCL calls -// - -#ifdef HAVE_OPENCL - -bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx ) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d", dcn, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx ) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=3 -D bidx=%d", bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx) -{ - OclHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d", dcn, bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx) -{ - OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=3 -D bidx=%d", bidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx, int yidx ) -{ - OclHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0; - if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc, - 
format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx, - optimized ? " -D USE_OPTIMIZED_LOAD" : ""))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) -{ - OclHelper< Set<1>, Set<1>, Set, FROM_YUV> h(_src, _dst, 1); - - h.src.rowRange(0, _dst.rows()).copyTo(_dst); - return true; -} - -bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) -{ - OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, int uidx ) -{ - OclHelper< Set<1>, Set<3, 4>, Set, FROM_YUV > h(_src, _dst, dcn); - - if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx, - _src.isContinuous() ? 
" -D SRC_CONT" : ""))) - { - return false; - } - - return h.run(); -} - -bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx, int uidx ) -{ - OclHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); - - if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc, - format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx))) - { - return false; - } - - return h.run(); -} - #endif - -// -// HAL calls -// - -void cvtColorBGR2YUV(InputArray _src, OutputArray _dst, bool swapb, bool crcb) -{ - CvtHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); - - hal::cvtBGRtoYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, h.scn, swapb, crcb); -} - -void cvtColorYUV2BGR(InputArray _src, OutputArray _dst, int dcn, bool swapb, bool crcb) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<3>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.depth, dcn, swapb, crcb); -} - -void cvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx, int ycn) -{ - CvtHelper< Set<2>, Set<3, 4>, Set > h(_src, _dst, dcn); - - hal::cvtOnePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - dcn, swapb, uidx, ycn); -} - -void cvtColorYUV2Gray_ch( InputArray _src, OutputArray _dst, int coi ) -{ - CV_Assert( _src.channels() == 2 && _src.depth() == CV_8U ); - - extractChannel(_src, _dst, coi); -} - -void cvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, bool swapb, int uidx) -{ - CvtHelper< Set<3, 4>, Set<1>, Set, TO_YUV > h(_src, _dst, 1); - - hal::cvtBGRtoThreePlaneYUV(h.src.data, h.src.step, h.dst.data, h.dst.step, h.src.cols, h.src.rows, - h.scn, swapb, uidx); -} - -void cvtColorYUV2Gray_420( InputArray _src, OutputArray _dst ) -{ - CvtHelper< Set<1>, Set<1>, Set, FROM_YUV > h(_src, _dst, 1); - -#ifdef HAVE_IPP -#if IPP_VERSION_X100 >= 201700 - if 
(CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, h.src.data, (IppSizeL)h.src.step, h.dst.data, (IppSizeL)h.dst.step, - ippiSizeL(h.dstSz.width, h.dstSz.height)) >= 0) - return; -#endif -#endif - h.src(Range(0, h.dstSz.height), Range::all()).copyTo(h.dst); -} - -void cvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); - - hal::cvtThreePlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, - dcn, swapb, uidx); -} - -// http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples -// http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples - -void cvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, bool swapb, int uidx ) -{ - if(dcn <= 0) dcn = 3; - CvtHelper< Set<1>, Set<3, 4>, Set, FROM_YUV> h(_src, _dst, dcn); - - hal::cvtTwoPlaneYUVtoBGR(h.src.data, h.src.step, h.dst.data, h.dst.step, h.dst.cols, h.dst.rows, - dcn, swapb, uidx); -} - -void cvtColorTwoPlaneYUV2BGRpair( InputArray _ysrc, InputArray _uvsrc, OutputArray _dst, int dcn, bool swapb, int uidx ) -{ - int stype = _ysrc.type(); - int depth = CV_MAT_DEPTH(stype); - Size ysz = _ysrc.size(), uvs = _uvsrc.size(); - CV_Assert( dcn == 3 || dcn == 4 ); - CV_Assert( depth == CV_8U ); - CV_Assert( ysz.width == uvs.width * 2 && ysz.height == uvs.height * 2 ); - - Mat ysrc = _ysrc.getMat(), uvsrc = _uvsrc.getMat(); - - _dst.create( ysz, CV_MAKETYPE(depth, dcn)); - Mat dst = _dst.getMat(); - - hal::cvtTwoPlaneYUVtoBGR(ysrc.data, uvsrc.data, ysrc.step, - dst.data, dst.step, dst.cols, dst.rows, - dcn, swapb, uidx); -} - -} // namespace cv +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace From 442fdfd4b262b8d2d3ada82f7fbe786a0bd00708 Mon Sep 17 
00:00:00 2001 From: Adrian Renner Date: Thu, 7 Mar 2019 21:18:22 +0100 Subject: [PATCH 09/21] Merge pull request #13999 from addyi:fixAndroidLintingErrors * fix android lint error in BaseLoaderCallback (LongLogTag) Lint Error that was caused by the too long TAG: BaseLoaderCallback.java:31: Error: The logging tag can be at most 23 characters, was 31 (OpenCVLoader/BaseLoaderCallback) [LongLogTag] Log.e(TAG, "Package installation failed!"); * add requested "OCV" prefix for android logging --- .../android/java/org/opencv/android/BaseLoaderCallback.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java b/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java index 0b8aeedc6a..8ece662514 100644 --- a/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java +++ b/modules/java/generator/android/java/org/opencv/android/BaseLoaderCallback.java @@ -137,5 +137,5 @@ public abstract class BaseLoaderCallback implements LoaderCallbackInterface { } protected Context mAppContext; - private final static String TAG = "OpenCVLoader/BaseLoaderCallback"; + private final static String TAG = "OCV/BaseLoaderCallback"; } From 40af53b1d6bf6655bc01347e61c3e7d51bea3a5d Mon Sep 17 00:00:00 2001 From: Rajkiran Natarajan Date: Sat, 9 Mar 2019 15:48:21 -0500 Subject: [PATCH 10/21] issue-13921: Support setting compression setting for writing tiff images --- modules/imgcodecs/include/opencv2/imgcodecs.hpp | 7 ++++--- modules/imgcodecs/src/grfmt_tiff.cpp | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp index 4e79518ae1..ab75990ef7 100644 --- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp +++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp @@ -92,9 +92,10 @@ enum ImwriteFlags { IMWRITE_EXR_TYPE = (3 << 4) + 0, /* 48 */ //!< override EXR
storage type (FLOAT (FP32) is default) IMWRITE_WEBP_QUALITY = 64, //!< For WEBP, it can be a quality from 1 to 100 (the higher is the better). By default (without any parameter) and for quality above 100 the lossless compression is used. IMWRITE_PAM_TUPLETYPE = 128,//!< For PAM, sets the TUPLETYPE field to the corresponding string value that is defined for the format - IMWRITE_TIFF_RESUNIT = 256,//!< For TIFF, use to specify which DPI resolution unit to set; see libtiff documentation for valid values - IMWRITE_TIFF_XDPI = 257,//!< For TIFF, use to specify the X direction DPI - IMWRITE_TIFF_YDPI = 258 //!< For TIFF, use to specify the Y direction DPI + IMWRITE_TIFF_RESUNIT = 256,//!< For TIFF, use to specify which DPI resolution unit to set; see libtiff documentation for valid values. + IMWRITE_TIFF_XDPI = 257,//!< For TIFF, use to specify the X direction DPI. + IMWRITE_TIFF_YDPI = 258, //!< For TIFF, use to specify the Y direction DPI. + IMWRITE_TIFF_COMPRESSION = 259 //!< For TIFF, use to specify the image compression scheme. See libtiff for integer constants corresponding to compression formats. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default. 
}; enum ImwriteEXRTypeFlags { diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp index b83ab1068a..40295df088 100644 --- a/modules/imgcodecs/src/grfmt_tiff.cpp +++ b/modules/imgcodecs/src/grfmt_tiff.cpp @@ -750,12 +750,11 @@ bool TiffEncoder::writeLibTiff( const std::vector& img_vec, const std::vect } //Settings that matter to all images - // defaults for now, maybe base them on params in the future int compression = COMPRESSION_LZW; int predictor = PREDICTOR_HORIZONTAL; int resUnit = -1, dpiX = -1, dpiY = -1; - readParam(params, TIFFTAG_COMPRESSION, compression); + readParam(params, IMWRITE_TIFF_COMPRESSION, compression); readParam(params, TIFFTAG_PREDICTOR, predictor); readParam(params, IMWRITE_TIFF_RESUNIT, resUnit); readParam(params, IMWRITE_TIFF_XDPI, dpiX); From 11dbd86aa3fa70c9a28f14203edc65b0f7f68273 Mon Sep 17 00:00:00 2001 From: Giles Payne Date: Sun, 10 Mar 2019 06:11:04 +0900 Subject: [PATCH 11/21] Merge pull request #13956 from komakai:java-mat-class-improvements * Expose more C++ functionality in the Java wrapper of the Mat class In particular expose methods for handling Mat with more than 2 dimensions * add constructors taking an array of dimension sizes * add constructor taking an existing Mat and an array of Ranges * add override of the create method taking an array of dimension sizes * add overrides of the ones and zeros methods taking an array of dimension sizes * add override of the submat method taking an array of ranges * add overrides of put and get taking arrays of indices * add wrapper for copySize method * fix crash in the JNI wrapper of the reshape(int cn, int[] newshape) method * add test for each method added to Mat.java * Fix broken test --- modules/core/misc/java/src/java/core+Mat.java | 342 +++++++++++ modules/core/misc/java/test/MatTest.java | 219 +++++++ modules/java/generator/src/cpp/Mat.cpp | 566 +++++++++++++++++- .../src/org/opencv/test/OpenCVTestCase.java | 25 +- 
.../src/org/opencv/test/OpenCVTestCase.java | 25 +- 5 files changed, 1171 insertions(+), 6 deletions(-) diff --git a/modules/core/misc/java/src/java/core+Mat.java b/modules/core/misc/java/src/java/core+Mat.java index c75f9d5c01..e42fca9897 100644 --- a/modules/core/misc/java/src/java/core+Mat.java +++ b/modules/core/misc/java/src/java/core+Mat.java @@ -67,6 +67,19 @@ public class Mat { return; } + // + // C++: Mat::Mat(int ndims, const int* sizes, int type) + // + + // javadoc: Mat::Mat(sizes, type) + public Mat(int[] sizes, int type) + { + + nativeObj = n_Mat(sizes.length, sizes, type); + + return; + } + // // C++: Mat::Mat(int rows, int cols, int type, Scalar s) // @@ -93,6 +106,19 @@ public class Mat { return; } + // + // C++: Mat::Mat(int ndims, const int* sizes, int type, Scalar s) + // + + // javadoc: Mat::Mat(sizes, type, s) + public Mat(int[] sizes, int type, Scalar s) + { + + nativeObj = n_Mat(sizes.length, sizes, type, s.val[0], s.val[1], s.val[2], s.val[3]); + + return; + } + // // C++: Mat::Mat(Mat m, Range rowRange, Range colRange = Range::all()) // @@ -115,6 +141,19 @@ public class Mat { return; } + // + // C++: Mat::Mat(const Mat& m, const std::vector<Range>& ranges) + // + + // javadoc: Mat::Mat(m, ranges) + public Mat(Mat m, Range[] ranges) + { + + nativeObj = n_Mat(m.nativeObj, ranges); + + return; + } + // // C++: Mat::Mat(Mat m, Rect roi) // @@ -370,6 +409,31 @@ public class Mat { return; } + // + // C++: void Mat::create(int ndims, const int* sizes, int type) + // + + // javadoc: Mat::create(sizes, type) + public void create(int[] sizes, int type) + { + + n_create(nativeObj, sizes.length, sizes, type); + + return; + } + + // + // C++: void Mat::copySize(const Mat& m); + // + + // javadoc: Mat::copySize(m) + public void copySize(Mat m) + { + n_copySize(nativeObj, m.nativeObj); + + return; + } + // // C++: Mat Mat::cross(Mat m) // @@ -633,6 +697,19 @@ public class Mat { return retVal; } + // + // C++: static Mat Mat::ones(int ndims, const int* sizes,
int type) + // + + // javadoc: Mat::ones(sizes, type) + public static Mat ones(int[] sizes, int type) + { + + Mat retVal = new Mat(n_ones(sizes.length, sizes, type)); + + return retVal; + } + // // C++: void Mat::push_back(Mat m) // @@ -867,6 +944,19 @@ public class Mat { return retVal; } + // + // C++: Mat Mat::operator()(const std::vector<Range>& ranges) + // + + // javadoc: Mat::operator()(ranges[]) + public Mat submat(Range[] ranges) + { + + Mat retVal = new Mat(n_submat_ranges(nativeObj, ranges)); + + return retVal; + } + // // C++: Mat Mat::operator()(Rect roi) // @@ -945,6 +1035,19 @@ public class Mat { return retVal; } + // + // C++: static Mat Mat::zeros(int ndims, const int* sizes, int type) + // + + // javadoc: Mat::zeros(sizes, type) + public static Mat zeros(int[] sizes, int type) + { + + Mat retVal = new Mat(n_zeros(sizes.length, sizes, type)); + + return retVal; + } + @Override protected void finalize() throws Throwable { n_delete(nativeObj); @@ -979,6 +1082,20 @@ public class Mat { return nPutD(nativeObj, row, col, data.length, data); } + // javadoc:Mat::put(idx,data) + public int put(int[] idx, double... data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + return nPutDIdx(nativeObj, idx, data.length, data); + } + // javadoc:Mat::put(row,col,data) public int put(int row, int col, float[] data) { int t = type(); @@ -994,6 +1111,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::put(idx,data) + public int put(int[] idx, float[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_32F) { + return nPutFIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::put(row,col,data) public int put(int row, int col, int[] data) { int t = type(); @@ -1009,6 +1143,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::put(idx,data) + public int put(int[] idx, int[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_32S) { + return nPutIIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::put(row,col,data) public int put(int row, int col, short[] data) { int t = type(); @@ -1024,6 +1175,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::put(idx,data) + public int put(int[] idx, short[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) { + return nPutSIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::put(row,col,data) public int put(int row, int col, byte[] data) { int t = type(); @@ -1039,6 +1207,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::put(idx,data) + public int put(int[] idx, byte[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) { + return nPutBIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::put(row,col,data,offset,length) public int put(int row, int col, byte[] data, int offset, int length) { int t = type(); @@ -1054,6 +1239,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::put(idx,data,offset,length) + public int put(int[] idx, byte[] data, int offset, int length) { + int t = type(); + if (data == null || length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) { + return nPutBwIdxOffset(nativeObj, idx, length, offset, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col,data) public int get(int row, int col, byte[] data) { int t = type(); @@ -1069,6 +1271,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::get(idx,data) + public int get(int[] idx, byte[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_8U || CvType.depth(t) == CvType.CV_8S) { + return nGetBIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col,data) public int get(int row, int col, short[] data) { int t = type(); @@ -1084,6 +1303,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::get(idx,data) + public int get(int[] idx, short[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_16U || CvType.depth(t) == CvType.CV_16S) { + return nGetSIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col,data) public int get(int row, int col, int[] data) { int t = type(); @@ -1099,6 +1335,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::get(idx,data) + public int get(int[] idx, int[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_32S) { + return nGetIIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col,data) public int get(int row, int col, float[] data) { int t = type(); @@ -1114,6 +1367,23 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::get(idx,data) + public int get(int[] idx, float[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 0 : data.length) + + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_32F) { + return nGetFIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col,data) public int get(int row, int col, double[] data) { int t = type(); @@ -1129,11 +1399,35 @@ public class Mat { throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); } + // javadoc:Mat::get(idx,data) + public int get(int[] idx, double[] data) { + int t = type(); + if (data == null || data.length % CvType.channels(t) != 0) + throw new java.lang.UnsupportedOperationException( + "Provided data element number (" + + (data == null ? 
0 : data.length) + ") should be multiple of the Mat channels count (" + + CvType.channels(t) + ")"); + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + if (CvType.depth(t) == CvType.CV_64F) { + return nGetDIdx(nativeObj, idx, data.length, data); + } + throw new java.lang.UnsupportedOperationException("Mat data type is not compatible: " + t); + } + // javadoc:Mat::get(row,col) public double[] get(int row, int col) { return nGet(nativeObj, row, col); } + // javadoc:Mat::get(idx) + public double[] get(int[] idx) { + if (idx.length != dims()) + throw new IllegalArgumentException("Incorrect number of indices"); + return nGetIdx(nativeObj, idx); + } + // javadoc:Mat::height() public int height() { return rows(); @@ -1155,6 +1449,9 @@ public class Mat { // C++: Mat::Mat(int rows, int cols, int type) private static native long n_Mat(int rows, int cols, int type); + // C++: Mat::Mat(int ndims, const int* sizes, int type) + private static native long n_Mat(int ndims, int[] sizes, int type); + // C++: Mat::Mat(int rows, int cols, int type, void* data) private static native long n_Mat(int rows, int cols, int type, ByteBuffer data); @@ -1167,11 +1464,17 @@ public class Mat { // C++: Mat::Mat(Size size, int type, Scalar s) private static native long n_Mat(double size_width, double size_height, int type, double s_val0, double s_val1, double s_val2, double s_val3); + // C++: Mat::Mat(int ndims, const int* sizes, int type, Scalar s) + private static native long n_Mat(int ndims, int[] sizes, int type, double s_val0, double s_val1, double s_val2, double s_val3); + // C++: Mat::Mat(Mat m, Range rowRange, Range colRange = Range::all()) private static native long n_Mat(long m_nativeObj, int rowRange_start, int rowRange_end, int colRange_start, int colRange_end); private static native long n_Mat(long m_nativeObj, int rowRange_start, int rowRange_end); + // C++: Mat::Mat(const Mat& m, const std::vector<Range>& ranges) + private static native long 
n_Mat(long m_nativeObj, Range[] ranges); + // C++: Mat Mat::adjustROI(int dtop, int dbottom, int dleft, int dright) private static native long n_adjustROI(long nativeObj, int dtop, int dbottom, int dleft, int dright); @@ -1226,6 +1529,12 @@ public class Mat { // C++: void Mat::create(Size size, int type) private static native void n_create(long nativeObj, double size_width, double size_height, int type); + // C++: void Mat::create(int ndims, const int* sizes, int type) + private static native void n_create(long nativeObj, int ndims, int[] sizes, int type); + + // C++: void Mat::copySize(const Mat& m) + private static native void n_copySize(long nativeObj, long m_nativeObj); + // C++: Mat Mat::cross(Mat m) private static native long n_cross(long nativeObj, long m_nativeObj); @@ -1284,6 +1593,9 @@ public class Mat { // C++: static Mat Mat::ones(Size size, int type) private static native long n_ones(double size_width, double size_height, int type); + // C++: static Mat Mat::ones(int ndims, const int* sizes, int type) + private static native long n_ones(int ndims, int[] sizes, int type); + // C++: void Mat::push_back(Mat m) private static native void n_push_back(long nativeObj, long m_nativeObj); @@ -1332,6 +1644,9 @@ public class Mat { // C++: Mat Mat::operator()(Range rowRange, Range colRange) private static native long n_submat_rr(long nativeObj, int rowRange_start, int rowRange_end, int colRange_start, int colRange_end); + // C++: Mat Mat::operator()(const std::vector<Range>& ranges) + private static native long n_submat_ranges(long nativeObj, Range[] ranges); + // C++: Mat Mat::operator()(Rect roi) private static native long n_submat(long nativeObj, int roi_x, int roi_y, int roi_width, int roi_height); @@ -1350,32 +1665,59 @@ public class Mat { // C++: static Mat Mat::zeros(Size size, int type) private static native long n_zeros(double size_width, double size_height, int type); + // C++: static Mat Mat::zeros(int ndims, const int* sizes, int type) + private static native 
long n_zeros(int ndims, int[] sizes, int type); + // native support for java finalize() private static native void n_delete(long nativeObj); private static native int nPutD(long self, int row, int col, int count, double[] data); + private static native int nPutDIdx(long self, int[] idx, int count, double[] data); + private static native int nPutF(long self, int row, int col, int count, float[] data); + private static native int nPutFIdx(long self, int[] idx, int count, float[] data); + private static native int nPutI(long self, int row, int col, int count, int[] data); + private static native int nPutIIdx(long self, int[] idx, int count, int[] data); + private static native int nPutS(long self, int row, int col, int count, short[] data); + private static native int nPutSIdx(long self, int[] idx, int count, short[] data); + private static native int nPutB(long self, int row, int col, int count, byte[] data); + private static native int nPutBIdx(long self, int[] idx, int count, byte[] data); + private static native int nPutBwOffset(long self, int row, int col, int count, int offset, byte[] data); + private static native int nPutBwIdxOffset(long self, int[] idx, int count, int offset, byte[] data); + private static native int nGetB(long self, int row, int col, int count, byte[] vals); + private static native int nGetBIdx(long self, int[] idx, int count, byte[] vals); + private static native int nGetS(long self, int row, int col, int count, short[] vals); + private static native int nGetSIdx(long self, int[] idx, int count, short[] vals); + private static native int nGetI(long self, int row, int col, int count, int[] vals); + private static native int nGetIIdx(long self, int[] idx, int count, int[] vals); + private static native int nGetF(long self, int row, int col, int count, float[] vals); + private static native int nGetFIdx(long self, int[] idx, int count, float[] vals); + private static native int nGetD(long self, int row, int col, int count, double[] vals); + 
private static native int nGetDIdx(long self, int[] idx, int count, double[] vals); + private static native double[] nGet(long self, int row, int col); + private static native double[] nGetIdx(long self, int[] idx); + private static native String nDump(long self); } diff --git a/modules/core/misc/java/test/MatTest.java b/modules/core/misc/java/test/MatTest.java index f0f4fdc200..4c8c52b09c 100644 --- a/modules/core/misc/java/test/MatTest.java +++ b/modules/core/misc/java/test/MatTest.java @@ -185,6 +185,16 @@ public class MatTest extends OpenCVTestCase { assertEquals(CvType.CV_16U, dst.type()); } + public void testCreateIntArrayInt() { + int[] dims = new int[] {5, 6, 7}; + dst.create(dims, CvType.CV_16U); + + assertEquals(5, dst.size(0)); + assertEquals(6, dst.size(1)); + assertEquals(7, dst.size(2)); + assertEquals(CvType.CV_16U, dst.type()); + } + public void testCross() { Mat answer = new Mat(1, 3, CvType.CV_32F); answer.put(0, 0, 7.0, 1.0, -5.0); @@ -569,6 +579,15 @@ public class MatTest extends OpenCVTestCase { assertMatEqual(truth, dst, EPS); } + public void testMatMatRangeArray() { + dst = new Mat(gray255_32f_3d, new Range[]{new Range(0, 5), new Range(0, 5), new Range(0, 5)}); + + truth = new Mat(new int[] {5, 5, 5}, CvType.CV_32FC1, new Scalar(255)); + + assertFalse(dst.empty()); + assertMatEqual(truth, dst, EPS); + } + public void testMatMatRect() { Mat m = new Mat(7, 6, CvType.CV_32SC1); m.put(0, 0, @@ -606,6 +625,13 @@ public class MatTest extends OpenCVTestCase { assertMatEqual(gray255_32f, dst, EPS); } + public void testMatIntArrayIntScalar() { + dst = new Mat(new int[]{10, 10, 10}, CvType.CV_32F, new Scalar(255)); + + assertFalse(dst.empty()); + assertMatEqual(gray255_32f_3d, dst, EPS); + } + public void testMulMat() { assertMatEqual(gray0, gray0.mul(gray255)); @@ -619,6 +645,16 @@ public class MatTest extends OpenCVTestCase { } + public void testMulMat3d() { + Mat m1 = new Mat(new int[] {2, 2, 2}, CvType.CV_32F, new Scalar(2)); + Mat m2 = new Mat(new 
int[] {2, 2, 2}, CvType.CV_32F, new Scalar(3)); + + dst = m1.mul(m2); + + truth = new Mat(new int[] {2, 2, 2}, CvType.CV_32F, new Scalar(6)); + assertMatEqual(truth, dst, EPS); + } + public void testMulMatDouble() { Mat m1 = new Mat(2, 2, CvType.CV_32F, new Scalar(2)); Mat m2 = new Mat(2, 2, CvType.CV_32F, new Scalar(3)); @@ -642,6 +678,12 @@ public class MatTest extends OpenCVTestCase { assertMatEqual(truth, dst); } + public void testOnesIntArrayInt() { + dst = Mat.ones(new int[]{2, 2, 2}, CvType.CV_16S); + truth = new Mat(new int[]{2, 2, 2}, CvType.CV_16S, new Scalar(1)); + assertMatEqual(truth, dst); + } + public void testPush_back() { Mat m1 = new Mat(2, 4, CvType.CV_32F, new Scalar(2)); Mat m2 = new Mat(3, 4, CvType.CV_32F, new Scalar(3)); @@ -699,6 +741,46 @@ public class MatTest extends OpenCVTestCase { } } + public void testPutIntArrayByteArray() { + Mat m = new Mat(new int[]{5, 5, 5}, CvType.CV_8UC3, new Scalar(1, 2, 3)); + Mat sm = m.submat(new Range[]{ new Range(0, 2), new Range(1, 3), new Range(2, 4)}); + byte[] buff = new byte[] { 0, 0, 0, 0, 0, 0 }; + byte[] buff0 = new byte[] { 10, 20, 30, 40, 50, 60 }; + byte[] buff1 = new byte[] { -1, -2, -3, -4, -5, -6 }; + + int bytesNum = m.put(new int[]{1, 2, 0}, buff0); + + assertEquals(6, bytesNum); + bytesNum = m.get(new int[]{1, 2, 0}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, buff0)); + + bytesNum = sm.put(new int[]{0, 0, 0}, buff1); + + assertEquals(6, bytesNum); + bytesNum = sm.get(new int[]{0, 0, 0}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, buff1)); + + bytesNum = m.get(new int[]{0, 1, 2}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, buff1)); + + Mat m1 = m.submat(new Range[]{ new Range(1,2), Range.all(), Range.all() }); + bytesNum = m1.get(new int[]{ 0, 2, 0}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, buff0)); + + try { + byte[] bytes2 = new byte[] { 10, 20, 30, 40, 50 }; + m.put(new int[]{ 2, 2, 0 
}, bytes2); + fail("Expected UnsupportedOperationException (data.length % CvType.channels(t) != 0)"); + } catch (UnsupportedOperationException e) { + // expected + } + + } + public void testPutIntIntDoubleArray() { Mat m = new Mat(5, 5, CvType.CV_8UC3, new Scalar(1, 2, 3)); Mat sm = m.submat(2, 4, 3, 5); @@ -722,6 +804,29 @@ public class MatTest extends OpenCVTestCase { assertTrue(Arrays.equals(buff, new byte[]{-1, -2, -3, -4, -5, -6})); } + public void testPutIntArrayDoubleArray() { + Mat m = new Mat(new int[]{5, 5, 5}, CvType.CV_8UC3, new Scalar(1, 2, 3)); + Mat sm = m.submat(new Range[]{ new Range(0, 2), new Range(1, 3), new Range(2, 4)}); + byte[] buff = new byte[] { 0, 0, 0, 0, 0, 0 }; + + int bytesNum = m.put(new int[]{1, 2, 0}, 10, 20, 30, 40, 50, 60); + + assertEquals(6, bytesNum); + bytesNum = m.get(new int[]{1, 2, 0}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, new byte[]{10, 20, 30, 40, 50, 60})); + + bytesNum = sm.put(new int[]{0, 0, 0}, 255, 254, 253, 252, 251, 250); + + assertEquals(6, bytesNum); + bytesNum = sm.get(new int[]{0, 0, 0}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, new byte[]{-1, -2, -3, -4, -5, -6})); + bytesNum = m.get(new int[]{0, 1, 2}, buff); + assertEquals(6, bytesNum); + assertTrue(Arrays.equals(buff, new byte[]{-1, -2, -3, -4, -5, -6})); + } + public void testPutIntIntFloatArray() { Mat m = new Mat(5, 5, CvType.CV_32FC3, new Scalar(1, 2, 3)); float[] elements = new float[] { 10, 20, 30, 40, 50, 60 }; @@ -745,6 +850,29 @@ public class MatTest extends OpenCVTestCase { } } + public void testPutIntArrayFloatArray() { + Mat m = new Mat(new int[]{5, 5, 5}, CvType.CV_32FC3, new Scalar(1, 2, 3)); + float[] elements = new float[] { 10, 20, 30, 40, 50, 60 }; + + int bytesNum = m.put(new int[]{0, 4, 3}, elements); + + assertEquals(elements.length * 4, bytesNum); + Mat m1 = m.submat(new Range[]{ Range.all(), new Range(4, 5), Range.all() }); + float buff[] = new float[3]; + bytesNum = 
m1.get(new int[]{ 0, 0, 4 }, buff); + assertEquals(buff.length * 4, bytesNum); + assertTrue(Arrays.equals(new float[]{40, 50, 60}, buff)); + assertArrayEquals(new double[]{10, 20, 30}, m.get(new int[]{ 0, 4, 3 }), EPS); + + try { + float[] elements2 = new float[] { 10, 20, 30, 40, 50 }; + m.put(new int[]{4, 2, 2}, elements2); + fail("Expected UnsupportedOperationException (data.length % CvType.channels(t) != 0)"); + } catch (UnsupportedOperationException e) { + // expected + } + } + public void testPutIntIntIntArray() { Mat m = new Mat(5, 5, CvType.CV_32SC3, new Scalar(-1, -2, -3)); int[] elements = new int[] { 10, 20, 30, 40, 50, 60 }; @@ -768,6 +896,29 @@ public class MatTest extends OpenCVTestCase { } } + public void testPutIntArrayIntArray() { + Mat m = new Mat(new int[]{5, 5, 5}, CvType.CV_32SC3, new Scalar(-1, -2, -3)); + int[] elements = new int[] { 10, 20, 30, 40, 50, 60 }; + + int bytesNum = m.put(new int[]{ 0, 0, 4 }, elements); + + assertEquals(elements.length * 4, bytesNum); + Mat m1 = m.submat(new Range[]{ Range.all(), Range.all(), new Range(4, 5)}); + int buff[] = new int[3]; + bytesNum = m1.get(new int[]{ 0, 0, 0 }, buff); + assertEquals(buff.length * 4, bytesNum); + assertTrue(Arrays.equals(new int[]{ 10, 20, 30 }, buff)); + assertArrayEquals(new double[]{ 40, 50, 60 }, m.get(new int[]{ 0, 1, 0 }), EPS); + + try { + int[] elements2 = new int[] { 10, 20, 30, 40, 50 }; + m.put(new int[] { 2, 2, 0 }, elements2); + fail("Expected UnsupportedOperationException (data.length % CvType.channels(t) != 0)"); + } catch (UnsupportedOperationException e) { + // expected + } + } + public void testPutIntIntShortArray() { Mat m = new Mat(5, 5, CvType.CV_16SC3, new Scalar(-1, -2, -3)); short[] elements = new short[] { 10, 20, 30, 40, 50, 60 }; @@ -790,6 +941,28 @@ public class MatTest extends OpenCVTestCase { } } + public void testPutIntArrayShortArray() { + Mat m = new Mat(new int[]{ 5, 5, 5}, CvType.CV_16SC3, new Scalar(-1, -2, -3)); + short[] elements = new 
short[] { 10, 20, 30, 40, 50, 60 }; + + int bytesNum = m.put(new int[]{ 0, 2, 3 }, elements); + + assertEquals(elements.length * 2, bytesNum); + Mat m1 = m.submat(new Range[]{ Range.all(), Range.all(), new Range(3, 4)}); + short buff[] = new short[3]; + bytesNum = m1.get(new int[]{ 0, 2, 0 }, buff); + assertTrue(Arrays.equals(new short[]{10, 20, 30}, buff)); + assertArrayEquals(new double[]{40, 50, 60}, m.get(new int[]{ 0, 2, 4 }), EPS); + + try { + short[] elements2 = new short[] { 10, 20, 30, 40, 50 }; + m.put(new int[] { 2, 2, 0 }, elements2); + fail("Expected UnsupportedOperationException (data.length % CvType.channels(t) != 0)"); + } catch (UnsupportedOperationException e) { + // expected + } + } + public void testRelease() { assertFalse(gray0.empty()); assertTrue(gray0.rows() > 0); @@ -818,6 +991,7 @@ public class MatTest extends OpenCVTestCase { } public void testReshapeIntIntArray() { + // 2D -> 4D Mat src = new Mat(6, 5, CvType.CV_8UC3, new Scalar(0)); assertEquals(2, src.dims()); assertEquals(src.rows(), src.size(0)); @@ -828,6 +1002,34 @@ public class MatTest extends OpenCVTestCase { assertEquals(newShape.length, dst.dims()); for (int i = 0; i < newShape.length; ++i) assertEquals(newShape[i], dst.size(i)); + + // 3D -> 2D + src = new Mat(new int[]{4, 6, 7}, CvType.CV_8UC3, new Scalar(0)); + assertEquals(3, src.dims()); + assertEquals(4, src.size(0)); + assertEquals(6, src.size(1)); + assertEquals(7, src.size(2)); + + int[] newShape2 = {src.channels() * src.size(2), src.size(0) * src.size(1)}; + dst = src.reshape(1, newShape2); + assertEquals(newShape2.length, dst.dims()); + for (int i = 0; i < newShape2.length; ++i) + assertEquals(newShape2[i], dst.size(i)); + } + + public void testCopySize() { + Mat src = new Mat(new int[]{1, 1, 10, 10}, CvType.CV_8UC1, new Scalar(1)); + assertEquals(4, src.dims()); + assertEquals(1, src.size(0)); + assertEquals(1, src.size(1)); + assertEquals(10, src.size(2)); + assertEquals(10, src.size(3)); + Mat other = new Mat(new 
int[]{10, 10}, src.type()); + + src.copySize(other); + assertEquals(other.dims(), src.dims()); + for (int i = 0; i < other.dims(); ++i) + assertEquals(other.size(i), src.size(i)); } public void testRow() { @@ -949,6 +1151,16 @@ public class MatTest extends OpenCVTestCase { assertEquals(2, submat.cols()); } + public void testSubmatRangeArray() { + Mat submat = gray255_32f_3d.submat(new Range[]{ new Range(2, 4), new Range(2, 4), new Range(3, 6) }); + assertTrue(submat.isSubmatrix()); + assertFalse(submat.isContinuous()); + + assertEquals(2, submat.size(0)); + assertEquals(2, submat.size(1)); + assertEquals(3, submat.size(2)); + } + public void testSubmatRect() { Mat submat = gray255.submat(new Rect(5, 5, gray255.cols() / 2, gray255.rows() / 2)); assertTrue(submat.isSubmatrix()); @@ -1015,6 +1227,13 @@ public class MatTest extends OpenCVTestCase { assertMatEqual(truth, dst); } + public void testZerosIntArray() { + dst = Mat.zeros(new int[]{2, 3, 4}, CvType.CV_16S); + + truth = new Mat(new int[]{2, 3, 4}, CvType.CV_16S, new Scalar(0)); + assertMatEqual(truth, dst); + } + public void testMatFromByteBuffer() { ByteBuffer bbuf = ByteBuffer.allocateDirect(64*64); bbuf.putInt(0x01010101); diff --git a/modules/java/generator/src/cpp/Mat.cpp b/modules/java/generator/src/cpp/Mat.cpp index e222cb5904..1ae2aa6e8c 100644 --- a/modules/java/generator/src/cpp/Mat.cpp +++ b/modules/java/generator/src/cpp/Mat.cpp @@ -32,6 +32,18 @@ static void throwJavaException(JNIEnv *env, const std::exception *e, const char CV_UNUSED(method); // avoid "unused" warning } +// jint could be int or int32_t so casting jint* to int* in general wouldn't work +static std::vector convertJintArrayToVector(JNIEnv* env, jintArray in) { + std::vector out; + int len = env->GetArrayLength(in); + jint* inArray = env->GetIntArrayElements(in, 0); + for ( int i = 0; i < len; i++ ) { + out.push_back(inArray[i]); + } + env->ReleaseIntArrayElements(in, inArray, 0); + return out; +} + extern "C" { @@ -100,6 +112,30 @@ 
JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__III return 0; } +// +// Mat::Mat(int[] sizes, int type) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__I_3II + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__I_3II + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type) +{ + static const char method_name[] = "Mat::n_1Mat__I_3II()"; + try { + LOGD("%s", method_name); + std::vector sizes = convertJintArrayToVector(env, sizesArray); + return (jlong) new Mat( ndims, sizes.data(), type ); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + // @@ -182,6 +218,33 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__DDIDDDD +// +// Mat::Mat(int[] sizes, int type, Scalar s) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__I_3IIDDDD + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type, jdouble s_val0, jdouble s_val1, jdouble s_val2, jdouble s_val3); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__I_3IIDDDD + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type, jdouble s_val0, jdouble s_val1, jdouble s_val2, jdouble s_val3) +{ + static const char method_name[] = "Mat::n_1Mat__I_3IIDDDD()"; + try { + LOGD("%s", method_name); + std::vector sizes = convertJintArrayToVector(env, sizesArray); + Scalar s(s_val0, s_val1, s_val2, s_val3); + return (jlong) new Mat( ndims, sizes.data(), type, s ); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) 
{ + throwJavaException(env, 0, method_name); + } + + return 0; +} + + + // // Mat::Mat(Mat m, Range rowRange, Range colRange = Range::all()) // @@ -207,6 +270,59 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__JIIII return 0; } +jint getObjectIntField(JNIEnv* env, jobject obj, const char * fieldName); + +jint getObjectIntField(JNIEnv* env, jobject obj, const char * fieldName) { + jfieldID fid; /* store the field ID */ + + /* Get a reference to obj's class */ + jclass cls = env->GetObjectClass(obj); + + /* Look for the instance field s in cls */ + fid = env->GetFieldID(cls, fieldName, "I"); + if (fid == NULL) + { + return 0; /* failed to find the field */ + } + + /* Read the instance field s */ + return env->GetIntField(obj, fid); +} + +#define RANGE_START_FIELD "start" +#define RANGE_END_FIELD "end" + +// +// Mat::Mat(Mat m, Range[] ranges) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__J_3Lorg_opencv_core_Range_2 + (JNIEnv* env, jclass, jlong m_nativeObj, jobjectArray rangesArray); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__J_3Lorg_opencv_core_Range_2 + (JNIEnv* env, jclass, jlong m_nativeObj, jobjectArray rangesArray) +{ + static const char method_name[] = "Mat::n_1Mat__J_3Lorg_opencv_core_Range_2()"; + try { + LOGD("%s", method_name); + std::vector ranges; + int rangeCount = env->GetArrayLength(rangesArray); + for (int i = 0; i < rangeCount; i++) { + jobject range = env->GetObjectArrayElement(rangesArray, i); + jint start = getObjectIntField(env, range, RANGE_START_FIELD); + jint end = getObjectIntField(env, range, RANGE_END_FIELD); + ranges.push_back(Range(start, end)); + } + return (jlong) new Mat( (*(Mat*)m_nativeObj), ranges ); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) 
{ + throwJavaException(env, 0, method_name); + } + + return 0; +} + JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1Mat__JII (JNIEnv* env, jclass, jlong m_nativeObj, jint rowRange_start, jint rowRange_end); @@ -718,6 +834,56 @@ JNIEXPORT void JNICALL Java_org_opencv_core_Mat_n_1create__JDDI +// +// void Mat::create(int[] sizes, int type) +// + +JNIEXPORT void JNICALL Java_org_opencv_core_Mat_n_1create__JI_3II + (JNIEnv* env, jclass, jlong self, jint ndims, jintArray sizesArray, jint type); + +JNIEXPORT void JNICALL Java_org_opencv_core_Mat_n_1create__JI_3II + (JNIEnv* env, jclass, jlong self, jint ndims, jintArray sizesArray, jint type) +{ + static const char method_name[] = "Mat::n_1create__JI_3II()"; + try { + LOGD("%s", method_name); + Mat* me = (Mat*) self; + std::vector sizes = convertJintArrayToVector(env, sizesArray); + me->create( ndims, sizes.data(), type ); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } +} + + + +// +// Mat Mat::copySize(Mat m) +// + +JNIEXPORT void JNICALL Java_org_opencv_core_Mat_n_1copySize + (JNIEnv* env, jclass, jlong self, jlong m_nativeObj); + +JNIEXPORT void JNICALL Java_org_opencv_core_Mat_n_1copySize + (JNIEnv* env, jclass, jlong self, jlong m_nativeObj) +{ + static const char method_name[] = "Mat::n_1copySize()"; + try { + LOGD("%s", method_name); + Mat* me = (Mat*) self; + Mat& m = *((Mat*)m_nativeObj); + me->copySize( m ); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) 
{ + throwJavaException(env, 0, method_name); + } +} + + + // // Mat Mat::cross(Mat m) // @@ -1234,6 +1400,33 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1ones__DDI +// +// static Mat Mat::ones(int[] sizes, int type) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1ones__I_3II + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1ones__I_3II + (JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type) +{ + static const char method_name[] = "Mat::n_1ones__I_3II()"; + try { + LOGD("%s", method_name); + std::vector sizes = convertJintArrayToVector(env, sizesArray); + Mat _retval_ = Mat::ones( ndims, sizes.data(), type ); + return (jlong) new Mat(_retval_); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + + + // // void Mat::push_back(Mat m) // @@ -1344,8 +1537,8 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1reshape_11 try { LOGD("%s", method_name); Mat* me = (Mat*) self; //TODO: check for NULL - int* newsz = (int*)env->GetPrimitiveArrayCritical(newshape, 0); - Mat _retval_ = me->reshape( cn, newndims, newsz ); + std::vector newsz = convertJintArrayToVector(env, newshape); + Mat _retval_ = me->reshape( cn, newndims, newsz.data() ); return (jlong) new Mat(_retval_); } catch(const std::exception &e) { throwJavaException(env, &e, method_name); @@ -1649,6 +1842,39 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1submat_1rr return 0; } +// +// Mat Mat::operator()(Range[] ranges) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1submat_1ranges +(JNIEnv* env, jclass, jlong self, jobjectArray rangesArray); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1submat_1ranges +(JNIEnv* env, jclass, jlong self, jobjectArray rangesArray) +{ + static const char method_name[] = "Mat::n_1submat_1ranges()"; + try { + LOGD("%s", method_name); + Mat* 
me = (Mat*) self; + std::vector ranges; + int rangeCount = env->GetArrayLength(rangesArray); + for (int i = 0; i < rangeCount; i++) { + jobject range = env->GetObjectArrayElement(rangesArray, i); + jint start = getObjectIntField(env, range, RANGE_START_FIELD); + jint end = getObjectIntField(env, range, RANGE_END_FIELD); + ranges.push_back(Range(start, end)); + } + Mat _retval_ = me->operator()( ranges ); + return (jlong) new Mat(_retval_); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + // @@ -1811,6 +2037,33 @@ JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1zeros__DDI +// +// static Mat Mat::zeros(int[] sizes, int type) +// + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1zeros__I_3II +(JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type); + +JNIEXPORT jlong JNICALL Java_org_opencv_core_Mat_n_1zeros__I_3II +(JNIEnv* env, jclass, jint ndims, jintArray sizesArray, jint type) +{ + static const char method_name[] = "Mat::n_1zeros__I_3II()"; + try { + LOGD("%s", method_name); + std::vector sizes = convertJintArrayToVector(env, sizesArray); + Mat _retval_ = Mat::zeros( ndims, sizes.data(), type ); + return (jlong) new Mat(_retval_); + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) 
{ + throwJavaException(env, 0, method_name); + } + + return 0; +} + + + // // native support for java finalize() // static void Mat::n_delete( __int64 self ) @@ -1880,6 +2133,50 @@ template static int mat_put(cv::Mat* m, int row, int col, int count, return res; } +// returns true if final index was reached +static bool updateIdx(cv::Mat* m, std::vector& idx, int inc) { + for (int i=m->dims-1; i>=0; i--) { + if (inc == 0) return false; + idx[i] = (idx[i] + 1) % m->size[i]; + inc--; + } + return true; +} + +template static int mat_put_idx(cv::Mat* m, std::vector& idx, int count, int offset, char* buff) +{ + if(! m) return 0; + if(! buff) return 0; + + count *= sizeof(T); + int rest = (int)m->elemSize(); + for (int i = 0; i < m->dims; i++) { + rest *= (m->size[i] - idx[i]); + } + if(count>rest) count = rest; + int res = count; + + if( m->isContinuous() ) + { + memcpy(m->ptr(idx.data()), buff + offset, count); + } else { + // dim by dim + int num = (m->size[m->dims-1] - idx[m->dims-1]) * (int)m->elemSize(); // 1st partial row + if(countptr(idx.data()); + while(count>0){ + memcpy(data, buff + offset, num); + updateIdx(m, idx, num / (int)m->elemSize()); + count -= num; + buff += num; + num = m->size[m->dims-1] * (int)m->elemSize(); + if(countptr(idx.data()); + } + } + return res; +} + template static jint java_mat_put(JNIEnv* env, jlong self, jint row, jint col, jint count, jint offset, ARRAY vals) { static const char *method_name = JavaOpenCVTrait::put; @@ -1903,6 +2200,31 @@ template static jint java_mat_put(JNIEnv* env, jlong self, jint row return 0; } +template static jint java_mat_put_idx(JNIEnv* env, jlong self, jintArray idxArray, jint count, jint offset, ARRAY vals) +{ + static const char *method_name = JavaOpenCVTrait::put; + try { + LOGD("%s", method_name); + cv::Mat* me = (cv::Mat*) self; + if(! 
self) return 0; // no native object behind + if(me->depth() != JavaOpenCVTrait::cvtype_1 && me->depth() != JavaOpenCVTrait::cvtype_2) return 0; // incompatible type + std::vector idx = convertJintArrayToVector(env, idxArray); + for (int i = 0; i < me->dims ; i++ ) { + if (me->size[i]<=idx[i]) return 0; + } + char* values = (char*)env->GetPrimitiveArrayCritical(vals, 0); + int res = mat_put_idx::value_type>(me, idx, count, offset, values); + env->ReleasePrimitiveArrayCritical(vals, values, JNI_ABORT); + return res; + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + extern "C" { JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutB @@ -1914,6 +2236,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutB return java_mat_put(env, self, row, col, count, 0, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jbyteArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jbyteArray vals) +{ + return java_mat_put_idx(env, self, idxArray, count, 0, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBwOffset (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jint offset, jbyteArray vals); @@ -1923,6 +2254,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBwOffset return java_mat_put(env, self, row, col, count, offset, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBwIdxOffset + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jint offset, jbyteArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutBwIdxOffset + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jint offset, jbyteArray vals) +{ + return java_mat_put_idx(env, self, idxArray, count, offset, vals); +} + JNIEXPORT jint JNICALL 
Java_org_opencv_core_Mat_nPutS (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jshortArray vals); @@ -1932,6 +2272,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutS return java_mat_put(env, self, row, col, count, 0, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutSIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jshortArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutSIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jshortArray vals) +{ + return java_mat_put_idx(env, self, idxArray, count, 0, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutI (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jintArray vals); @@ -1941,6 +2290,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutI return java_mat_put(env, self, row, col, count, 0, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutIIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jintArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutIIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jintArray vals) +{ + return java_mat_put_idx(env, self, idxArray, count, 0, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutF (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jfloatArray vals); @@ -1950,6 +2308,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutF return java_mat_put(env, self, row, col, count, 0, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutFIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jfloatArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutFIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jfloatArray vals) +{ + return java_mat_put_idx(env, self, idxArray, count, 0, vals); +} + // unlike other nPut()-s this one (with double[]) should convert input values to correct type #define 
PUT_ITEM(T, R, C) { T*dst = (T*)me->ptr(R, C); for(int ch=0; chchannels() && count>0; count--,ch++,src++,dst++) *dst = cv::saturate_cast(*src); } @@ -2010,6 +2377,56 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutD return 0; } +// unlike other nPut()-s this one (with double[]) should convert input values to correct type +#define PUT_ITEM_IDX(T, I) { T*dst = (T*)me->ptr(I); for(int ch=0; chchannels() && count>0; count--,ch++,src++,dst++) *dst = cv::saturate_cast(*src); } + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutDIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jdoubleArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nPutDIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jdoubleArray vals) +{ + static const char* method_name = JavaOpenCVTrait::put; + try { + LOGD("%s", method_name); + cv::Mat* me = (cv::Mat*) self; + if(!me || !me->data) return 0; // no native object behind + std::vector idx = convertJintArrayToVector(env, idxArray); + for (int i=0; idims; i++) { + if (me->size[i]<=idx[i]) return 0; // indexes out of range + } + int rest = me->channels(); + for (int i=0; idims; i++) { + rest *= (me->size[i] - idx[i]); + } + if(count>rest) count = rest; + int res = count; + double* values = (double*)env->GetPrimitiveArrayCritical(vals, 0); + double* src = values; + bool reachedFinalIndex = false; + for(; !reachedFinalIndex && count>0; reachedFinalIndex = updateIdx(me, idx, 1)) + { + switch(me->depth()) { + case CV_8U: PUT_ITEM_IDX(uchar, idx.data()); break; + case CV_8S: PUT_ITEM_IDX(schar, idx.data()); break; + case CV_16U: PUT_ITEM_IDX(ushort, idx.data()); break; + case CV_16S: PUT_ITEM_IDX(short, idx.data()); break; + case CV_32S: PUT_ITEM_IDX(int, idx.data()); break; + case CV_32F: PUT_ITEM_IDX(float, idx.data()); break; + case CV_64F: PUT_ITEM_IDX(double, idx.data()); break; + } + } + env->ReleasePrimitiveArrayCritical(vals, values, 0); + return res; + } catch(const std::exception 
&e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + } // extern "C" template static int mat_get(cv::Mat* m, int row, int col, int count, char* buff) @@ -2042,6 +2459,40 @@ template static int mat_get(cv::Mat* m, int row, int col, int count, return res; } +template static int mat_get_idx(cv::Mat* m, std::vector& idx, int count, char* buff) +{ + if(! m) return 0; + if(! buff) return 0; + + count *= sizeof(T); + int rest = (int)m->elemSize(); + for (int i = 0; i < m->dims; i++) { + rest *= (m->size[i] - idx[i]); + } + if(count>rest) count = rest; + int res = count; + + if( m->isContinuous() ) + { + memcpy(buff, m->ptr(idx.data()), count); + } else { + // dim by dim + int num = (m->size[m->dims-1] - idx[m->dims-1]) * (int)m->elemSize(); // 1st partial row + if(countptr(idx.data()); + while(count>0){ + memcpy(buff, data, num); + updateIdx(m, idx, num / (int)m->elemSize()); + count -= num; + buff += num; + num = m->size[m->dims-1] * (int)m->elemSize(); + if(countptr(idx.data()); + } + } + return res; +} + template static jint java_mat_get(JNIEnv* env, jlong self, jint row, jint col, jint count, ARRAY vals) { static const char *method_name = JavaOpenCVTrait::get; try { @@ -2064,6 +2515,31 @@ template static jint java_mat_get(JNIEnv* env, jlong self, jint row return 0; } +template static jint java_mat_get_idx(JNIEnv* env, jlong self, jintArray idxArray, jint count, ARRAY vals) { + static const char *method_name = JavaOpenCVTrait::get; + try { + LOGD("%s", method_name); + cv::Mat* me = (cv::Mat*) self; + if(! 
self) return 0; // no native object behind + if(me->depth() != JavaOpenCVTrait::cvtype_1 && me->depth() != JavaOpenCVTrait::cvtype_2) return 0; // incompatible type + std::vector idx = convertJintArrayToVector(env, idxArray); + for (int i = 0; i < me->dims ; i++ ) { + if (me->size[i]<=idx[i]) return 0; + } + + char* values = (char*)env->GetPrimitiveArrayCritical(vals, 0); + int res = mat_get_idx::value_type>(me, idx, count, values); + env->ReleasePrimitiveArrayCritical(vals, values, 0); + return res; + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) { + throwJavaException(env, 0, method_name); + } + + return 0; +} + extern "C" { JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetB @@ -2075,6 +2551,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetB return java_mat_get(env, self, row, col, count, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetBIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jbyteArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetBIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jbyteArray vals) +{ + return java_mat_get_idx(env, self, idxArray, count, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetS (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jshortArray vals); @@ -2084,6 +2569,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetS return java_mat_get(env, self, row, col, count, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetSIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jshortArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetSIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jshortArray vals) +{ + return java_mat_get_idx(env, self, idxArray, count, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetI (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jintArray vals); 
@@ -2093,6 +2587,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetI return java_mat_get(env, self, row, col, count, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetIIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jintArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetIIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jintArray vals) +{ + return java_mat_get_idx(env, self, idxArray, count, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetF (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jfloatArray vals); @@ -2102,6 +2605,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetF return java_mat_get(env, self, row, col, count, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetFIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jfloatArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetFIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jfloatArray vals) +{ + return java_mat_get_idx(env, self, idxArray, count, vals); +} + JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetD (JNIEnv* env, jclass, jlong self, jint row, jint col, jint count, jdoubleArray vals); @@ -2111,6 +2623,15 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetD return java_mat_get(env, self, row, col, count, vals); } +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetDIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jdoubleArray vals); + +JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_nGetDIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray, jint count, jdoubleArray vals) +{ + return java_mat_get_idx(env, self, idxArray, count, vals); +} + JNIEXPORT jdoubleArray JNICALL Java_org_opencv_core_Mat_nGet (JNIEnv* env, jclass, jlong self, jint row, jint col); @@ -2149,6 +2670,47 @@ JNIEXPORT jdoubleArray JNICALL Java_org_opencv_core_Mat_nGet return 0; } +JNIEXPORT jdoubleArray 
JNICALL Java_org_opencv_core_Mat_nGetIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray); + +JNIEXPORT jdoubleArray JNICALL Java_org_opencv_core_Mat_nGetIdx + (JNIEnv* env, jclass, jlong self, jintArray idxArray) +{ + static const char method_name[] = "Mat::nGetIdx()"; + try { + LOGD("%s", method_name); + cv::Mat* me = (cv::Mat*) self; + if(! self) return 0; // no native object behind + std::vector idx = convertJintArrayToVector(env, idxArray); + for (int i=0; idims; i++) { + if (me->size[i]<=idx[i]) return 0; // indexes out of range + } + + jdoubleArray res = env->NewDoubleArray(me->channels()); + if(res){ + jdouble buff[CV_CN_MAX];//me->channels() + int i; + switch(me->depth()){ + case CV_8U: for(i=0; ichannels(); i++) buff[i] = *((unsigned char*) me->ptr(idx.data()) + i); break; + case CV_8S: for(i=0; ichannels(); i++) buff[i] = *((signed char*) me->ptr(idx.data()) + i); break; + case CV_16U: for(i=0; ichannels(); i++) buff[i] = *((unsigned short*)me->ptr(idx.data()) + i); break; + case CV_16S: for(i=0; ichannels(); i++) buff[i] = *((signed short*) me->ptr(idx.data()) + i); break; + case CV_32S: for(i=0; ichannels(); i++) buff[i] = *((int*) me->ptr(idx.data()) + i); break; + case CV_32F: for(i=0; ichannels(); i++) buff[i] = *((float*) me->ptr(idx.data()) + i); break; + case CV_64F: for(i=0; ichannels(); i++) buff[i] = *((double*) me->ptr(idx.data()) + i); break; + } + env->SetDoubleArrayRegion(res, 0, me->channels(), buff); + } + return res; + } catch(const std::exception &e) { + throwJavaException(env, &e, method_name); + } catch (...) 
{ + throwJavaException(env, 0, method_name); + } + + return 0; +} + JNIEXPORT jstring JNICALL Java_org_opencv_core_Mat_nDump (JNIEnv *env, jclass, jlong self); diff --git a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java index c3af0b343b..cc7eb9dca7 100644 --- a/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java +++ b/modules/java/test/android_test/src/org/opencv/test/OpenCVTestCase.java @@ -99,6 +99,8 @@ public class OpenCVTestCase extends TestCase { protected Mat rgbLena; protected Mat grayChess; + protected Mat gray255_32f_3d; + protected Mat v1; protected Mat v2; @@ -149,6 +151,8 @@ public class OpenCVTestCase extends TestCase { rgbLena = Imgcodecs.imread(OpenCVTestRunner.LENA_PATH); grayChess = Imgcodecs.imread(OpenCVTestRunner.CHESS_PATH, 0); + gray255_32f_3d = new Mat(new int[]{matSize, matSize, matSize}, CvType.CV_32F, new Scalar(255.0)); + v1 = new Mat(1, 3, CvType.CV_32F); v1.put(0, 0, 1.0, 3.0, 2.0); v2 = new Mat(1, 3, CvType.CV_32F); @@ -184,6 +188,7 @@ public class OpenCVTestCase extends TestCase { rgba128.release(); rgbLena.release(); grayChess.release(); + gray255_32f_3d.release(); v1.release(); v2.release(); @@ -442,8 +447,24 @@ public class OpenCVTestCase extends TestCase { assertEquals(msg, expected.z, actual.z, eps); } + static private boolean dimensionsEqual(Mat expected, Mat actual) { + if (expected.dims() != actual.dims()) { + return false; + } + if (expected.dims() > 2) { + for (int i = 0; i < expected.dims(); i++) { + if (expected.size(i) != actual.size(i)) { + return false; + } + } + return true; + } else { + return expected.cols() == actual.cols() && expected.rows() == actual.rows(); + } + } + static private void compareMats(Mat expected, Mat actual, boolean isEqualityMeasured) { - if (expected.type() != actual.type() || expected.cols() != actual.cols() || expected.rows() != actual.rows()) { + if (expected.type() != 
actual.type() || !dimensionsEqual(expected, actual)) { throw new UnsupportedOperationException("Can not compare " + expected + " and " + actual); } @@ -471,7 +492,7 @@ public class OpenCVTestCase extends TestCase { } static private void compareMats(Mat expected, Mat actual, double eps, boolean isEqualityMeasured) { - if (expected.type() != actual.type() || expected.cols() != actual.cols() || expected.rows() != actual.rows()) { + if (expected.type() != actual.type() || !dimensionsEqual(expected, actual)) { throw new UnsupportedOperationException("Can not compare " + expected + " and " + actual); } diff --git a/modules/java/test/pure_test/src/org/opencv/test/OpenCVTestCase.java b/modules/java/test/pure_test/src/org/opencv/test/OpenCVTestCase.java index a66206e223..7dc3432add 100644 --- a/modules/java/test/pure_test/src/org/opencv/test/OpenCVTestCase.java +++ b/modules/java/test/pure_test/src/org/opencv/test/OpenCVTestCase.java @@ -97,6 +97,8 @@ public class OpenCVTestCase extends TestCase { protected Mat rgbLena; protected Mat grayChess; + protected Mat gray255_32f_3d; + protected Mat v1; protected Mat v2; @@ -175,6 +177,8 @@ public class OpenCVTestCase extends TestCase { rgbLena = Imgcodecs.imread(OpenCVTestRunner.LENA_PATH); grayChess = Imgcodecs.imread(OpenCVTestRunner.CHESS_PATH, 0); + gray255_32f_3d = new Mat(new int[]{matSize, matSize, matSize}, CvType.CV_32F, new Scalar(255.0)); + v1 = new Mat(1, 3, CvType.CV_32F); v1.put(0, 0, 1.0, 3.0, 2.0); v2 = new Mat(1, 3, CvType.CV_32F); @@ -210,6 +214,7 @@ public class OpenCVTestCase extends TestCase { rgba128.release(); rgbLena.release(); grayChess.release(); + gray255_32f_3d.release(); v1.release(); v2.release(); @@ -468,8 +473,24 @@ public class OpenCVTestCase extends TestCase { assertEquals(msg, expected.z, actual.z, eps); } + static private boolean dimensionsEqual(Mat expected, Mat actual) { + if (expected.dims() != actual.dims()) { + return false; + } + if (expected.dims() > 2) { + for (int i = 0; i < 
expected.dims(); i++) { + if (expected.size(i) != actual.size(i)) { + return false; + } + } + return true; + } else { + return expected.cols() == actual.cols() && expected.rows() == actual.rows(); + } + } + static private void compareMats(Mat expected, Mat actual, boolean isEqualityMeasured) { - if (expected.type() != actual.type() || expected.cols() != actual.cols() || expected.rows() != actual.rows()) { + if (expected.type() != actual.type() || !dimensionsEqual(expected, actual)) { throw new UnsupportedOperationException("Can not compare " + expected + " and " + actual); } @@ -497,7 +518,7 @@ public class OpenCVTestCase extends TestCase { } static private void compareMats(Mat expected, Mat actual, double eps, boolean isEqualityMeasured) { - if (expected.type() != actual.type() || expected.cols() != actual.cols() || expected.rows() != actual.rows()) { + if (expected.type() != actual.type() || !dimensionsEqual(expected, actual)) { throw new UnsupportedOperationException("Can not compare " + expected + " and " + actual); } From 842c58a7d6da15459900bb55dafc8b064f7b7882 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 11 Mar 2019 12:01:40 +0000 Subject: [PATCH 12/21] core(intrin): NEON v_load_expand_q() support unaligned addr --- modules/core/include/opencv2/core/hal/intrin_neon.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index e131909845..3b946ff7c6 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1202,14 +1202,16 @@ OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { - uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr); + typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint; + uint8x8_t v0 = vcreate_u8(*(unaligned_uint*)ptr); uint16x4_t v1 = vget_low_u16(vmovl_u8(v0)); return 
v_uint32x4(vmovl_u16(v1)); } inline v_int32x4 v_load_expand_q(const schar* ptr) { - int8x8_t v0 = vcreate_s8(*(unsigned*)ptr); + typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint; + int8x8_t v0 = vcreate_s8(*(unaligned_uint*)ptr); int16x4_t v1 = vget_low_s16(vmovl_s8(v0)); return v_int32x4(vmovl_s16(v1)); } From 6eac8f78b9144cbd311a8e17c3f0ea4f1792b8f4 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 10:56:50 +0000 Subject: [PATCH 13/21] imgproc: copy .simd.hpp --- .../src/{bilateral_filter.cpp => bilateral_filter.simd.hpp} | 0 modules/imgproc/src/{box_filter.cpp => box_filter.simd.hpp} | 0 modules/imgproc/src/{filter.cpp => filter.simd.hpp} | 0 modules/imgproc/src/{median_blur.cpp => median_blur.simd.hpp} | 0 modules/imgproc/src/{morph.cpp => morph.simd.hpp} | 0 modules/imgproc/src/{smooth.cpp => smooth.simd.hpp} | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename modules/imgproc/src/{bilateral_filter.cpp => bilateral_filter.simd.hpp} (100%) rename modules/imgproc/src/{box_filter.cpp => box_filter.simd.hpp} (100%) rename modules/imgproc/src/{filter.cpp => filter.simd.hpp} (100%) rename modules/imgproc/src/{median_blur.cpp => median_blur.simd.hpp} (100%) rename modules/imgproc/src/{morph.cpp => morph.simd.hpp} (100%) rename modules/imgproc/src/{smooth.cpp => smooth.simd.hpp} (100%) diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.simd.hpp similarity index 100% rename from modules/imgproc/src/bilateral_filter.cpp rename to modules/imgproc/src/bilateral_filter.simd.hpp diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.simd.hpp similarity index 100% rename from modules/imgproc/src/box_filter.cpp rename to modules/imgproc/src/box_filter.simd.hpp diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.simd.hpp similarity index 100% rename from modules/imgproc/src/filter.cpp rename to modules/imgproc/src/filter.simd.hpp diff --git 
a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.simd.hpp similarity index 100% rename from modules/imgproc/src/median_blur.cpp rename to modules/imgproc/src/median_blur.simd.hpp diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.simd.hpp similarity index 100% rename from modules/imgproc/src/morph.cpp rename to modules/imgproc/src/morph.simd.hpp diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.simd.hpp similarity index 100% rename from modules/imgproc/src/smooth.cpp rename to modules/imgproc/src/smooth.simd.hpp From 9dc755408982d4416260d9f7ee6bccc23c2a333d Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 10:57:05 +0000 Subject: [PATCH 14/21] imgproc: copy .dispatch.cpp --- .../src/{bilateral_filter.cpp => bilateral_filter.dispatch.cpp} | 0 modules/imgproc/src/{box_filter.cpp => box_filter.dispatch.cpp} | 0 modules/imgproc/src/{filter.cpp => filter.dispatch.cpp} | 0 modules/imgproc/src/{median_blur.cpp => median_blur.dispatch.cpp} | 0 modules/imgproc/src/{morph.cpp => morph.dispatch.cpp} | 0 modules/imgproc/src/{smooth.cpp => smooth.dispatch.cpp} | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename modules/imgproc/src/{bilateral_filter.cpp => bilateral_filter.dispatch.cpp} (100%) rename modules/imgproc/src/{box_filter.cpp => box_filter.dispatch.cpp} (100%) rename modules/imgproc/src/{filter.cpp => filter.dispatch.cpp} (100%) rename modules/imgproc/src/{median_blur.cpp => median_blur.dispatch.cpp} (100%) rename modules/imgproc/src/{morph.cpp => morph.dispatch.cpp} (100%) rename modules/imgproc/src/{smooth.cpp => smooth.dispatch.cpp} (100%) diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp similarity index 100% rename from modules/imgproc/src/bilateral_filter.cpp rename to modules/imgproc/src/bilateral_filter.dispatch.cpp diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.dispatch.cpp 
similarity index 100% rename from modules/imgproc/src/box_filter.cpp rename to modules/imgproc/src/box_filter.dispatch.cpp diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.dispatch.cpp similarity index 100% rename from modules/imgproc/src/filter.cpp rename to modules/imgproc/src/filter.dispatch.cpp diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.dispatch.cpp similarity index 100% rename from modules/imgproc/src/median_blur.cpp rename to modules/imgproc/src/median_blur.dispatch.cpp diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.dispatch.cpp similarity index 100% rename from modules/imgproc/src/morph.cpp rename to modules/imgproc/src/morph.dispatch.cpp diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.dispatch.cpp similarity index 100% rename from modules/imgproc/src/smooth.cpp rename to modules/imgproc/src/smooth.dispatch.cpp From 9a8dbfd57fab0b9a7777f4baad0da8d23f8a8756 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 12:21:23 +0000 Subject: [PATCH 15/21] imgproc: dispatch filter.cpp --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/filter.dispatch.cpp | 2985 +---------------------- modules/imgproc/src/filter.hpp | 2 + modules/imgproc/src/filter.simd.hpp | 1559 ++---------- 4 files changed, 258 insertions(+), 4289 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 6232aa5fab..d3afe151bd 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,5 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp index 
43200218dc..b6f5331028 100644 --- a/modules/imgproc/src/filter.dispatch.cpp +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -47,19 +47,15 @@ #include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" +#include "filter.simd.hpp" +#include "filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + /****************************************************************************************\ Base Image Filter \****************************************************************************************/ -#if IPP_VERSION_X100 >= 710 -#define USE_IPP_SEP_FILTERS 1 -#else -#undef USE_IPP_SEP_FILTERS -#endif - -namespace cv -{ +namespace cv { BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } BaseRowFilter::~BaseRowFilter() {} @@ -163,107 +159,12 @@ void FilterEngine::init( const Ptr& _filter2D, #define VEC_ALIGN CV_MALLOC_ALIGN -int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs) +int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs) { - int i, j; + CV_INSTRUMENT_REGION(); - wholeSize = _wholeSize; - roi = Rect(ofs, sz); - CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && - roi.x + roi.width <= wholeSize.width && - roi.y + roi.height <= wholeSize.height ); - - int esz = (int)getElemSize(srcType); - int bufElemSize = (int)getElemSize(bufType); - const uchar* constVal = !constBorderValue.empty() ? 
&constBorderValue[0] : 0; - - int _maxBufRows = std::max(ksize.height + 3, - std::max(anchor.y, - ksize.height-anchor.y-1)*2+1); - - if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) - { - rows.resize(_maxBufRows); - maxWidth = std::max(maxWidth, roi.width); - int cn = CV_MAT_CN(srcType); - srcRow.resize(esz*(maxWidth + ksize.width - 1)); - if( columnBorderType == BORDER_CONSTANT ) - { - CV_Assert(constVal != NULL); - constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); - uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; - int n = (int)constBorderValue.size(), N; - N = (maxWidth + ksize.width - 1)*esz; - tdst = isSeparable() ? &srcRow[0] : dst; - - for( i = 0; i < N; i += n ) - { - n = std::min( n, N - i ); - for(j = 0; j < n; j++) - tdst[i+j] = constVal[j]; - } - - if( isSeparable() ) - (*rowFilter)(&srcRow[0], dst, maxWidth, cn); - } - - int maxBufStep = bufElemSize*(int)alignSize(maxWidth + - (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); - } - - // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - - dx1 = std::max(anchor.x - roi.x, 0); - dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); - - // recompute border tables - if( dx1 > 0 || dx2 > 0 ) - { - if( rowBorderType == BORDER_CONSTANT ) - { - CV_Assert(constVal != NULL); - int nr = isSeparable() ? 1 : (int)rows.size(); - for( i = 0; i < nr; i++ ) - { - uchar* dst = isSeparable() ? 
&srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; - memcpy( dst, constVal, dx1*esz ); - memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); - } - } - else - { - int xofs1 = std::min(roi.x, anchor.x) - roi.x; - - int btab_esz = borderElemSize, wholeWidth = wholeSize.width; - int* btab = (int*)&borderTab[0]; - - for( i = 0; i < dx1; i++ ) - { - int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; - for( j = 0; j < btab_esz; j++ ) - btab[i*btab_esz + j] = p0 + j; - } - - for( i = 0; i < dx2; i++ ) - { - int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; - for( j = 0; j < btab_esz; j++ ) - btab[(i + dx1)*btab_esz + j] = p0 + j; - } - } - } - - rowCount = dstY = 0; - startY = startY0 = std::max(roi.y - anchor.y, 0); - endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); - if( columnFilter ) - columnFilter->reset(); - if( filter2D ) - filter2D->reset(); - - return startY; + CV_CPU_DISPATCH(FilterEngine__start, (*this, _wholeSize, sz, ofs), + CV_CPU_DISPATCH_MODES_ALL); } @@ -283,126 +184,33 @@ int FilterEngine::remainingOutputRows() const return roi.height - dstY; } -int FilterEngine::proceed( const uchar* src, int srcstep, int count, - uchar* dst, int dststep ) -{ - CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - - const int *btab = &borderTab[0]; - int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; - uchar** brows = &rows[0]; - int bufRows = (int)rows.size(); - int cn = CV_MAT_CN(bufType); - int width = roi.width, kwidth = ksize.width; - int kheight = ksize.height, ay = anchor.y; - int _dx1 = dx1, _dx2 = dx2; - int width1 = roi.width + kwidth - 1; - int xofs1 = std::min(roi.x, anchor.x); - bool isSep = isSeparable(); - bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; - int dy = 0, i = 0; - - src -= xofs1*esz; - count = std::min(count, remainingInputRows()); - - CV_Assert( src && dst && 
count > 0 ); - - for(;; dst += dststep*i, dy += i) - { - int dcount = bufRows - ay - startY - rowCount + roi.y; - dcount = dcount > 0 ? dcount : bufRows - kheight + 1; - dcount = std::min(dcount, count); - count -= dcount; - for( ; dcount-- > 0; src += srcstep ) - { - int bi = (startY - startY0 + rowCount) % bufRows; - uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - uchar* row = isSep ? &srcRow[0] : brow; - - if( ++rowCount > bufRows ) - { - --rowCount; - ++startY; - } - - memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); - - if( makeBorder ) - { - if( btab_esz*(int)sizeof(int) == esz ) - { - const int* isrc = (const int*)src; - int* irow = (int*)row; - - for( i = 0; i < _dx1*btab_esz; i++ ) - irow[i] = isrc[btab[i]]; - for( i = 0; i < _dx2*btab_esz; i++ ) - irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]]; - } - else - { - for( i = 0; i < _dx1*esz; i++ ) - row[i] = src[btab[i]]; - for( i = 0; i < _dx2*esz; i++ ) - row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]]; - } - } - - if( isSep ) - (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); - } - - int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); - for( i = 0; i < max_i; i++ ) - { - int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, - wholeSize.height, columnBorderType); - if( srcY < 0 ) // can happen only with constant border type - brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); - else - { - CV_Assert( srcY >= startY ); - if( srcY >= startY + rowCount ) - break; - int bi = (srcY - startY0) % bufRows; - brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - } - } - if( i < kheight ) - break; - i -= kheight - 1; - if( isSeparable() ) - (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); - else - (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); - } - - dstY += dy; - CV_Assert( dstY <= roi.height ); - return dy; -} - -void FilterEngine::apply(const Mat& src, Mat& dst, const Size & wsz, const Point & 
ofs) +int FilterEngine::proceed(const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { CV_INSTRUMENT_REGION(); - CV_Assert( src.type() == srcType && dst.type() == dstType ); + CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - int y = start(src, wsz, ofs); - proceed(src.ptr() + y*src.step, - (int)src.step, - endY - startY, - dst.ptr(), - (int)dst.step ); + CV_CPU_DISPATCH(FilterEngine__proceed, (*this, src, srcstep, count, dst, dststep), + CV_CPU_DISPATCH_MODES_ALL); } +void FilterEngine::apply(const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) +{ + CV_INSTRUMENT_REGION(); + + CV_CheckTypeEQ(src.type(), srcType, ""); + CV_CheckTypeEQ(dst.type(), dstType, ""); + + CV_CPU_DISPATCH(FilterEngine__apply, (*this, src, dst, wsz, ofs), + CV_CPU_DISPATCH_MODES_ALL); } /****************************************************************************************\ * Separable linear filter * \****************************************************************************************/ -int cv::getKernelType(InputArray filter_kernel, Point anchor) +int getKernelType(InputArray filter_kernel, Point anchor) { Mat _kernel = filter_kernel.getMat(); CV_Assert( _kernel.channels() == 1 ); @@ -439,2626 +247,39 @@ int cv::getKernelType(InputArray filter_kernel, Point anchor) } -namespace cv +Ptr getLinearRowFilter( + int srcType, int bufType, + InputArray _kernel, int anchor, + int symmetryType) { + CV_INSTRUMENT_REGION(); -struct RowNoVec -{ - RowNoVec() {} - RowNoVec(const Mat&) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct ColumnNoVec -{ - ColumnNoVec() {} - ColumnNoVec(const Mat&, int, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - -struct SymmRowSmallNoVec -{ - SymmRowSmallNoVec() {} - SymmRowSmallNoVec(const Mat&, int) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct SymmColumnSmallNoVec -{ - SymmColumnSmallNoVec() {} - 
SymmColumnSmallNoVec(const Mat&, int, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - -struct FilterNoVec -{ - FilterNoVec() {} - FilterNoVec(const Mat&, int, double) {} - int operator()(const uchar**, uchar*, int) const { return 0; } -}; - - -#if CV_SIMD - -///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// - -struct RowVec_8u32s -{ - RowVec_8u32s() { smallValues = false; } - RowVec_8u32s( const Mat& _kernel ) - { - kernel = _kernel; - smallValues = true; - int k, ksize = kernel.rows + kernel.cols - 1; - for( k = 0; k < ksize; k++ ) - { - int v = kernel.ptr()[k]; - if( v < SHRT_MIN || v > SHRT_MAX ) - { - smallValues = false; - break; - } - } - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; - int* dst = (int*)_dst; - const int* _kx = kernel.ptr(); - width *= cn; - - if( smallValues ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const uchar* src = _src + i; - v_int32 s0 = vx_setzero_s32(); - v_int32 s1 = vx_setzero_s32(); - v_int32 s2 = vx_setzero_s32(); - v_int32 s3 = vx_setzero_s32(); - k = 0; - for (; k <= _ksize - 2; k += 2, src += 2 * cn) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint8 x0, x1; - v_zip(vx_load(src), vx_load(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); - } - if (k < _ksize) - { - v_int32 f = vx_setall_s32(_kx[k]); - v_uint16 x0, x1; - v_expand(vx_load(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), 
v_reinterpret_as_s16(f)); - s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f)); - s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f)); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - const uchar* src = _src + i; - v_int32 s0 = vx_setzero_s32(); - v_int32 s1 = vx_setzero_s32(); - k = 0; - for( ; k <= _ksize - 2; k += 2, src += 2*cn ) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint16 x0, x1; - v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); - } - if( k < _ksize ) - { - v_int32 f = vx_setall_s32(_kx[k]); - v_uint32 x0, x1; - v_expand(vx_load_expand(src), x0, x1); - s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f)); - s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f)); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 d = vx_setzero_s32(); - k = 0; - const uchar* src = _src + i; - for (; k <= _ksize - 2; k += 2, src += 2*cn) - { - v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16)); - v_uint32 x0, x1; - v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1); - d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f)); - } - if (k < _ksize) - d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k]))); - v_store(dst + i, d); - i += v_uint32::nlanes; - } - } - vx_cleanup(); - return i; - } - - Mat kernel; - bool smallValues; -}; - - -struct SymmRowSmallVec_8u32s -{ - SymmRowSmallVec_8u32s() { smallValues = false; symmetryType = 0; } 
- SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) - { - kernel = _kernel; - symmetryType = _symmetryType; - smallValues = true; - int k, ksize = kernel.rows + kernel.cols - 1; - for( k = 0; k < ksize; k++ ) - { - int v = kernel.ptr()[k]; - if( v < SHRT_MIN || v > SHRT_MAX ) - { - smallValues = false; - break; - } - } - } - - int operator()(const uchar* src, uchar* _dst, int width, int cn) const - { - int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; - int* dst = (int*)_dst; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int* kx = kernel.ptr() + _ksize/2; - if( !smallValues ) - return 0; - - src += (_ksize/2)*cn; - width *= cn; - - if( symmetrical ) - { - if( _ksize == 1 ) - return 0; - if( _ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l)); - x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h)); - v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn))); - v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x))); - v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_uint32 x = vx_load_expand_q(src); - x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src 
+ cn); - v_store(dst + i, v_reinterpret_as_s32(x)); - i += v_uint32::nlanes; - } - } - else if( kx[0] == -2 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); - x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x); - v_store(dst + i, x); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)kx[0]); - v_int16 k1 = vx_setall_s16((short)kx[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - - v_int32 dl, dh; - v_int16 x0, x1; - v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh); - v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); 
- v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - - v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh); - v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i + 2*v_int32::nlanes, dl); - v_store(dst + i + 3*v_int32::nlanes, dh); - } - if ( i <= width - v_uint16::nlanes ) - { - v_int32 dl, dh; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh); - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1); - dl += v_dotprod(x0, k1); - dh += v_dotprod(x1, k1); - v_store(dst + i, dl); - v_store(dst + i + v_int32::nlanes, dh); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if ( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1]))); - i += v_uint32::nlanes; - } - } - } - else if( _ksize == 5 ) - { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h; - v_expand(vx_load(src - 2*cn), x0l, x0h); - v_expand(vx_load(src), x1l, x1h); - v_expand(vx_load(src + 2*cn), x2l, x2h); - x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l)); - x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l))); - v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h))); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src); - x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), 
v_add_wrap(x, x)); - v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x))); - v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x))); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src)); - x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x); - v_store(dst + i, x); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_int32 x0, x1, x2, x3; - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; - v_int16 xl, xh; - - v_expand(vx_load(src), x0l, x0h); - v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1); - v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3); - - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x1l, x1h); - v_expand(vx_load(src - 2*cn), x2l, x2h); - v_expand(vx_load(src + 2*cn), x3l, x3h); - v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh); - x0 += v_dotprod(xl, k12); - x1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh); - x2 += v_dotprod(xl, k12); - x3 += v_dotprod(xh, k12); - - v_store(dst + i, x0); - v_store(dst + i + v_int32::nlanes, x1); - v_store(dst + i + 2*v_int32::nlanes, x2); - v_store(dst + i + 3*v_int32::nlanes, x3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 x1, x2; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh); - x1 += v_dotprod(xl, k12); - x2 += v_dotprod(xh, k12); - - v_store(dst + i, x1); - v_store(dst + i + v_int32::nlanes, x2); - 
i += v_uint16::nlanes, src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), - v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2])))); - i += v_uint32::nlanes; - } - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint8 v_src = vx_load(src); - v_int32 s0, s1, s2, s3; - v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); - v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); - for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src - j - cn); - v_uint8 v_src2 = vx_load(src + j); - v_uint8 v_src3 = vx_load(src + j + cn); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k1 = vx_setall_s16((short)(kx[k])); - - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src + j); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh); - 
s2 += v_dotprod(xl, k1); - s3 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 s0, s1; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); - for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - } - if ( k < _ksize / 2 + 1 ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh); - v_int16 k1 = vx_setall_s16((short)(kx[k])); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); - for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn ) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0); - v_store(dst + i, s0); - i += v_uint32::nlanes; - } - } - } - else - { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - { - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x2l, x2h); - v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)); - v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)); - v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - v_store(dst 
+ i + 2*v_int32::nlanes, v_expand_low(dh)); - v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))); - v_store(dst + i, v_expand_low(dl)); - v_store(dst + i + v_int32::nlanes, v_expand_high(dl)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if (i <= width - v_uint32::nlanes) - { - v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn))); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16))); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x2l, x2h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src + cn), x2l, x2h); - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh); - v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh); - v_store(dst + i, v_dotprod(xl, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if (i <= width - v_uint32::nlanes) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1]))); - i += v_uint32::nlanes; - } - } - } - else if( _ksize == 5 ) - { - v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16))); - for( ; i <= width - 
v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h; - v_expand(vx_load(src - cn), x0l, x0h); - v_expand(vx_load(src - 2*cn), x1l, x1h); - v_expand(vx_load(src + cn), x2l, x2h); - v_expand(vx_load(src + 2*cn), x3l, x3h); - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1); - v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1); - v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0)); - v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0)); - } - if( i <= width - v_uint16::nlanes ) - { - v_int16 x0, x1; - v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))), - v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1); - v_store(dst + i, v_dotprod(x0, k0)); - v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0)); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]), - (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2]))); - i += v_uint32::nlanes; - } - } - else - { - v_int16 k0 = vx_setall_s16((short)(kx[0])); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes ) - { - v_uint8 v_src = vx_load(src); - v_int32 s0, s1, s2, s3; - v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1); - v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3); - for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 
16))); - - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src - j - cn); - v_uint8 v_src2 = vx_load(src + j); - v_uint8 v_src3 = vx_load(src + j + cn); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); - v_uint8 v_src0 = vx_load(src - j); - v_uint8 v_src1 = vx_load(src + j); - - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh); - s2 += v_dotprod(xl, k12); - s3 += v_dotprod(xh, k12); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - v_store(dst + i + 2*v_int32::nlanes, s2); - v_store(dst + i + 3*v_int32::nlanes, s3); - } - if( i <= width - v_uint16::nlanes ) - { - v_int32 s0, s1; - v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1); - for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn ) - { - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh); - v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16))); - s0 += v_dotprod(xl, k12); - s1 += v_dotprod(xh, k12); - } - if( k < _ksize / 2 + 1 ) - { - v_int16 k1 = 
v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16))); - v_int16 xl, xh; - v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh); - s0 += v_dotprod(xl, k1); - s1 += v_dotprod(xh, k1); - } - v_store(dst + i, s0); - v_store(dst + i + v_int32::nlanes, s1); - i += v_uint16::nlanes; src += v_uint16::nlanes; - } - if( i <= width - v_uint32::nlanes ) - { - v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]); - for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn) - s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0); - v_store(dst + i, s0); - i += v_uint32::nlanes; - } - } - } - - vx_cleanup(); - return i; - } - - Mat kernel; - int symmetryType; - bool smallValues; -}; - - -struct SymmColumnVec_32s8u -{ - SymmColumnVec_32s8u() { symmetryType=0; delta = 0; } - SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) - { - symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* dst, int width) const - { - int _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - int ksize2 = _ksize/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - v_float32 f0 = vx_setall_f32(ky[0]); - v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const int* S = src[0] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - v_float32 s2 = 
v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4); - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) + vx_load(S1 + 2 * v_int32::nlanes)), f1, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) + vx_load(S1 + 3 * v_int32::nlanes)), f1, s3); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - const int* S = src[0] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4); - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width 
- v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta)); - s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - } - else - { - v_float32 f1 = vx_setall_f32(ky[1]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4); - v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2 * v_int32::nlanes) - vx_load(S1 + 2 * v_int32::nlanes)), f1, d4); - v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3 * v_int32::nlanes) - vx_load(S1 + 3 * v_int32::nlanes)), f1, d4); - for ( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); - s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2); - s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - const int* S0 = src[1] + i; - const int* S1 = src[-1] + i; - v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4); - v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - 
vx_load(S1 + v_int32::nlanes)), f1, d4); - for ( k = 2; k <= ksize2; k++ ) - { - v_float32 f = vx_setall_f32(ky[k]); - S0 = src[k] + i; - S1 = src[-k] + i; - s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0); - s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width - v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta)); - for (k = 2; k <= ksize2; k++) - s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -struct SymmColumnSmallVec_32s16s -{ - SymmColumnSmallVec_32s16s() { symmetryType=0; delta = 0; } - SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) - { - symmetryType = _symmetryType; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const int** src = (const int**)_src; - const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - short* dst = (short*)_dst; - - v_float32 df4 = vx_setall_f32(delta); - int d = cvRound(delta); - v_int16 d8 = vx_setall_s16((short)d); - if( symmetrical ) - { - if( ky[0] == 2 && ky[1] == 1 ) - { - for( 
; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s)); - i += v_int32::nlanes; - } - } - else if( ky[0] == -2 && ky[1] == 1 ) - { - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_int32 s0 = vx_load(S1 + i); - v_int32 s1 = vx_load(S1 + i + v_int32::nlanes); - v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes); - v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes); - v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), - vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), - vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_int32 sl = vx_load(S1 + i); - v_int32 sh = vx_load(S1 + i + v_int32::nlanes); - v_store(dst + i, 
v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_int32 s = vx_load(S1 + i); - v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s)); - i += v_int32::nlanes; - } - } -#if CV_NEON - else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) ) - { - v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]); - v_int32 d4 = vx_setall_s32(d); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), - v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4)))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), - v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4)))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); - i += v_int32::nlanes; - } - } -#endif - else - { - v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, 
v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4))))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), - v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4))))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4)))); - i += v_int32::nlanes; - } - } - } - else - { - if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) - { - if( ky[1] < 0 ) - std::swap(S0, S2); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - 
v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d)); - i += v_int32::nlanes; - } - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4)))); - } - if( i <= width - v_int16::nlanes ) - { - v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), - v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4)))); - i += v_int16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4))); - i += v_int32::nlanes; - } - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////////////// 16s ////////////////////////////////// - -struct RowVec_16s32f -{ - RowVec_16s32f() {} - RowVec_16s32f( const Mat& _kernel ) - { - kernel = _kernel; - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; - float* dst = (float*)_dst; - const float* _kx = kernel.ptr(); - width *= cn; - - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - v_float32 s1 = vx_setzero_f32(); - v_float32 s2 = vx_setzero_f32(); - v_float32 s3 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - { - v_float32 
f = vx_setall_f32(_kx[k]); - v_int16 xl = vx_load(src); - v_int16 xh = vx_load(src + v_int16::nlanes); - s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1); - s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2); - s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - v_int16::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - v_float32 s1 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - { - v_float32 f = vx_setall_f32(_kx[k]); - v_int16 x = vx_load(src); - s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0); - s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - const short* src = (const short*)_src + i; - v_float32 s0 = vx_setzero_f32(); - for( k = 0; k < _ksize; k++, src += cn ) - s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - vx_cleanup(); - return i; - } - - Mat kernel; -}; - - -struct SymmColumnVec_32f16s -{ - SymmColumnVec_32f16s() { symmetryType=0; delta = 0; } - SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - int ksize2 = _ksize / 2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - short* 
dst = (short*)_dst; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - v_float32 k0 = vx_setall_f32(ky[0]); - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + 
i) + vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; - } - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_int16::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), 
k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_int16::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_float32::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////////////// 32f ////////////////////////////////// - -struct RowVec_32f -{ - RowVec_32f() - { - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; -#if defined USE_IPP_SEP_FILTERS - bufsz = -1; -#endif - } - - RowVec_32f( const Mat& _kernel ) - { - kernel = _kernel; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; -#if defined USE_IPP_SEP_FILTERS - bufsz = -1; -#endif - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { -#if defined USE_IPP_SEP_FILTERS - CV_IPP_CHECK() - { - int ret = ippiOperator(_src, _dst, width, cn); - if (ret > 0) - return ret; - } -#endif - int _ksize = kernel.rows + kernel.cols - 1; - CV_DbgAssert(_ksize > 0); - const float* src0 = (const float*)_src; - float* dst = (float*)_dst; - const float* _kx = kernel.ptr(); - - int i = 0, k; - width *= cn; - -#if CV_TRY_AVX2 - if (haveAVX2) - return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); -#endif - v_float32 k0 = vx_setall_f32(_kx[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0; - v_float32 s3 = vx_load(src + 
3*v_float32::nlanes) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - { - v_float32 k1 = vx_setall_f32(_kx[k]); - s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - v_float32 s1 = vx_load(src + v_float32::nlanes) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - { - v_float32 k1 = vx_setall_f32(_kx[k]); - s0 = v_muladd(vx_load(src), k1, s0); - s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - const float* src = src0 + i; - v_float32 s0 = vx_load(src) * k0; - src += cn; - for( k = 1; k < _ksize; k++, src += cn ) - s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - vx_cleanup(); - return i; - } - - Mat kernel; - bool haveAVX2; -#if defined USE_IPP_SEP_FILTERS -private: - mutable int bufsz; - int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const - { - CV_INSTRUMENT_REGION_IPP(); - - int _ksize = kernel.rows + kernel.cols - 1; - if ((1 != cn && 3 != cn) || width < _ksize*8) - return 0; - - const float* src = (const float*)_src; - float* dst = (float*)_dst; - const float* _kx = (const float*)kernel.data; - - IppiSize roisz = { width, 1 }; - if( bufsz < 0 ) - { - if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) || - (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0)) - return 0; - } - 
AutoBuffer buf(bufsz + 64); - uchar* bufptr = alignPtr(buf.data(), 32); - int step = (int)(width*sizeof(dst[0])*cn); - float borderValue[] = {0.f, 0.f, 0.f}; - // here is the trick. IPP needs border type and extrapolates the row. We did it already. - // So we pass anchor=0 and ignore the right tail of results since they are incorrect there. - if( (cn == 1 && CV_INSTRUMENT_FUN_IPP(ippiFilterRowBorderPipeline_32f_C1R, src, step, &dst, roisz, _kx, _ksize, 0, - ippBorderRepl, borderValue[0], bufptr) < 0) || - (cn == 3 && CV_INSTRUMENT_FUN_IPP(ippiFilterRowBorderPipeline_32f_C3R, src, step, &dst, roisz, _kx, _ksize, 0, - ippBorderRepl, borderValue, bufptr) < 0)) - { - setIppErrorStatus(); - return 0; - } - CV_IMPL_ADD(CV_IMPL_IPP); - return width - _ksize + 1; - } -#endif -}; - - -struct SymmRowSmallVec_32f -{ - SymmRowSmallVec_32f() { symmetryType = 0; } - SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) - { - kernel = _kernel; - symmetryType = _symmetryType; - } - - int operator()(const uchar* _src, uchar* _dst, int width, int cn) const - { - int i = 0, _ksize = kernel.rows + kernel.cols - 1; - if( _ksize == 1 ) - return 0; - float* dst = (float*)_dst; - const float* src = (const float*)_src + (_ksize/2)*cn; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float* kx = kernel.ptr() + _ksize/2; - width *= cn; - - if( symmetrical ) - { - if( _ksize == 3 ) - { - if( fabs(kx[0]) == 2 && kx[1] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(kx[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn))); -#else - if( kx[0] > 0 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x)); - } - else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += 
v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)); - } - } - else if( _ksize == 5 ) - { - if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(-2); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn))); -#else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - { - v_float32 x = vx_load(src); - v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1))); - } - } - } - else - { - if( _ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, vx_load(src + cn) - vx_load(src - cn)); - else - { - v_float32 k1 = vx_setall_f32(kx[1]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1); - } - } - else if( _ksize == 5 ) - { - v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src 
- 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); - } - } - - vx_cleanup(); - return i; - } - - Mat kernel; - int symmetryType; -}; - - -struct SymmColumnVec_32f -{ - SymmColumnVec_32f() { - symmetryType=0; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; - delta = 0; - } - SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - float* dst = (float*)_dst; - - if( symmetrical ) - { - -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); -#endif - const v_float32 d4 = vx_setall_f32(delta); - const v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) - { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + 
v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4); - for( k = 1; k <= ksize2; k++ ) - { - v_float32 k1 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4); - for( k = 1; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - } - else - { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif - CV_DbgAssert(ksize2 > 0); - const v_float32 d4 = vx_setall_f32(delta); - const v_float32 k1 = vx_setall_f32(ky[1]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4); - v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - s2 = v_muladd(vx_load(src[k] + i + 
2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4); - for( k = 2; k <= ksize2; k++ ) - { - v_float32 k2 = vx_setall_f32(ky[k]); - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4); - for( k = 2; k <= ksize2; k++ ) - s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; - bool haveAVX2; -}; - - -struct SymmColumnSmallVec_32f -{ - SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; } - SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) - { - symmetryType = _symmetryType; - kernel = _kernel; - delta = (float)_delta; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - int ksize2 = (kernel.rows + kernel.cols - 1)/2; - const float* ky = kernel.ptr() + ksize2; - int i = 0; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - const float** src = (const float**)_src; - 
const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; - float* dst = (float*)_dst; - - v_float32 d4 = vx_setall_f32(delta); - if( symmetrical ) - { - if( fabs(ky[0]) == 2 && ky[1] == 1 ) - { -#if CV_FMA3 || CV_AVX2 - v_float32 k0 = vx_setall_f32(ky[0]); - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4)); -#else - if(ky[0] > 0) - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - { - v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x)); - } - else - for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - { - v_float32 x = vx_load(S1 + i); - v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x)); - } -#endif - } - else - { - v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4))); - } - } - else - { - if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) - { - if( ky[1] < 0 ) - std::swap(S0, S2); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4); - } - else - { - v_float32 k1 = vx_setall_f32(ky[1]); - for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes ) - v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); - } - } - - vx_cleanup(); - return i; - } - - int symmetryType; - float delta; - Mat kernel; -}; - - -/////////////////////////////// non-separable filters /////////////////////////////// - -///////////////////////////////// 8u<->8u, 8u<->16s ///////////////////////////////// - -struct FilterVec_8u -{ - FilterVec_8u() { delta = 0; _nz = 0; } - FilterVec_8u(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << 
_bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* dst, int width) const - { - CV_DbgAssert(_nz > 0); - const float* kf = (const float*)&coeffs[0]; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - v_uint16 xl, xh; - v_expand(vx_load(src[0] + i), xl, xh); - v_uint32 x0, x1, x2, x3; - v_expand(xl, x0, x1); - v_expand(xh, x2, x3); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load(src[k] + i), xl, xh); - v_expand(xl, x0, x1); - v_expand(xh, x2, x3); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); - s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2); - s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3); - } - v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3)))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint32 x0, x1; - v_expand(vx_load_expand(src[0] + i), x0, x1); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load_expand(src[k] + i), x0, x1); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1); - } - v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } -#if CV_SIMD_WIDTH > 16 - while( i <= width - 
v_int32x4::nlanes ) -#else - if( i <= width - v_int32x4::nlanes ) -#endif - { - v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta)); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0); - v_int32x4 s32 = v_round(s0); - v_int16x8 s16 = v_pack(s32, s32); - *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); - i += v_int32x4::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -struct FilterVec_8u16s -{ - FilterVec_8u16s() { delta = 0; _nz = 0; } - FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) - { - Mat kernel; - _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); - delta = (float)(_delta/(1 << _bits)); - std::vector coords; - preprocess2DKernel(kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** src, uchar* _dst, int width) const - { - CV_DbgAssert(_nz > 0); - const float* kf = (const float*)&coeffs[0]; - short* dst = (short*)_dst; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes ) - { - v_uint16 xl, xh; - v_expand(vx_load(src[0] + i), xl, xh); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4); - v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4); - v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - v_expand(vx_load(src[k] + i), xl, xh); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1); - s2 = 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2); - s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3))); - } - if( i <= width - v_uint16::nlanes ) - { - v_uint16 x = vx_load_expand(src[0] + i); - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4); - v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f = vx_setall_f32(kf[k]); - x = vx_load_expand(src[k] + i); - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0); - s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1); - } - v_store(dst + i, v_pack(v_round(s0), v_round(s1))); - i += v_uint16::nlanes; - } - if( i <= width - v_int32::nlanes ) - { - v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0); - v_pack_store(dst + i, v_round(s0)); - i += v_int32::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - - -struct FilterVec_32f -{ - FilterVec_32f() { delta = 0; _nz = 0; } - FilterVec_32f(const Mat& _kernel, int, double _delta) - { - delta = (float)_delta; - std::vector coords; - preprocess2DKernel(_kernel, coords, coeffs); - _nz = (int)coords.size(); - } - - int operator()(const uchar** _src, uchar* _dst, int width) const - { - const float* kf = (const float*)&coeffs[0]; - const float** src = (const float**)_src; - float* dst = (float*)_dst; - int i = 0, k, nz = _nz; - - v_float32 d4 = vx_setall_f32(delta); - v_float32 f0 = vx_setall_f32(kf[0]); - for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 
= v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4); - v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f1 = vx_setall_f32(kf[k]); - s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2); - s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - v_store(dst + i + 2*v_float32::nlanes, s2); - v_store(dst + i + 3*v_float32::nlanes, s3); - } - if( i <= width - 2*v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4); - for( k = 1; k < nz; k++ ) - { - v_float32 f1 = vx_setall_f32(kf[k]); - s0 = v_muladd(vx_load(src[k] + i), f1, s0); - s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1); - } - v_store(dst + i, s0); - v_store(dst + i + v_float32::nlanes, s1); - i += 2*v_float32::nlanes; - } - if( i <= width - v_float32::nlanes ) - { - v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4); - for( k = 1; k < nz; k++ ) - s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0); - v_store(dst + i, s0); - i += v_float32::nlanes; - } - - vx_cleanup(); - return i; - } - - int _nz; - std::vector coeffs; - float delta; -}; - -#else - -typedef RowNoVec RowVec_8u32s; -typedef RowNoVec RowVec_16s32f; -typedef RowNoVec RowVec_32f; -typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s; -typedef SymmRowSmallNoVec SymmRowSmallVec_32f; -typedef ColumnNoVec SymmColumnVec_32s8u; -typedef ColumnNoVec SymmColumnVec_32f16s; -typedef ColumnNoVec SymmColumnVec_32f; -typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s; -typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f; -typedef FilterNoVec FilterVec_8u; -typedef FilterNoVec 
FilterVec_8u16s; -typedef FilterNoVec FilterVec_32f; - -#endif - - -template struct RowFilter : public BaseRowFilter -{ - RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() ) - { - if( _kernel.isContinuous() ) - kernel = _kernel; - else - _kernel.copyTo(kernel); - anchor = _anchor; - ksize = kernel.rows + kernel.cols - 1; - CV_Assert( kernel.type() == DataType
::type && - (kernel.rows == 1 || kernel.cols == 1)); - vecOp = _vecOp; - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int _ksize = ksize; - const DT* kx = kernel.ptr
(); - const ST* S; - DT* D = (DT*)dst; - int i, k; - - i = vecOp(src, dst, width, cn); - width *= cn; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - S = (const ST*)src + i; - DT f = kx[0]; - DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3]; - - for( k = 1; k < _ksize; k++ ) - { - S += cn; - f = kx[k]; - s0 += f*S[0]; s1 += f*S[1]; - s2 += f*S[2]; s3 += f*S[3]; - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - S = (const ST*)src + i; - DT s0 = kx[0]*S[0]; - for( k = 1; k < _ksize; k++ ) - { - S += cn; - s0 += kx[k]*S[0]; - } - D[i] = s0; - } - } - - Mat kernel; - VecOp vecOp; -}; - - -template struct SymmRowSmallFilter : - public RowFilter -{ - SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType, - const VecOp& _vecOp = VecOp()) - : RowFilter( _kernel, _anchor, _vecOp ) - { - symmetryType = _symmetryType; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 ); - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int ksize2 = this->ksize/2, ksize2n = ksize2*cn; - const DT* kx = this->kernel.template ptr
() + ksize2; - bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; - DT* D = (DT*)dst; - int i = this->vecOp(src, dst, width, cn), j, k; - const ST* S = (const ST*)src + i + ksize2n; - width *= cn; - - if( symmetrical ) - { - if( this->ksize == 1 && kx[0] == 1 ) - { - for( ; i <= width - 2; i += 2 ) - { - DT s0 = S[i], s1 = S[i+1]; - D[i] = s0; D[i+1] = s1; - } - S += i; - } - else if( this->ksize == 3 ) - { - if( kx[0] == 2 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn]; - D[i] = s0; D[i+1] = s1; - } - else if( kx[0] == -2 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn]; - D[i] = s0; D[i+1] = s1; - } - else - { - DT k0 = kx[0], k1 = kx[1]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1; - D[i] = s0; D[i+1] = s1; - } - } - } - else if( this->ksize == 5 ) - { - DT k0 = kx[0], k1 = kx[1], k2 = kx[2]; - if( k0 == -2 && k1 == 0 && k2 == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = -2*S[0] + S[-cn*2] + S[cn*2]; - DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2]; - D[i] = s0; D[i+1] = s1; - } - else - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2; - DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2; - D[i] = s0; D[i+1] = s1; - } - } - - for( ; i < width; i++, S++ ) - { - DT s0 = kx[0]*S[0]; - for( k = 1, j = cn; k <= ksize2; k++, j += cn ) - s0 += kx[k]*(S[j] + S[-j]); - D[i] = s0; - } - } - else - { - if( this->ksize == 3 ) - { - if( kx[0] == 0 && kx[1] == 1 ) - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn]; - D[i] = s0; D[i+1] = s1; - } - else - { - DT k1 = kx[1]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1; - D[i] = s0; 
D[i+1] = s1; - } - } - } - else if( this->ksize == 5 ) - { - DT k1 = kx[1], k2 = kx[2]; - for( ; i <= width - 2; i += 2, S += 2 ) - { - DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2; - DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2; - D[i] = s0; D[i+1] = s1; - } - } - - for( ; i < width; i++, S++ ) - { - DT s0 = kx[0]*S[0]; - for( k = 1, j = cn; k <= ksize2; k++, j += cn ) - s0 += kx[k]*(S[j] - S[-j]); - D[i] = s0; - } - } - } - - int symmetryType; -}; - - -template struct ColumnFilter : public BaseColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - ColumnFilter( const Mat& _kernel, int _anchor, - double _delta, const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp() ) - { - if( _kernel.isContinuous() ) - kernel = _kernel; - else - _kernel.copyTo(kernel); - anchor = _anchor; - ksize = kernel.rows + kernel.cols - 1; - delta = saturate_cast(_delta); - castOp0 = _castOp; - vecOp = _vecOp; - CV_Assert( kernel.type() == DataType::type && - (kernel.rows == 1 || kernel.cols == 1)); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const ST* ky = kernel.template ptr(); - ST _delta = delta; - int _ksize = ksize; - int i, k; - CastOp castOp = castOp0; - - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = vecOp(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST* S = (const ST*)src[0] + i; - ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, - s2 = f*S[2] + _delta, s3 = f*S[3] + _delta; - - for( k = 1; k < _ksize; k++ ) - { - S = (const ST*)src[k] + i; f = ky[k]; - s0 += f*S[0]; s1 += f*S[1]; - s2 += f*S[2]; s3 += f*S[3]; - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; - for( k = 1; k < _ksize; k++ ) - s0 += ky[k]*((const ST*)src[k])[i]; - D[i] = 
castOp(s0); - } - } - } - - Mat kernel; - CastOp castOp0; - VecOp vecOp; - ST delta; -}; - - -template struct SymmColumnFilter : public ColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - SymmColumnFilter( const Mat& _kernel, int _anchor, - double _delta, int _symmetryType, - const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp()) - : ColumnFilter( _kernel, _anchor, _delta, _castOp, _vecOp ) - { - symmetryType = _symmetryType; - CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int ksize2 = this->ksize/2; - const ST* ky = this->kernel.template ptr() + ksize2; - int i, k; - bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; - ST _delta = this->delta; - CastOp castOp = this->castOp0; - src += ksize2; - - if( symmetrical ) - { - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = (this->vecOp)(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST* S = (const ST*)src[0] + i, *S2; - ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, - s2 = f*S[2] + _delta, s3 = f*S[3] + _delta; - - for( k = 1; k <= ksize2; k++ ) - { - S = (const ST*)src[k] + i; - S2 = (const ST*)src[-k] + i; - f = ky[k]; - s0 += f*(S[0] + S2[0]); - s1 += f*(S[1] + S2[1]); - s2 += f*(S[2] + S2[2]); - s3 += f*(S[3] + S2[3]); - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; - for( k = 1; k <= ksize2; k++ ) - s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]); - D[i] = castOp(s0); - } - } - } - else - { - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = this->vecOp(src, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST f = ky[0]; - const ST *S, *S2; - 
ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta; - - for( k = 1; k <= ksize2; k++ ) - { - S = (const ST*)src[k] + i; - S2 = (const ST*)src[-k] + i; - f = ky[k]; - s0 += f*(S[0] - S2[0]); - s1 += f*(S[1] - S2[1]); - s2 += f*(S[2] - S2[2]); - s3 += f*(S[3] - S2[3]); - } - - D[i] = castOp(s0); D[i+1] = castOp(s1); - D[i+2] = castOp(s2); D[i+3] = castOp(s3); - } - #endif - for( ; i < width; i++ ) - { - ST s0 = _delta; - for( k = 1; k <= ksize2; k++ ) - s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]); - D[i] = castOp(s0); - } - } - } - } - - int symmetryType; -}; - - -template -struct SymmColumnSmallFilter : public SymmColumnFilter -{ - typedef typename CastOp::type1 ST; - typedef typename CastOp::rtype DT; - - SymmColumnSmallFilter( const Mat& _kernel, int _anchor, - double _delta, int _symmetryType, - const CastOp& _castOp=CastOp(), - const VecOp& _vecOp=VecOp()) - : SymmColumnFilter( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp ) - { - CV_Assert( this->ksize == 3 ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int ksize2 = this->ksize/2; - const ST* ky = this->kernel.template ptr() + ksize2; - int i; - bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; - bool is_1_2_1 = ky[0] == 2 && ky[1] == 1; - bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1; - bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1); - ST f0 = ky[0], f1 = ky[1]; - ST _delta = this->delta; - CastOp castOp = this->castOp0; - src += ksize2; - - for( ; count--; dst += dststep, src++ ) - { - DT* D = (DT*)dst; - i = (this->vecOp)(src, dst, width); - const ST* S0 = (const ST*)src[-1]; - const ST* S1 = (const ST*)src[0]; - const ST* S2 = (const ST*)src[1]; - - if( symmetrical ) - { - if( is_1_2_1 ) - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; - ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = 
castOp(s1); - - s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta; - s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; - D[i] = castOp(s0); - } - } - else if( is_1_m2_1 ) - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; - ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta; - s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; - D[i] = castOp(s0); - } - } - else - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; - ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta; - s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; - D[i] = castOp(s0); - } - } - } - else - { - if( is_m1_0_1 ) - { - if( f1 < 0 ) - std::swap(S0, S2); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = S2[i] - S0[i] + _delta; - ST s1 = S2[i+1] - S0[i+1] + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = S2[i+2] - S0[i+2] + _delta; - s1 = S2[i+3] - S0[i+3] + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i ++ ) - { - ST s0 = S2[i] - S0[i] + _delta; - D[i] = castOp(s0); - } - if( f1 < 0 ) - std::swap(S0, S2); - } - else - { - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - ST s0 = (S2[i] - S0[i])*f1 + _delta; - ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta; - D[i] = castOp(s0); - D[i+1] = castOp(s1); - - s0 = (S2[i+2] - 
S0[i+2])*f1 + _delta; - s1 = (S2[i+3] - S0[i+3])*f1 + _delta; - D[i+2] = castOp(s0); - D[i+3] = castOp(s1); - } - #endif - for( ; i < width; i++ ) - D[i] = castOp((S2[i] - S0[i])*f1 + _delta); - } - } - } - } -}; - -template struct Cast -{ - typedef ST type1; - typedef DT rtype; - - DT operator()(ST val) const { return saturate_cast
(val); } -}; - -template struct FixedPtCast -{ - typedef ST type1; - typedef DT rtype; - enum { SHIFT = bits, DELTA = 1 << (bits-1) }; - - DT operator()(ST val) const { return saturate_cast
((val + DELTA)>>SHIFT); } -}; - -template struct FixedPtCastEx -{ - typedef ST type1; - typedef DT rtype; - - FixedPtCastEx() : SHIFT(0), DELTA(0) {} - FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {} - DT operator()(ST val) const { return saturate_cast
((val + DELTA)>>SHIFT); } - int SHIFT, DELTA; -}; - -} - -cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, - InputArray _kernel, int anchor, - int symmetryType ) -{ - Mat kernel = _kernel.getMat(); - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); - int cn = CV_MAT_CN(srcType); - CV_Assert( cn == CV_MAT_CN(bufType) && - ddepth >= std::max(sdepth, CV_32S) && - kernel.type() == ddepth ); - int ksize = kernel.rows + kernel.cols - 1; - - if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 ) - { - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr > - (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)); - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)); - } - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr > - (kernel, anchor, RowVec_8u32s(kernel)); - if( sdepth == CV_8U && ddepth == CV_32F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16U && ddepth == CV_32F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_16S && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, RowVec_16s32f(kernel)); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr > - (kernel, anchor, RowVec_32f(kernel)); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(kernel, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, bufType)); + Mat kernelMat = _kernel.getMat(); + CV_CPU_DISPATCH(getLinearRowFilter, (srcType, bufType, kernelMat, anchor, symmetryType), + 
CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getLinearColumnFilter( int bufType, int dstType, - InputArray _kernel, int anchor, - int symmetryType, double delta, - int bits ) +Ptr getLinearColumnFilter( + int bufType, int dstType, + InputArray kernel, int anchor, + int symmetryType, double delta, + int bits) { - Mat kernel = _kernel.getMat(); - int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); - int cn = CV_MAT_CN(dstType); - CV_Assert( cn == CV_MAT_CN(bufType) && - sdepth >= std::max(ddepth, CV_32S) && - kernel.type() == sdepth ); + CV_INSTRUMENT_REGION(); - if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) ) - { - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, FixedPtCastEx(bits)); - if( ddepth == CV_8U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16S && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr, ColumnNoVec> >(kernel, anchor, delta); - } - else - { - int ksize = kernel.rows + kernel.cols - 1; - if( ksize == 3 ) - { - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, SymmColumnVec_32s8u> > - (kernel, anchor, delta, symmetryType, FixedPtCastEx(bits), - SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); - if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 ) - return makePtr, - SymmColumnSmallVec_32s16s> >(kernel, 
anchor, delta, symmetryType, - Cast(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr,SymmColumnSmallVec_32f> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)); - } - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr, SymmColumnVec_32s8u> > - (kernel, anchor, delta, symmetryType, FixedPtCastEx(bits), - SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); - if( ddepth == CV_8U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16U && sdepth == CV_32F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_16S && sdepth == CV_32F ) - return makePtr, SymmColumnVec_32f16s> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - if( ddepth == CV_32F && sdepth == CV_32F ) - return makePtr, SymmColumnVec_32f> > - (kernel, anchor, delta, symmetryType, Cast(), - SymmColumnVec_32f(kernel, symmetryType, 0, delta)); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr, ColumnNoVec> > - (kernel, anchor, delta, symmetryType); - } - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of buffer format (=%d), and destination format (=%d)", - bufType, dstType)); + Mat kernelMat = kernel.getMat(); + CV_CPU_DISPATCH(getLinearColumnFilter, (bufType, dstType, kernelMat, anchor, symmetryType, delta, 
bits), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createSeparableLinearFilter( - int _srcType, int _dstType, - InputArray __rowKernel, InputArray __columnKernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createSeparableLinearFilter( + int _srcType, int _dstType, + InputArray __rowKernel, InputArray __columnKernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); _srcType = CV_MAT_TYPE(_srcType); @@ -3124,9 +345,6 @@ cv::Ptr cv::createSeparableLinearFilter( * Non-separable linear filter * \****************************************************************************************/ -namespace cv -{ - void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) { int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); @@ -3729,89 +947,25 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, #endif -} - -cv::Ptr cv::getLinearFilter(int srcType, int dstType, - InputArray filter_kernel, Point anchor, - double delta, int bits) +Ptr getLinearFilter( + int srcType, int dstType, + InputArray filter_kernel, Point anchor, + double delta, int bits) { - Mat _kernel = filter_kernel.getMat(); - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); - int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth(); - CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); + CV_INSTRUMENT_REGION(); - anchor = normalizeAnchor(anchor, _kernel.size()); - - /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S ) - return makePtr, FilterVec_8u> > - (_kernel, anchor, delta, FixedPtCastEx(bits), - FilterVec_8u(_kernel, bits, delta)); - if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S ) - return makePtr, FilterVec_8u16s> > - (_kernel, anchor, delta, FixedPtCastEx(bits), - FilterVec_8u16s(_kernel, bits, 
delta));*/ - - kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F; - Mat kernel; - if( _kernel.type() == kdepth ) - kernel = _kernel; - else - _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.); - - if( sdepth == CV_8U && ddepth == CV_8U ) - return makePtr, FilterVec_8u> > - (kernel, anchor, delta, Cast(), FilterVec_8u(kernel, 0, delta)); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_8U && ddepth == CV_16S ) - return makePtr, FilterVec_8u16s> > - (kernel, anchor, delta, Cast(), FilterVec_8u16s(kernel, 0, delta)); - if( sdepth == CV_8U && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_16U && ddepth == CV_16U ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16U && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_16S && ddepth == CV_16S ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16S && ddepth == CV_32F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - if( sdepth == CV_32F && ddepth == CV_32F ) - return makePtr, FilterVec_32f> > - (kernel, anchor, delta, Cast(), FilterVec_32f(kernel, 0, delta)); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr, FilterNoVec> >(kernel, anchor, delta); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and destination format (=%d)", - srcType, dstType)); + Mat kernelMat = filter_kernel.getMat(); + CV_CPU_DISPATCH(getLinearFilter, (srcType, dstType, kernelMat, anchor, delta, bits), + 
CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, - InputArray filter_kernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createLinearFilter( + int _srcType, int _dstType, + InputArray filter_kernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat _kernel = filter_kernel.getMat(); _srcType = CV_MAT_TYPE(_srcType); @@ -3844,8 +998,6 @@ cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, // HAL interface //================================================================ -using namespace cv; - static bool replacementFilter2D(int stype, int dtype, int kernel_type, uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, @@ -4083,7 +1235,6 @@ static void ocvSepFilter(int stype, int dtype, int ktype, // HAL functions //=================================================================== -namespace cv { namespace hal { @@ -4191,16 +1342,15 @@ void sepFilter2D(int stype, int dtype, int ktype, anchor_x, anchor_y, delta, borderType); } -} // cv::hal:: -} // cv:: +} // namespace cv::hal:: //================================================================ // Main interface //================================================================ -void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor0, - double delta, int borderType ) +void filter2D(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernel, Point anchor0, + double delta, int borderType) { CV_INSTRUMENT_REGION(); @@ -4229,9 +1379,9 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, delta, borderType, src.isSubmatrix()); } -void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) +void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, + 
InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType) { CV_INSTRUMENT_REGION(); @@ -4266,6 +1416,7 @@ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); } +} // namespace CV_IMPL void cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 93f3f177e6..198c8c336c 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -56,6 +56,8 @@ namespace cv InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ); #endif + + void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); } #endif diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 43200218dc..48675152fa 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -41,160 +41,85 @@ //M*/ #include "precomp.hpp" -#include "opencv2/core/opencl/ocl_defs.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" +#if defined(CV_CPU_BASELINE_MODE) +#if IPP_VERSION_X100 >= 710 +#define USE_IPP_SEP_FILTERS 1 +#else +#undef USE_IPP_SEP_FILTERS +#endif +#endif + /****************************************************************************************\ Base Image Filter \****************************************************************************************/ -#if IPP_VERSION_X100 >= 710 -#define USE_IPP_SEP_FILTERS 1 -#else -#undef USE_IPP_SEP_FILTERS -#endif +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs); +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep); +void 
FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs); -namespace cv -{ +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType); -BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } -BaseRowFilter::~BaseRowFilter() {} +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits); -BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; } -BaseColumnFilter::~BaseColumnFilter() {} -void BaseColumnFilter::reset() {} - -BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); } -BaseFilter::~BaseFilter() {} -void BaseFilter::reset() {} - -FilterEngine::FilterEngine() - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ -} +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& filter_kernel, Point anchor, + double delta, int bits); -FilterEngine::FilterEngine( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ - init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType, - _rowBorderType, _columnBorderType, _borderValue); -} - -FilterEngine::~FilterEngine() -{ -} - - -void FilterEngine::init( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& 
_borderValue ) -{ - _srcType = CV_MAT_TYPE(_srcType); - _bufType = CV_MAT_TYPE(_bufType); - _dstType = CV_MAT_TYPE(_dstType); - - srcType = _srcType; - int srcElemSize = (int)getElemSize(srcType); - dstType = _dstType; - bufType = _bufType; - - filter2D = _filter2D; - rowFilter = _rowFilter; - columnFilter = _columnFilter; - - if( _columnBorderType < 0 ) - _columnBorderType = _rowBorderType; - - rowBorderType = _rowBorderType; - columnBorderType = _columnBorderType; - - CV_Assert( columnBorderType != BORDER_WRAP ); - - if( isSeparable() ) - { - CV_Assert( rowFilter && columnFilter ); - ksize = Size(rowFilter->ksize, columnFilter->ksize); - anchor = Point(rowFilter->anchor, columnFilter->anchor); - } - else - { - CV_Assert( bufType == srcType ); - ksize = filter2D->ksize; - anchor = filter2D->anchor; - } - - CV_Assert( 0 <= anchor.x && anchor.x < ksize.width && - 0 <= anchor.y && anchor.y < ksize.height ); - - borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1); - int borderLength = std::max(ksize.width - 1, 1); - borderTab.resize(borderLength*borderElemSize); - - maxWidth = bufStep = 0; - constBorderRow.clear(); - - if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) - { - constBorderValue.resize(srcElemSize*borderLength); - int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4)); - scalarToRawData(_borderValue, &constBorderValue[0], srcType1, - borderLength*CV_MAT_CN(srcType)); - } - - wholeSize = Size(-1,-1); -} +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #define VEC_ALIGN CV_MALLOC_ALIGN -int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs) +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs) { + CV_INSTRUMENT_REGION(); + int i, j; - wholeSize = _wholeSize; - roi = Rect(ofs, sz); - CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && - roi.x + roi.width <= wholeSize.width && - roi.y 
+ roi.height <= wholeSize.height ); + this_.wholeSize = _wholeSize; + this_.roi = Rect(ofs, sz); + CV_Assert( this_.roi.x >= 0 && this_.roi.y >= 0 && this_.roi.width >= 0 && this_.roi.height >= 0 && + this_.roi.x + this_.roi.width <= this_.wholeSize.width && + this_.roi.y + this_.roi.height <= this_.wholeSize.height ); - int esz = (int)getElemSize(srcType); - int bufElemSize = (int)getElemSize(bufType); - const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0; + int esz = (int)getElemSize(this_.srcType); + int bufElemSize = (int)getElemSize(this_.bufType); + const uchar* constVal = !this_.constBorderValue.empty() ? &this_.constBorderValue[0] : 0; - int _maxBufRows = std::max(ksize.height + 3, - std::max(anchor.y, - ksize.height-anchor.y-1)*2+1); + int _maxBufRows = std::max(this_.ksize.height + 3, + std::max(this_.anchor.y, + this_.ksize.height-this_.anchor.y-1)*2+1); - if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) + if (this_.maxWidth < this_.roi.width || _maxBufRows != (int)this_.rows.size() ) { - rows.resize(_maxBufRows); - maxWidth = std::max(maxWidth, roi.width); - int cn = CV_MAT_CN(srcType); - srcRow.resize(esz*(maxWidth + ksize.width - 1)); - if( columnBorderType == BORDER_CONSTANT ) + this_.rows.resize(_maxBufRows); + this_.maxWidth = std::max(this_.maxWidth, this_.roi.width); + int cn = CV_MAT_CN(this_.srcType); + this_.srcRow.resize(esz*(this_.maxWidth + this_.ksize.width - 1)); + if (this_.columnBorderType == BORDER_CONSTANT) { CV_Assert(constVal != NULL); - constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); - uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; - int n = (int)constBorderValue.size(), N; - N = (maxWidth + ksize.width - 1)*esz; - tdst = isSeparable() ? 
&srcRow[0] : dst; + this_.constBorderRow.resize(getElemSize(this_.bufType)*(this_.maxWidth + this_.ksize.width - 1 + VEC_ALIGN)); + uchar *dst = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); + int n = (int)this_.constBorderValue.size(); + int N = (this_.maxWidth + this_.ksize.width - 1)*esz; + uchar *tdst = this_.isSeparable() ? &this_.srcRow[0] : dst; for( i = 0; i < N; i += n ) { @@ -203,126 +128,113 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs tdst[i+j] = constVal[j]; } - if( isSeparable() ) - (*rowFilter)(&srcRow[0], dst, maxWidth, cn); + if (this_.isSeparable()) + (*this_.rowFilter)(&this_.srcRow[0], dst, this_.maxWidth, cn); } - int maxBufStep = bufElemSize*(int)alignSize(maxWidth + - (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); + int maxBufStep = bufElemSize*(int)alignSize(this_.maxWidth + + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); + this_.ringBuf.resize(maxBufStep*this_.rows.size()+VEC_ALIGN); } // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); + this_.bufStep = bufElemSize*(int)alignSize(this_.roi.width + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); - dx1 = std::max(anchor.x - roi.x, 0); - dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); + this_.dx1 = std::max(this_.anchor.x - this_.roi.x, 0); + this_.dx2 = std::max(this_.ksize.width - this_.anchor.x - 1 + this_.roi.x + this_.roi.width - this_.wholeSize.width, 0); // recompute border tables - if( dx1 > 0 || dx2 > 0 ) + if (this_.dx1 > 0 || this_.dx2 > 0) { - if( rowBorderType == BORDER_CONSTANT ) + if (this_.rowBorderType == BORDER_CONSTANT ) { CV_Assert(constVal != NULL); - int nr = isSeparable() ? 1 : (int)rows.size(); + int nr = this_.isSeparable() ? 
1 : (int)this_.rows.size(); for( i = 0; i < nr; i++ ) { - uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; - memcpy( dst, constVal, dx1*esz ); - memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); + uchar* dst = this_.isSeparable() ? &this_.srcRow[0] : alignPtr(&this_.ringBuf[0], VEC_ALIGN) + this_.bufStep*i; + memcpy(dst, constVal, this_.dx1*esz); + memcpy(dst + (this_.roi.width + this_.ksize.width - 1 - this_.dx2)*esz, constVal, this_.dx2*esz); } } else { - int xofs1 = std::min(roi.x, anchor.x) - roi.x; + int xofs1 = std::min(this_.roi.x, this_.anchor.x) - this_.roi.x; - int btab_esz = borderElemSize, wholeWidth = wholeSize.width; - int* btab = (int*)&borderTab[0]; + int btab_esz = this_.borderElemSize, wholeWidth = this_.wholeSize.width; + int* btab = (int*)&this_.borderTab[0]; - for( i = 0; i < dx1; i++ ) + for( i = 0; i < this_.dx1; i++ ) { - int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(i-this_.dx1, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) btab[i*btab_esz + j] = p0 + j; } - for( i = 0; i < dx2; i++ ) + for( i = 0; i < this_.dx2; i++ ) { - int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) - btab[(i + dx1)*btab_esz + j] = p0 + j; + btab[(i + this_.dx1)*btab_esz + j] = p0 + j; } } } - rowCount = dstY = 0; - startY = startY0 = std::max(roi.y - anchor.y, 0); - endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); - if( columnFilter ) - columnFilter->reset(); - if( filter2D ) - filter2D->reset(); + this_.rowCount = this_.dstY = 0; + this_.startY = this_.startY0 = std::max(this_.roi.y - this_.anchor.y, 0); + this_.endY = std::min(this_.roi.y + this_.roi.height + this_.ksize.height - this_.anchor.y - 1, 
this_.wholeSize.height); - return startY; + if (this_.columnFilter) + this_.columnFilter->reset(); + if (this_.filter2D) + this_.filter2D->reset(); + + return this_.startY; } -int FilterEngine::start(const Mat& src, const Size &wsz, const Point &ofs) +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { - start( wsz, src.size(), ofs); - return startY - ofs.y; -} + CV_INSTRUMENT_REGION(); -int FilterEngine::remainingInputRows() const -{ - return endY - startY - rowCount; -} + CV_DbgAssert(this_.wholeSize.width > 0 && this_.wholeSize.height > 0 ); -int FilterEngine::remainingOutputRows() const -{ - return roi.height - dstY; -} - -int FilterEngine::proceed( const uchar* src, int srcstep, int count, - uchar* dst, int dststep ) -{ - CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - - const int *btab = &borderTab[0]; - int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; - uchar** brows = &rows[0]; - int bufRows = (int)rows.size(); - int cn = CV_MAT_CN(bufType); - int width = roi.width, kwidth = ksize.width; - int kheight = ksize.height, ay = anchor.y; - int _dx1 = dx1, _dx2 = dx2; - int width1 = roi.width + kwidth - 1; - int xofs1 = std::min(roi.x, anchor.x); - bool isSep = isSeparable(); - bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; + const int *btab = &this_.borderTab[0]; + int esz = (int)getElemSize(this_.srcType), btab_esz = this_.borderElemSize; + uchar** brows = &this_.rows[0]; + int bufRows = (int)this_.rows.size(); + int cn = CV_MAT_CN(this_.bufType); + int width = this_.roi.width, kwidth = this_.ksize.width; + int kheight = this_.ksize.height, ay = this_.anchor.y; + int _dx1 = this_.dx1, _dx2 = this_.dx2; + int width1 = this_.roi.width + kwidth - 1; + int xofs1 = std::min(this_.roi.x, this_.anchor.x); + bool isSep = this_.isSeparable(); + bool makeBorder = (_dx1 > 0 || _dx2 > 0) && this_.rowBorderType != BORDER_CONSTANT; int dy = 0, i = 0; src 
-= xofs1*esz; - count = std::min(count, remainingInputRows()); + count = std::min(count, this_.remainingInputRows()); - CV_Assert( src && dst && count > 0 ); + CV_Assert(src && dst && count > 0); for(;; dst += dststep*i, dy += i) { - int dcount = bufRows - ay - startY - rowCount + roi.y; + int dcount = bufRows - ay - this_.startY - this_.rowCount + this_.roi.y; dcount = dcount > 0 ? dcount : bufRows - kheight + 1; dcount = std::min(dcount, count); count -= dcount; for( ; dcount-- > 0; src += srcstep ) { - int bi = (startY - startY0 + rowCount) % bufRows; - uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - uchar* row = isSep ? &srcRow[0] : brow; + int bi = (this_.startY - this_.startY0 + this_.rowCount) % bufRows; + uchar* brow = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; + uchar* row = isSep ? &this_.srcRow[0] : brow; - if( ++rowCount > bufRows ) + if (++this_.rowCount > bufRows) { - --rowCount; - ++startY; + --this_.rowCount; + ++this_.startY; } memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); @@ -349,99 +261,55 @@ int FilterEngine::proceed( const uchar* src, int srcstep, int count, } if( isSep ) - (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); + (*this_.rowFilter)(row, brow, width, CV_MAT_CN(this_.srcType)); } - int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); + int max_i = std::min(bufRows, this_.roi.height - (this_.dstY + dy) + (kheight - 1)); for( i = 0; i < max_i; i++ ) { - int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, - wholeSize.height, columnBorderType); + int srcY = borderInterpolate(this_.dstY + dy + i + this_.roi.y - ay, + this_.wholeSize.height, this_.columnBorderType); if( srcY < 0 ) // can happen only with constant border type - brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); + brows[i] = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); else { - CV_Assert( srcY >= startY ); - if( srcY >= startY + rowCount ) + CV_Assert(srcY >= this_.startY); + if( srcY >= this_.startY + 
this_.rowCount) break; - int bi = (srcY - startY0) % bufRows; - brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; + int bi = (srcY - this_.startY0) % bufRows; + brows[i] = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; } } if( i < kheight ) break; i -= kheight - 1; - if( isSeparable() ) - (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); + if (isSep) + (*this_.columnFilter)((const uchar**)brows, dst, dststep, i, this_.roi.width*cn); else - (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); + (*this_.filter2D)((const uchar**)brows, dst, dststep, i, this_.roi.width, cn); } - dstY += dy; - CV_Assert( dstY <= roi.height ); + this_.dstY += dy; + CV_Assert(this_.dstY <= this_.roi.height); return dy; } -void FilterEngine::apply(const Mat& src, Mat& dst, const Size & wsz, const Point & ofs) +void FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) { CV_INSTRUMENT_REGION(); - CV_Assert( src.type() == srcType && dst.type() == dstType ); + CV_DbgAssert(src.type() == this_.srcType && dst.type() == this_.dstType); - int y = start(src, wsz, ofs); - proceed(src.ptr() + y*src.step, + FilterEngine__start(this_, wsz, src.size(), ofs); + int y = this_.startY - ofs.y; + FilterEngine__proceed(this_, + src.ptr() + y*src.step, (int)src.step, - endY - startY, + this_.endY - this_.startY, dst.ptr(), (int)dst.step ); } -} - -/****************************************************************************************\ -* Separable linear filter * -\****************************************************************************************/ - -int cv::getKernelType(InputArray filter_kernel, Point anchor) -{ - Mat _kernel = filter_kernel.getMat(); - CV_Assert( _kernel.channels() == 1 ); - int i, sz = _kernel.rows*_kernel.cols; - - Mat kernel; - _kernel.convertTo(kernel, CV_64F); - - const double* coeffs = kernel.ptr(); - double sum = 0; - int type = KERNEL_SMOOTH + KERNEL_INTEGER; - if( 
(_kernel.rows == 1 || _kernel.cols == 1) && - anchor.x*2 + 1 == _kernel.cols && - anchor.y*2 + 1 == _kernel.rows ) - type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); - - for( i = 0; i < sz; i++ ) - { - double a = coeffs[i], b = coeffs[sz - i - 1]; - if( a != b ) - type &= ~KERNEL_SYMMETRICAL; - if( a != -b ) - type &= ~KERNEL_ASYMMETRICAL; - if( a < 0 ) - type &= ~KERNEL_SMOOTH; - if( a != saturate_cast(a) ) - type &= ~KERNEL_INTEGER; - sum += a; - } - - if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) - type &= ~KERNEL_SMOOTH; - return type; -} - - -namespace cv -{ - struct RowNoVec { RowNoVec() {} @@ -503,6 +371,8 @@ struct RowVec_8u32s int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; const int* _kx = kernel.ptr(); @@ -587,7 +457,6 @@ struct RowVec_8u32s i += v_uint32::nlanes; } } - vx_cleanup(); return i; } @@ -618,6 +487,8 @@ struct SymmRowSmallVec_8u32s int operator()(const uchar* src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -1083,8 +954,6 @@ struct SymmRowSmallVec_8u32s } } } - - vx_cleanup(); return i; } @@ -1107,6 +976,8 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1237,8 +1108,6 @@ struct SymmColumnVec_32s8u i += v_int32x4::nlanes; } } - - vx_cleanup(); return i; } @@ -1261,6 +1130,8 @@ struct SymmColumnSmallVec_32s16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -1420,8 +1291,6 @@ struct SymmColumnSmallVec_32s16s } } } - - vx_cleanup(); return i; } @@ 
-1443,6 +1312,8 @@ struct RowVec_16s32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); @@ -1495,7 +1366,6 @@ struct RowVec_16s32f v_store(dst + i, s0); i += v_float32::nlanes; } - vx_cleanup(); return i; } @@ -1516,6 +1386,8 @@ struct SymmColumnVec_32f16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1620,7 +1492,6 @@ struct SymmColumnVec_32f16s } } - vx_cleanup(); return i; } @@ -1653,6 +1524,8 @@ struct RowVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + #if defined USE_IPP_SEP_FILTERS CV_IPP_CHECK() { @@ -1722,7 +1595,6 @@ struct RowVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - vx_cleanup(); return i; } @@ -1782,6 +1654,8 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1868,8 +1742,6 @@ struct SymmRowSmallVec_32f v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); } } - - vx_cleanup(); return i; } @@ -1896,6 +1768,8 @@ struct SymmColumnVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -2005,8 +1879,6 @@ struct SymmColumnVec_32f i += v_float32::nlanes; } } - - vx_cleanup(); return i; } @@ -2030,6 +1902,8 @@ struct SymmColumnSmallVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + 
ksize2; int i = 0; @@ -2085,8 +1959,6 @@ struct SymmColumnSmallVec_32f v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); } } - - vx_cleanup(); return i; } @@ -2115,6 +1987,8 @@ struct FilterVec_8u int operator()(const uchar** src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; int i = 0, k, nz = _nz; @@ -2175,8 +2049,6 @@ struct FilterVec_8u *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); i += v_int32x4::nlanes; } - - vx_cleanup(); return i; } @@ -2201,6 +2073,8 @@ struct FilterVec_8u16s int operator()(const uchar** src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; short* dst = (short*)_dst; @@ -2251,8 +2125,6 @@ struct FilterVec_8u16s v_pack_store(dst + i, v_round(s0)); i += v_int32::nlanes; } - - vx_cleanup(); return i; } @@ -2275,6 +2147,8 @@ struct FilterVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + const float* kf = (const float*)&coeffs[0]; const float** src = (const float**)_src; float* dst = (float*)_dst; @@ -2323,8 +2197,6 @@ struct FilterVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - - vx_cleanup(); return i; } @@ -2369,6 +2241,8 @@ template struct RowFilter : public BaseRo void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int _ksize = ksize; const DT* kx = kernel.ptr
(); const ST* S; @@ -2427,6 +2301,8 @@ template struct SymmRowSmallFilter : void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2, ksize2n = ksize2*cn; const DT* kx = this->kernel.template ptr
() + ksize2; bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -2566,6 +2442,8 @@ template struct ColumnFilter : public BaseColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const ST* ky = kernel.template ptr(); ST _delta = delta; int _ksize = ksize; @@ -2629,6 +2507,8 @@ template struct SymmColumnFilter : public ColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i, k; @@ -2735,6 +2615,8 @@ struct SymmColumnSmallFilter : public SymmColumnFilter void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i; @@ -2904,13 +2786,14 @@ template struct FixedPtCastEx int SHIFT, DELTA; }; -} -cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, - InputArray _kernel, int anchor, - int symmetryType ) +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); int cn = CV_MAT_CN(srcType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ -2958,12 +2841,14 @@ cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, } -cv::Ptr cv::getLinearColumnFilter( int bufType, int dstType, - InputArray _kernel, int anchor, - int symmetryType, double delta, - int bits ) +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(dstType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ 
-3053,131 +2938,6 @@ cv::Ptr cv::getLinearColumnFilter( int bufType, int dstTyp } -cv::Ptr cv::createSeparableLinearFilter( - int _srcType, int _dstType, - InputArray __rowKernel, InputArray __columnKernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - int rsize = _rowKernel.rows + _rowKernel.cols - 1; - int csize = _columnKernel.rows + _columnKernel.cols - 1; - if( _anchor.x < 0 ) - _anchor.x = rsize/2; - if( _anchor.y < 0 ) - _anchor.y = csize/2; - int rtype = getKernelType(_rowKernel, - _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x)); - int ctype = getKernelType(_columnKernel, - _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y)); - Mat rowKernel, columnKernel; - - int bdepth = std::max(CV_32F,std::max(sdepth, ddepth)); - int bits = 0; - - if( sdepth == CV_8U && - ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ddepth == CV_8U) || - ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (rtype & ctype & KERNEL_INTEGER) && - ddepth == CV_16S)) ) - { - bdepth = CV_32S; - bits = ddepth == CV_8U ? 
8 : 0; - _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits ); - _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits ); - bits *= 2; - _delta *= (1 << bits); - } - else - { - if( _rowKernel.type() != bdepth ) - _rowKernel.convertTo( rowKernel, bdepth ); - else - rowKernel = _rowKernel; - if( _columnKernel.type() != bdepth ) - _columnKernel.convertTo( columnKernel, bdepth ); - else - columnKernel = _columnKernel; - } - - int _bufType = CV_MAKETYPE(bdepth, cn); - Ptr _rowFilter = getLinearRowFilter( - _srcType, _bufType, rowKernel, _anchor.x, rtype); - Ptr _columnFilter = getLinearColumnFilter( - _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits ); - - return Ptr( new FilterEngine(Ptr(), _rowFilter, _columnFilter, - _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue )); -} - - -/****************************************************************************************\ -* Non-separable linear filter * -\****************************************************************************************/ - -namespace cv -{ - -void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) -{ - int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); - if(nz == 0) - nz = 1; - CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F ); - coords.resize(nz); - coeffs.resize(nz*getElemSize(ktype)); - uchar* _coeffs = &coeffs[0]; - - for( i = k = 0; i < kernel.rows; i++ ) - { - const uchar* krow = kernel.ptr(i); - for( j = 0; j < kernel.cols; j++ ) - { - if( ktype == CV_8U ) - { - uchar val = krow[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - _coeffs[k++] = val; - } - else if( ktype == CV_32S ) - { - int val = ((const int*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((int*)_coeffs)[k++] = val; - } - else if( ktype == CV_32F ) - { - float val = ((const float*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((float*)_coeffs)[k++] = 
val; - } - else - { - double val = ((const double*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((double*)_coeffs)[k++] = val; - } - } - } -} - template struct Filter2D : public BaseFilter { @@ -3253,489 +3013,14 @@ template struct Filter2D : public BaseFi VecOp vecOp; }; -#ifdef HAVE_OPENCL -#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -// prepare kernel: transpose and make double rows (+align). Returns size of aligned row -// Samples: -// a b c -// Input: d e f -// g h i -// Output, last two zeros is the alignment: -// a d g a d g 0 0 -// b e h b e h 0 0 -// c f i c f i 0 0 -template -static int _prepareKernelFilter2D(std::vector & data, const Mat & kernel) +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& _kernel, Point anchor, + double delta, int bits) { - Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); - int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); - data.clear(); data.resize(size_y_aligned * kernel.cols, 0); - for (int x = 0; x < kernel.cols; x++) - { - for (int y = 0; y < kernel.rows; y++) - { - data[x * size_y_aligned + y] = _kernel.at(y, x); - data[x * size_y_aligned + y + kernel.rows] = _kernel.at(y, x); - } - } - return size_y_aligned; -} + CV_INSTRUMENT_REGION(); -static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor, - double delta, int borderType ) -{ - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - ddepth = ddepth < 0 ? 
sdepth : ddepth; - int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), - wtype = CV_MAKE_TYPE(wdepth, cn); - if (cn > 4) - return false; - - Size ksize = _kernel.size(); - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - const cv::ocl::Device &device = cv::ocl::Device::getDefault(); - bool doubleSupport = device.doubleFPConfig() > 0; - if (wdepth == CV_64F && !doubleSupport) - return false; - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", - "BORDER_WRAP", "BORDER_REFLECT_101" }; - - cv::Mat kernelMat = _kernel.getMat(); - cv::Size sz = _src.size(), wholeSize; - size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height }; - size_t localsize_general[2] = {0, 1}; - size_t* localsize = NULL; - - ocl::Kernel k; - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - size_t tryWorkItems = device.maxWorkGroupSize(); - if (device.isIntel() && 128 < tryWorkItems) - tryWorkItems = 128; - char cvt[2][40]; - - // For smaller filter kernels, there is a special kernel that is more - // efficient than the general one. - UMat kernalDataUMat; - if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) && - ((ksize.width < 5 && ksize.height < 5) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - kernelMat = kernelMat.reshape(0, 1); - String kerStr = ocl::kernelToStr(kernelMat, CV_32F); - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 
1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1; - int pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = sz.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = sz.width % 2 ? 1 : 2; - pxPerWorkItemY = sz.height % 2 ? 1 : 2; - } - globalsize[0] = sz.width / pxPerWorkItemX; - globalsize[1] = sz.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - char build_options[1024]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s %s", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? 
"BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); - - if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - std::vector kernelMatDataFloat; - int kernel_size_y2_aligned = _prepareKernelFilter2D(kernelMatDataFloat, kernelMat); - String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); - - for ( ; ; ) - { - size_t BLOCK_SIZE = tryWorkItems; - while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) - BLOCK_SIZE /= 2; - - if ((size_t)ksize.width > BLOCK_SIZE) - return false; - - int requiredTop = anchor.y; - int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; - int requiredBottom = ksize.height - 1 - anchor.y; - int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; - - if ((w < ksize.width) || (h < ksize.height)) - return false; - - String opts = format("-D LOCAL_SIZE=%d -D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s", - (int)BLOCK_SIZE, cn, anchor.x, anchor.y, - ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? 
"BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(), - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); - - localsize[0] = BLOCK_SIZE; - globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; - globalsize[1] = sz.height; - - if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) - return false; - - size_t kernelWorkGroupSize = k.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE < kernelWorkGroupSize) - return false; - tryWorkItems = kernelWorkGroupSize; - } - } - - _dst.create(sz, dtype); - UMat dst = _dst.getUMat(); - - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width); - int srcEndY = (isolated ? 
(srcOffsetY + sz.height) : wholeSize.height); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY, - srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta); - - return k.run(2, globalsize, localsize, false); -} - -const int shift_bits = 8; - -static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1, bool int_arithm) -{ - int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - Size bufSize = buf.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = {16, 10}; -#else - size_t localsize[2] = {16, 16}; -#endif - - size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; - if (fast8uc1) - globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; - - int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, - * const btype = borderMap[borderType & ~BORDER_ISOLATED]; - - bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1; - extra_extrapolation |= src.rows < radiusY; - extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; - extra_extrapolation |= src.cols < radiusX; - - char cvt[40]; - cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", - radiusX, (int)localsize[0], (int)localsize[1], cn, btype, - extra_extrapolation ? 
"EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - ocl::typeToStr(type), ocl::typeToStr(buf_type), - ocl::convertTypeStr(sdepth, bdepth, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelX, bdepth); - - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - String kernelName("row_filter"); - if (fast8uc1) - kernelName += "_C1_D0"; - - ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, - build_options); - if (k.empty()) - return false; - - if (fast8uc1) - k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), - buf.cols, buf.rows, radiusY); - else - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); - - return k.run(2, globalsize, localsize, false); -} - -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) -{ - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (dst.depth() == CV_64F && !doubleSupport) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = { 16, 10 }; -#else - size_t localsize[2] = { 16, 16 }; -#endif - size_t globalsize[2] = { 0, 0 }; - - int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); - Size sz = dst.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; - globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; - - char cvt[40]; - 
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", - anchor, (int)localsize[0], (int)localsize[1], cn, - ocl::typeToStr(buf_type), ocl::typeToStr(dtype), - ocl::convertTypeStr(bdepth, ddepth, cn, cvt), - ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), - 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelY, bdepth); - - ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, - build_options); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, globalsize, localsize, false); -} - -const int optimizedSepFilterLocalWidth = 16; -const int optimizedSepFilterLocalHeight = 8; - -static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth, int bdepth, bool int_arithm) -{ - Size size = _src.size(), wholeSize; - Point origin; - int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), - dtype = CV_MAKE_TYPE(ddepth, cn); - size_t src_step = _src.step(), src_offset = _src.offset(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if (esz == 0 || src_step == 0 - || (src_offset % src_step) % esz != 0 - || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - || !(borderType == BORDER_CONSTANT - || borderType == BORDER_REPLICATE - || borderType == BORDER_REFLECT - || borderType == BORDER_WRAP - || borderType == BORDER_REFLECT_101)) - return false; - - size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; - - char cvt[2][40]; - 
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", - "BORDER_REFLECT_101" }; - - String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" - " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", - (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), - ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), - cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - - ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, dtype); - UMat dst = _dst.getUMat(); - - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); - - src.locateROI(wholeSize, origin); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, - wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, gt2, lt2, false); -} - -bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - const ocl::Device & d = ocl::Device::getDefault(); - Size imgSize = _src.size(); - - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - if (cn > 4) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = 
_kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = kernelX.cols >> 1; - if (anchor.y < 0) - anchor.y = kernelY.cols >> 1; - - int rtype = getKernelType(kernelX, - kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); - int ctype = getKernelType(kernelY, - kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); - - int bdepth = CV_32F; - bool int_arithm = false; - if( sdepth == CV_8U && ddepth == CV_8U && - rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) - { - if (ocl::Device::getDefault().isIntel()) - { - for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); - if (kernelX.data != kernelY.data) - for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); - } else - { - bdepth = CV_32S; - kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); - kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); - } - int_arithm = true; - } - - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalWidth + anchor.x && - imgSize.height > optimizedSepFilterLocalHeight + anchor.y && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && - anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && - OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) - - UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; - - Size srcSize = src.size(); - Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) - return 
false; - - _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); -} - -#endif - -} - -cv::Ptr cv::getLinearFilter(int srcType, int dstType, - InputArray filter_kernel, Point anchor, - double delta, int bits) -{ - Mat _kernel = filter_kernel.getMat(); int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth(); CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); @@ -3806,476 +3091,6 @@ cv::Ptr cv::getLinearFilter(int srcType, int dstType, srcType, dstType)); } - -cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, - InputArray filter_kernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _kernel = filter_kernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - - Mat kernel = _kernel; - int bits = 0; - - /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor); - if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) && - _kernel.rows*_kernel.cols <= (1 << 10) ) - { - bits = (ktype & KERNEL_INTEGER) ? 
0 : 11; - _kernel.convertTo(kernel, CV_32S, 1 << bits); - }*/ - - Ptr _filter2D = getLinearFilter(_srcType, _dstType, - kernel, _anchor, _delta, bits); - - return makePtr(_filter2D, Ptr(), - Ptr(), _srcType, _dstType, _srcType, - _rowBorderType, _columnBorderType, _borderValue ); -} - - -//================================================================ -// HAL interface -//================================================================ - -using namespace cv; - -static bool replacementFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, bool isSubmatrix) -{ - cvhalFilter2D* ctx; - int res = cv_hal_filterInit(&ctx, kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, width, height, - stype, dtype, borderType, delta, anchor_x, anchor_y, isSubmatrix, src_data == dst_data); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_filter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_filterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -#ifdef HAVE_IPP -static bool ippFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - ::ipp::IwiSize iwSize(width, height); - ::ipp::IwiSize kernelSize(kernel_width, kernel_height); - IppDataType type 
= ippiGetDataType(CV_MAT_DEPTH(stype)); - int channels = CV_MAT_CN(stype); - - CV_UNUSED(isSubmatrix); - -#if IPP_VERSION_X100 >= 201700 && IPP_VERSION_X100 <= 201702 // IPP bug with 1x1 kernel - if(kernel_width == 1 && kernel_height == 1) - return false; #endif - -#if IPP_DISABLE_FILTER2D_BIG_MASK - // Too big difference compared to OpenCV FFT-based convolution - if(kernel_type == CV_32FC1 && (type == ipp16s || type == ipp16u) && (kernel_width > 7 || kernel_height > 7)) - return false; - - // Poor optimization for big kernels - if(kernel_width > 7 || kernel_height > 7) - return false; -#endif - - if(src_data == dst_data) - return false; - - if(stype != dtype) - return false; - - if(kernel_type != CV_16SC1 && kernel_type != CV_32FC1) - return false; - - // TODO: Implement offset for 8u, 16u - if(std::fabs(delta) >= DBL_EPSILON) - return false; - - if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) - return false; - - try - { - ::ipp::IwiBorderSize iwBorderSize; - ::ipp::IwiBorderType iwBorderType; - ::ipp::IwiImage iwKernel(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, (void*)kernel_data, kernel_step); - ::ipp::IwiImage iwSrc(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)src_data, src_step); - ::ipp::IwiImage iwDst(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)dst_data, dst_step); - - iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); - if(!iwBorderType) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilter, iwSrc, iwDst, iwKernel, ::ipp::IwiFilterParams(1, 0, ippAlgHintNone, ippRndFinancial), iwBorderType); - } - catch(const ::ipp::IwException& ex) - { - CV_UNUSED(ex); - return false; - } - - return true; -#else - CV_UNUSED(stype); 
CV_UNUSED(dtype); CV_UNUSED(kernel_type); CV_UNUSED(src_data); CV_UNUSED(src_step); - CV_UNUSED(dst_data); CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(full_width); - CV_UNUSED(full_height); CV_UNUSED(offset_x); CV_UNUSED(offset_y); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); - CV_UNUSED(kernel_width); CV_UNUSED(kernel_height); CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(delta); - CV_UNUSED(borderType); CV_UNUSED(isSubmatrix); - return false; -#endif -} -#endif - -static bool dftFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - { - int sdepth = CV_MAT_DEPTH(stype); - int ddepth = CV_MAT_DEPTH(dtype); - int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50; - if (kernel_width * kernel_height < dft_filter_size) - return false; - } - - Point anchor = Point(anchor_x, anchor_y); - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - - Mat src(Size(full_width-offset_x, full_height-offset_y), stype, src_data, src_step); - Mat dst(Size(full_width, full_height), dtype, dst_data, dst_step); - Mat temp; - int src_channels = CV_MAT_CN(stype); - int dst_channels = CV_MAT_CN(dtype); - int ddepth = CV_MAT_DEPTH(dtype); - // crossCorr doesn't accept non-zero delta with multiple channels - if (src_channels != 1 && delta != 0) { - // The semantics of filter2D require that the delta be applied - // as floating-point math. So wee need an intermediate Mat - // with a float datatype. If the dest is already floats, - // we just use that. 
- int corrDepth = ddepth; - if ((ddepth == CV_32F || ddepth == CV_64F) && src_data != dst_data) { - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - } else { - corrDepth = ddepth == CV_64F ? CV_64F : CV_32F; - temp.create(Size(full_width, full_height), CV_MAKETYPE(corrDepth, dst_channels)); - } - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(corrDepth, src_channels), - anchor, 0, borderType); - add(temp, delta, temp); - if (temp.data != dst_data) { - temp.convertTo(dst, dst.type()); - } - } else { - if (src_data != dst_data) - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - else - temp.create(Size(full_width, full_height), dtype); - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(ddepth, src_channels), - anchor, delta, borderType); - if (temp.data != dst_data) - temp.copyTo(dst); - } - return true; -} - -static void ocvFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - int borderTypeValue = borderType & ~BORDER_ISOLATED; - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - Ptr f = createLinearFilter(stype, dtype, kernel, Point(anchor_x, anchor_y), delta, - borderTypeValue); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -} - -static bool replacementSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, 
int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - cvhalFilter2D *ctx; - int res = cv_hal_sepFilterInit(&ctx, stype, dtype, ktype, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_sepFilter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_sepFilterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -static void ocvSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - Mat kernelX(Size(kernelx_len, 1), ktype, kernelx_data); - Mat kernelY(Size(kernely_len, 1), ktype, kernely_data); - Ptr f = createSeparableLinearFilter(stype, dtype, kernelX, kernelY, - Point(anchor_x, anchor_y), - delta, borderType & ~BORDER_ISOLATED); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -}; - -//=================================================================== -// HAL functions -//=================================================================== - -namespace cv { -namespace hal { - - -CV_DEPRECATED Ptr Filter2D::create(uchar * , size_t , int , - int , int , - int , int , - int , int , - int , double , - int , int , - bool , bool ) { return Ptr(); } - -CV_DEPRECATED Ptr SepFilter2D::create(int , int , int , - uchar * , int , - uchar * , int , - int , int , - double , int ) { return Ptr(); } - - -void filter2D(int stype, int dtype, int kernel_type, - uchar * src_data, 
size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ - bool res; - res = replacementFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix); - if (res) - return; - - CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix)) - - res = dftFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); - if (res) - return; - ocvFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); -} - -//--------------------------------------------------------------- - -void sepFilter2D(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - - bool res = replacementSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, 
- offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res) - return; - ocvSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, - offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); -} - -} // cv::hal:: -} // cv:: - -//================================================================ -// Main interface -//================================================================ - -void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor0, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType)) - - Mat src = _src.getMat(), kernel = _kernel.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - Point anchor = normalizeAnchor(anchor0, kernel.size()); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - hal::filter2D(src.type(), dst.type(), kernel.type(), - src.data, src.step, dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - kernel.data, kernel.step, kernel.cols, kernel.rows, - anchor.x, anchor.y, - delta, borderType, src.isSubmatrix()); -} - -void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), - ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) - - Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = 
_kernelY.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - CV_Assert( kernelX.type() == kernelY.type() && - (kernelX.cols == 1 || kernelX.rows == 1) && - (kernelY.cols == 1 || kernelY.rows == 1) ); - - Mat contKernelX = kernelX.isContinuous() ? kernelX : kernelX.clone(); - Mat contKernelY = kernelY.isContinuous() ? kernelY : kernelY.clone(); - - hal::sepFilter2D(src.type(), dst.type(), kernelX.type(), - src.data, src.step, dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - contKernelX.data, kernelX.cols + kernelX.rows - 1, - contKernelY.data, kernelY.cols + kernelY.rows - 1, - anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); -} - - -CV_IMPL void -cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); - cv::Mat kernel = cv::cvarrToMat(_kernel); - - CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() ); - - cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE ); -} - -/* End of file. 
*/ +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 8546ac3ce6bd955de51c743f9ca87b0f27a15f12 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 12:57:48 +0000 Subject: [PATCH 16/21] imgproc: get rid of filter.avx2.cpp --- modules/imgproc/src/filter.avx2.cpp | 197 ---------------------------- modules/imgproc/src/filter.hpp | 6 - modules/imgproc/src/filter.simd.hpp | 115 +++++++++++++--- 3 files changed, 99 insertions(+), 219 deletions(-) delete mode 100644 modules/imgproc/src/filter.avx2.cpp diff --git a/modules/imgproc/src/filter.avx2.cpp b/modules/imgproc/src/filter.avx2.cpp deleted file mode 100644 index e9ced20e36..0000000000 --- a/modules/imgproc/src/filter.avx2.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "filter.hpp" - -namespace cv -{ - -int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) -{ - int i = 0, k; - for (; i <= width - 8; i += 8) - { - const float* src = src0 + i; - __m256 f, x0; - __m256 s0 = _mm256_set1_ps(0.0f); - for (k = 0; k < _ksize; k++, src += cn) - { - f = _mm256_set1_ps(_kx[k]); - x0 = _mm256_loadu_ps(src); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - } - _mm256_storeu_ps(dst + i, s0); - } - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S, *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m256 f = _mm256_set1_ps(ky[0]); - __m256 s0, s1; - __m256 x0; - S = src[0] + i; - s0 = _mm256_loadu_ps(S); -#if CV_FMA3 - s0 = 
_mm256_fmadd_ps(s0, f, d8); -#else - s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); -#endif - s1 = _mm256_loadu_ps(S+8); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(s1, f, d8); -#else - s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); -#endif - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 f = _mm_set1_ps(ky[0]); - __m128 x0, s0 = _mm_load_ps(src[0] + i); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - - for( k = 1; k <= ksize2; k++ ) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for (; i <= width - 16; i += 16) - { - __m256 f, s0 = d8, s1 = d8; - __m256 x0; - - for (k = 1; k <= ksize2; k++) - { - const float *S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - 
_mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for (; i <= width - 4; i += 4) - { - __m128 f, x0, s0 = d4; - - for (k = 1; k <= ksize2; k++) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -} - -/* End of file. */ diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 198c8c336c..7b792d1935 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -45,12 +45,6 @@ namespace cv { -#if CV_TRY_AVX2 - int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize); - int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); - int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); -#endif - #ifdef HAVE_OPENCL bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, diff --git a/modules/imgproc/src/filter.simd.hpp b/modules/imgproc/src/filter.simd.hpp index 48675152fa..f09cd1ec1d 100644 --- a/modules/imgproc/src/filter.simd.hpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -1507,7 +1507,6 @@ struct RowVec_32f { RowVec_32f() { - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1516,7 +1515,6 @@ struct RowVec_32f RowVec_32f( const Mat& _kernel ) { kernel = _kernel; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1543,9 +1541,24 @@ struct RowVec_32f int i = 0, k; width *= cn; -#if CV_TRY_AVX2 - if (haveAVX2) - return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); +#if CV_AVX + for (; i <= width - 8; i += 8) + { + const float* src = src0 + i; + __m256 f, x0; + __m256 s0 = _mm256_set1_ps(0.0f); + for (k = 0; k < 
_ksize; k++, src += cn) + { + f = _mm256_set1_ps(_kx[k]); + x0 = _mm256_loadu_ps(src); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + } + _mm256_storeu_ps(dst + i, s0); + } #endif v_float32 k0 = vx_setall_f32(_kx[0]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -1599,7 +1612,6 @@ struct RowVec_32f } Mat kernel; - bool haveAVX2; #if defined USE_IPP_SEP_FILTERS private: mutable int bufsz; @@ -1754,7 +1766,6 @@ struct SymmColumnVec_32f { SymmColumnVec_32f() { symmetryType=0; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; delta = 0; } SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) @@ -1762,7 +1773,6 @@ struct SymmColumnVec_32f symmetryType = _symmetryType; kernel = _kernel; delta = (float)_delta; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } @@ -1780,9 +1790,53 @@ struct SymmColumnVec_32f if( symmetrical ) { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); +#if CV_AVX + { + const float *S, *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for( ; i <= width - 16; i += 16 ) + { + __m256 f = _mm256_set1_ps(ky[0]); + __m256 s0, s1; + __m256 x0; + S = src[0] + i; + s0 = _mm256_loadu_ps(S); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(s0, f, d8); +#else + s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); +#endif + s1 = _mm256_loadu_ps(S+8); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(s1, f, d8); +#else + s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); +#endif + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + 
s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k0 = vx_setall_f32(ky[0]); @@ -1830,11 +1884,41 @@ struct SymmColumnVec_32f } else { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif CV_DbgAssert(ksize2 > 0); +#if CV_AVX + { + const float *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for (; i <= width - 16; i += 16) + { + __m256 f, s0 = d8, s1 = d8; + __m256 x0; + + for (k = 1; k <= ksize2; k++) + { + const float *S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } +#endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k1 = vx_setall_f32(ky[1]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -1885,7 +1969,6 @@ struct SymmColumnVec_32f int symmetryType; float delta; Mat kernel; - bool haveAVX2; }; From 6ec08f268f90c39747d9ee0126821761e9b9ad31 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 18:55:42 +0000 Subject: [PATCH 17/21] imgproc: dispatch medianBlur --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/median_blur.dispatch.cpp | 937 +------------------ modules/imgproc/src/median_blur.simd.hpp | 288 +----- 3 files changed, 29 insertions(+), 1197 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d3afe151bd..c149edb9b3 100644 --- a/modules/imgproc/CMakeLists.txt +++ 
b/modules/imgproc/CMakeLists.txt @@ -4,4 +4,5 @@ ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/median_blur.dispatch.cpp b/modules/imgproc/src/median_blur.dispatch.cpp index c98cd9215a..d993fbad5b 100644 --- a/modules/imgproc/src/median_blur.dispatch.cpp +++ b/modules/imgproc/src/median_blur.dispatch.cpp @@ -50,895 +50,10 @@ #include "opencv2/core/openvx/ovx_defs.hpp" -/* - * This file includes the code, contributed by Simon Perreault - * (the function icvMedianBlur_8u_O1) - * - * Constant-time median filtering -- http://nomis80.org/ctmf.html - * Copyright (C) 2006 Simon Perreault - * - * Contact: - * Laboratoire de vision et systemes numeriques - * Pavillon Adrien-Pouliot - * Universite Laval - * Sainte-Foy, Quebec, Canada - * G1K 7P4 - * - * perreaul@gel.ulaval.ca - */ +#include "median_blur.simd.hpp" +#include "median_blur.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -/****************************************************************************************\ - Median Filter -\****************************************************************************************/ - -namespace cv -{ - -static void -medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) -{ - typedef ushort HT; - - /** - * This structure represents a two-tier histogram. The first tier (known as the - * "coarse" level) is 4 bit wide and the second tier (known as the "fine" level) - * is 8 bit wide. Pixels inserted in the fine level also get inserted into the - * coarse bucket designated by the 4 MSBs of the fine bucket value. - * - * The structure is aligned on 16 bits, which is a prerequisite for SIMD - * instructions. 
Each bucket is 16 bit wide, which means that extra care must be - * taken to prevent overflow. - */ - typedef struct - { - HT coarse[16]; - HT fine[16][16]; - } Histogram; - -/** - * HOP is short for Histogram OPeration. This macro makes an operation \a op on - * histogram \a h for pixel value \a x. It takes care of handling both levels. - */ -#define HOP(h,x,op) \ - h.coarse[x>>4] op, \ - *((HT*)h.fine + x) op - -#define COP(c,j,x,op) \ - h_coarse[ 16*(n*c+j) + (x>>4) ] op, \ - h_fine[ 16 * (n*(16*c+(x>>4)) + j) + (x & 0xF) ] op - - int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2; - CV_Assert(cn > 0 && cn <= 4); - size_t sstep = _src.step, dstep = _dst.step; - - int STRIPE_SIZE = std::min( _dst.cols, 512/cn ); - -#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16 -# define CV_ALIGNMENT CV_SIMD_WIDTH -#else -# define CV_ALIGNMENT 16 -#endif - - std::vector _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); - std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); - HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT); - HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT); - - for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) - { - int i, j, k, c, n = std::min(_dst.cols - x, STRIPE_SIZE) + r*2; - const uchar* src = _src.ptr() + x*cn; - uchar* dst = _dst.ptr() + (x - r)*cn; - - memset( h_coarse, 0, 16*n*cn*sizeof(h_coarse[0]) ); - memset( h_fine, 0, 16*16*n*cn*sizeof(h_fine[0]) ); - - // First row initialization - for( c = 0; c < cn; c++ ) - { - for( j = 0; j < n; j++ ) - COP( c, j, src[cn*j+c], += (HT)(r+2) ); - - for( i = 1; i < r; i++ ) - { - const uchar* p = src + sstep*std::min(i, m-1); - for ( j = 0; j < n; j++ ) - COP( c, j, p[cn*j+c], ++ ); - } - } - - for( i = 0; i < m; i++ ) - { - const uchar* p0 = src + sstep * std::max( 0, i-r-1 ); - const uchar* p1 = src + sstep * std::min( m-1, i+r ); - - for( c = 0; c < cn; c++ ) - { - Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H; - HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16]; - - memset(&H, 0, 
sizeof(H)); - memset(luc, 0, sizeof(luc)); - - // Update column histograms for the entire row. - for( j = 0; j < n; j++ ) - { - COP( c, j, p0[j*cn + c], -- ); - COP( c, j, p1[j*cn + c], ++ ); - } - - // First column initialization - for (k = 0; k < 16; ++k) - { -#if CV_SIMD256 - v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k])); -#elif CV_SIMD128 - v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k])); - v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8)); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); -#endif - } - -#if CV_SIMD256 - v_uint16x16 v_coarse = v256_load(H.coarse); -#elif CV_SIMD128 - v_uint16x8 v_coarsel = v_load(H.coarse); - v_uint16x8 v_coarseh = v_load(H.coarse + 8); -#endif - HT* px = h_coarse + 16 * n*c; - for( j = 0; j < 2*r; ++j, px += 16 ) - { -#if CV_SIMD256 - v_coarse += v256_load(px); -#elif CV_SIMD128 - v_coarsel += v_load(px); - v_coarseh += v_load(px + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.coarse[ind] += px[ind]; -#endif - } - - for( j = r; j < n-r; j++ ) - { - int t = 2*r*r + 2*r, b, sum = 0; - HT* segment; - - px = h_coarse + 16 * (n*c + std::min(j + r, n - 1)); -#if CV_SIMD256 - v_coarse += v256_load(px); - v_store(H.coarse, v_coarse); -#elif CV_SIMD128 - v_coarsel += v_load(px); - v_coarseh += v_load(px + 8); - v_store(H.coarse, v_coarsel); - v_store(H.coarse + 8, v_coarseh); -#else - for (int ind = 0; ind < 16; ++ind) - H.coarse[ind] += px[ind]; -#endif - - // Find median at coarse level - for ( k = 0; k < 16 ; ++k ) - { - sum += H.coarse[k]; - if ( sum > t ) - { - sum -= H.coarse[k]; - break; - } - } - CV_Assert( k < 16 ); - - /* Update corresponding histogram segment */ -#if CV_SIMD256 - v_uint16x16 v_fine; 
-#elif CV_SIMD128 - v_uint16x8 v_finel; - v_uint16x8 v_fineh; -#endif - if ( luc[k] <= j-r ) - { -#if CV_SIMD256 - v_fine = v256_setzero_u16(); -#elif CV_SIMD128 - v_finel = v_setzero_u16(); - v_fineh = v_setzero_u16(); -#else - memset(&H.fine[k], 0, 16 * sizeof(HT)); -#endif - px = h_fine + 16 * (n*(16 * c + k) + j - r); - for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16) - { -#if CV_SIMD256 - v_fine += v256_load(px); -#elif CV_SIMD128 - v_finel += v_load(px); - v_fineh += v_load(px + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] += px[ind]; -#endif - } - - if ( luc[k] < j+r+1 ) - { - px = h_fine + 16 * (n*(16 * c + k) + (n - 1)); -#if CV_SIMD256 - v_fine += v_mul_wrap(v256_load(px), v256_setall_u16(j + r + 1 - n)); -#elif CV_SIMD128 - v_finel += v_mul_wrap(v_load(px), v_setall_u16((ushort)(j + r + 1 - n))); - v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]); -#endif - luc[k] = (HT)(j+r+1); - } - } - else - { -#if CV_SIMD256 - v_fine = v256_load(H.fine[k]); -#elif CV_SIMD128 - v_finel = v_load(H.fine[k]); - v_fineh = v_load(H.fine[k] + 8); -#endif - px = h_fine + 16*n*(16 * c + k); - for ( ; luc[k] < j+r+1; ++luc[k] ) - { -#if CV_SIMD256 - v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); -#elif CV_SIMD128 - v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); - v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind]; -#endif - } - } - - px = h_coarse + 16 * (n*c + MAX(j - r, 0)); -#if CV_SIMD256 - v_store(H.fine[k], v_fine); - v_coarse -= v256_load(px); -#elif CV_SIMD128 - 
v_store(H.fine[k], v_finel); - v_store(H.fine[k] + 8, v_fineh); - v_coarsel -= v_load(px); - v_coarseh -= v_load(px + 8); -#else - for (int ind = 0; ind < 16; ++ind) - H.coarse[ind] -= px[ind]; -#endif - - /* Find median in segment */ - segment = H.fine[k]; - for ( b = 0; b < 16 ; b++ ) - { - sum += segment[b]; - if ( sum > t ) - { - dst[dstep*i+cn*j+c] = (uchar)(16*k + b); - break; - } - } - CV_Assert( b < 16 ); - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -#undef HOP -#undef COP -} - -static void -medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m ) -{ - #define N 16 - int zone0[4][N]; - int zone1[4][N*N]; - int x, y; - int n2 = m*m/2; - Size size = _dst.size(); - const uchar* src = _src.ptr(); - uchar* dst = _dst.ptr(); - int src_step = (int)_src.step, dst_step = (int)_dst.step; - int cn = _src.channels(); - const uchar* src_max = src + size.height*src_step; - CV_Assert(cn > 0 && cn <= 4); - - #define UPDATE_ACC01( pix, cn, op ) \ - { \ - int p = (pix); \ - zone1[cn][p] op; \ - zone0[cn][p >> 4] op; \ - } - - //CV_Assert( size.height >= nx && size.width >= nx ); - for( x = 0; x < size.width; x++, src += cn, dst += cn ) - { - uchar* dst_cur = dst; - const uchar* src_top = src; - const uchar* src_bottom = src; - int k, c; - int src_step1 = src_step, dst_step1 = dst_step; - - if( x % 2 != 0 ) - { - src_bottom = src_top += src_step*(size.height-1); - dst_cur += dst_step*(size.height-1); - src_step1 = -src_step1; - dst_step1 = -dst_step1; - } - - // init accumulator - memset( zone0, 0, sizeof(zone0[0])*cn ); - memset( zone1, 0, sizeof(zone1[0])*cn ); - - for( y = 0; y <= m/2; y++ ) - { - for( c = 0; c < cn; c++ ) - { - if( y > 0 ) - { - for( k = 0; k < m*cn; k += cn ) - UPDATE_ACC01( src_bottom[k+c], c, ++ ); - } - else - { - for( k = 0; k < m*cn; k += cn ) - UPDATE_ACC01( src_bottom[k+c], c, += m/2+1 ); - } - } - - if( (src_step1 > 0 && y < size.height-1) || - (src_step1 < 0 && size.height-y-1 > 0) ) - src_bottom += src_step1; - } - - for( y = 0; y < 
size.height; y++, dst_cur += dst_step1 ) - { - // find median - for( c = 0; c < cn; c++ ) - { - int s = 0; - for( k = 0; ; k++ ) - { - int t = s + zone0[c][k]; - if( t > n2 ) break; - s = t; - } - - for( k *= N; ;k++ ) - { - s += zone1[c][k]; - if( s > n2 ) break; - } - - dst_cur[c] = (uchar)k; - } - - if( y+1 == size.height ) - break; - - if( cn == 1 ) - { - for( k = 0; k < m; k++ ) - { - int p = src_top[k]; - int q = src_bottom[k]; - zone1[0][p]--; - zone0[0][p>>4]--; - zone1[0][q]++; - zone0[0][q>>4]++; - } - } - else if( cn == 3 ) - { - for( k = 0; k < m*3; k += 3 ) - { - UPDATE_ACC01( src_top[k], 0, -- ); - UPDATE_ACC01( src_top[k+1], 1, -- ); - UPDATE_ACC01( src_top[k+2], 2, -- ); - - UPDATE_ACC01( src_bottom[k], 0, ++ ); - UPDATE_ACC01( src_bottom[k+1], 1, ++ ); - UPDATE_ACC01( src_bottom[k+2], 2, ++ ); - } - } - else - { - assert( cn == 4 ); - for( k = 0; k < m*4; k += 4 ) - { - UPDATE_ACC01( src_top[k], 0, -- ); - UPDATE_ACC01( src_top[k+1], 1, -- ); - UPDATE_ACC01( src_top[k+2], 2, -- ); - UPDATE_ACC01( src_top[k+3], 3, -- ); - - UPDATE_ACC01( src_bottom[k], 0, ++ ); - UPDATE_ACC01( src_bottom[k+1], 1, ++ ); - UPDATE_ACC01( src_bottom[k+2], 2, ++ ); - UPDATE_ACC01( src_bottom[k+3], 3, ++ ); - } - } - - if( (src_step1 > 0 && src_bottom + src_step1 < src_max) || - (src_step1 < 0 && src_bottom + src_step1 >= src) ) - src_bottom += src_step1; - - if( y >= m/2 ) - src_top += src_step1; - } - } -#undef N -#undef UPDATE_ACC -} - - -namespace { - -struct MinMax8u -{ - typedef uchar value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const uchar* ptr) { return *ptr; } - void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; } - void operator()(arg_type& a, arg_type& b) const - { - int t = CV_FAST_CAST_8U(a - b); - b += t; a -= t; - } -}; - -struct MinMax16u -{ - typedef ushort value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const ushort* ptr) { return *ptr; } - void store(ushort* ptr, arg_type val) { *ptr = 
(ushort)val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -struct MinMax16s -{ - typedef short value_type; - typedef int arg_type; - enum { SIZE = 1 }; - arg_type load(const short* ptr) { return *ptr; } - void store(short* ptr, arg_type val) { *ptr = (short)val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -struct MinMax32f -{ - typedef float value_type; - typedef float arg_type; - enum { SIZE = 1 }; - arg_type load(const float* ptr) { return *ptr; } - void store(float* ptr, arg_type val) { *ptr = val; } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = std::min(a, b); - b = std::max(b, t); - } -}; - -#if CV_SIMD - -struct MinMaxVec8u -{ - typedef uchar value_type; - typedef v_uint8x16 arg_type; - enum { SIZE = v_uint8x16::nlanes }; - arg_type load(const uchar* ptr) { return v_load(ptr); } - void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_uint8 warg_type; - enum { WSIZE = v_uint8::nlanes }; - warg_type wload(const uchar* ptr) { return vx_load(ptr); } - void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec16u -{ - typedef ushort value_type; - typedef v_uint16x8 arg_type; - enum { SIZE = v_uint16x8::nlanes }; - arg_type load(const ushort* ptr) { return v_load(ptr); } - void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_uint16 warg_type; - enum { WSIZE = v_uint16::nlanes }; - warg_type 
wload(const ushort* ptr) { return vx_load(ptr); } - void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec16s -{ - typedef short value_type; - typedef v_int16x8 arg_type; - enum { SIZE = v_int16x8::nlanes }; - arg_type load(const short* ptr) { return v_load(ptr); } - void store(short* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_int16 warg_type; - enum { WSIZE = v_int16::nlanes }; - warg_type wload(const short* ptr) { return vx_load(ptr); } - void store(short* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - - -struct MinMaxVec32f -{ - typedef float value_type; - typedef v_float32x4 arg_type; - enum { SIZE = v_float32x4::nlanes }; - arg_type load(const float* ptr) { return v_load(ptr); } - void store(float* ptr, const arg_type &val) { v_store(ptr, val); } - void operator()(arg_type& a, arg_type& b) const - { - arg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#if CV_SIMD_WIDTH > 16 - typedef v_float32 warg_type; - enum { WSIZE = v_float32::nlanes }; - warg_type wload(const float* ptr) { return vx_load(ptr); } - void store(float* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif -}; - -#else - -typedef MinMax8u MinMaxVec8u; -typedef MinMax16u MinMaxVec16u; -typedef MinMax16s MinMaxVec16s; -typedef MinMax32f MinMaxVec32f; - -#endif - -template -static void -medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) -{ - typedef typename Op::value_type T; - typedef typename Op::arg_type WT; - 
typedef typename VecOp::arg_type VT; -#if CV_SIMD_WIDTH > 16 - typedef typename VecOp::warg_type WVT; -#endif - - const T* src = _src.ptr(); - T* dst = _dst.ptr(); - int sstep = (int)(_src.step/sizeof(T)); - int dstep = (int)(_dst.step/sizeof(T)); - Size size = _dst.size(); - int i, j, k, cn = _src.channels(); - Op op; - VecOp vop; - - if( m == 3 ) - { - if( size.width == 1 || size.height == 1 ) - { - int len = size.width + size.height - 1; - int sdelta = size.height == 1 ? cn : sstep; - int sdelta0 = size.height == 1 ? 0 : sstep - cn; - int ddelta = size.height == 1 ? cn : dstep; - - for( i = 0; i < len; i++, src += sdelta0, dst += ddelta ) - for( j = 0; j < cn; j++, src++ ) - { - WT p0 = src[i > 0 ? -sdelta : 0]; - WT p1 = src[0]; - WT p2 = src[i < len - 1 ? sdelta : 0]; - - op(p0, p1); op(p1, p2); op(p0, p1); - dst[j] = (T)p1; - } - return; - } - - size.width *= cn; - for( i = 0; i < size.height; i++, dst += dstep ) - { - const T* row0 = src + std::max(i - 1, 0)*sstep; - const T* row1 = src + i*sstep; - const T* row2 = src + std::min(i + 1, size.height-1)*sstep; - int limit = cn; - - for(j = 0;; ) - { - for( ; j < limit; j++ ) - { - int j0 = j >= cn ? j - cn : j; - int j2 = j < size.width - cn ? 
j + cn : j; - WT p0 = row0[j0], p1 = row0[j], p2 = row0[j2]; - WT p3 = row1[j0], p4 = row1[j], p5 = row1[j2]; - WT p6 = row2[j0], p7 = row2[j], p8 = row2[j2]; - - op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); - op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); - op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); - op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); - op(p4, p2); op(p6, p4); op(p4, p2); - dst[j] = (T)p4; - } - - if( limit == size.width ) - break; - -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE ) - { - WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn); - WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn); - WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - vop.store(dst+j, p4); - } -#endif - for( ; j <= size.width - VecOp::SIZE - cn; j += VecOp::SIZE ) - { - VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); - VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); - VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - vop.store(dst+j, p4); - } - - limit = size.width; - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - else if( m == 5 ) - { - if( size.width == 1 || size.height == 1 ) - { - int len = size.width + size.height - 1; - int sdelta = size.height == 1 ? cn : sstep; - int sdelta0 = size.height == 1 ? 
0 : sstep - cn; - int ddelta = size.height == 1 ? cn : dstep; - - for( i = 0; i < len; i++, src += sdelta0, dst += ddelta ) - for( j = 0; j < cn; j++, src++ ) - { - int i1 = i > 0 ? -sdelta : 0; - int i0 = i > 1 ? -sdelta*2 : i1; - int i3 = i < len-1 ? sdelta : 0; - int i4 = i < len-2 ? sdelta*2 : i3; - WT p0 = src[i0], p1 = src[i1], p2 = src[0], p3 = src[i3], p4 = src[i4]; - - op(p0, p1); op(p3, p4); op(p2, p3); op(p3, p4); op(p0, p2); - op(p2, p4); op(p1, p3); op(p1, p2); - dst[j] = (T)p2; - } - return; - } - - size.width *= cn; - for( i = 0; i < size.height; i++, dst += dstep ) - { - const T* row[5]; - row[0] = src + std::max(i - 2, 0)*sstep; - row[1] = src + std::max(i - 1, 0)*sstep; - row[2] = src + i*sstep; - row[3] = src + std::min(i + 1, size.height-1)*sstep; - row[4] = src + std::min(i + 2, size.height-1)*sstep; - int limit = cn*2; - - for(j = 0;; ) - { - for( ; j < limit; j++ ) - { - WT p[25]; - int j1 = j >= cn ? j - cn : j; - int j0 = j >= cn*2 ? j - cn*2 : j1; - int j3 = j < size.width - cn ? j + cn : j; - int j4 = j < size.width - cn*2 ? 
j + cn*2 : j3; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = rowk[j0]; p[k*5+1] = rowk[j1]; - p[k*5+2] = rowk[j]; p[k*5+3] = rowk[j3]; - p[k*5+4] = rowk[j4]; - } - - op(p[1], p[2]); op(p[0], p[1]); op(p[1], p[2]); op(p[4], p[5]); op(p[3], p[4]); - op(p[4], p[5]); op(p[0], p[3]); op(p[2], p[5]); op(p[2], p[3]); op(p[1], p[4]); - op(p[1], p[2]); op(p[3], p[4]); op(p[7], p[8]); op(p[6], p[7]); op(p[7], p[8]); - op(p[10], p[11]); op(p[9], p[10]); op(p[10], p[11]); op(p[6], p[9]); op(p[8], p[11]); - op(p[8], p[9]); op(p[7], p[10]); op(p[7], p[8]); op(p[9], p[10]); op(p[0], p[6]); - op(p[4], p[10]); op(p[4], p[6]); op(p[2], p[8]); op(p[2], p[4]); op(p[6], p[8]); - op(p[1], p[7]); op(p[5], p[11]); op(p[5], p[7]); op(p[3], p[9]); op(p[3], p[5]); - op(p[7], p[9]); op(p[1], p[2]); op(p[3], p[4]); op(p[5], p[6]); op(p[7], p[8]); - op(p[9], p[10]); op(p[13], p[14]); op(p[12], p[13]); op(p[13], p[14]); op(p[16], p[17]); - op(p[15], p[16]); op(p[16], p[17]); op(p[12], p[15]); op(p[14], p[17]); op(p[14], p[15]); - op(p[13], p[16]); op(p[13], p[14]); op(p[15], p[16]); op(p[19], p[20]); op(p[18], p[19]); - op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[21], p[23]); op(p[22], p[24]); - op(p[22], p[23]); op(p[18], p[21]); op(p[20], p[23]); op(p[20], p[21]); op(p[19], p[22]); - op(p[22], p[24]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[12], p[18]); - op(p[16], p[22]); op(p[16], p[18]); op(p[14], p[20]); op(p[20], p[24]); op(p[14], p[16]); - op(p[18], p[20]); op(p[22], p[24]); op(p[13], p[19]); op(p[17], p[23]); op(p[17], p[19]); - op(p[15], p[21]); op(p[15], p[17]); op(p[19], p[21]); op(p[13], p[14]); op(p[15], p[16]); - op(p[17], p[18]); op(p[19], p[20]); op(p[21], p[22]); op(p[23], p[24]); op(p[0], p[12]); - op(p[8], p[20]); op(p[8], p[12]); op(p[4], p[16]); op(p[16], p[24]); op(p[12], p[16]); - op(p[2], p[14]); op(p[10], p[22]); op(p[10], p[14]); op(p[6], p[18]); op(p[6], p[10]); - op(p[10], p[12]); op(p[1], p[13]); op(p[9], 
p[21]); op(p[9], p[13]); op(p[5], p[17]); - op(p[13], p[17]); op(p[3], p[15]); op(p[11], p[23]); op(p[11], p[15]); op(p[7], p[19]); - op(p[7], p[11]); op(p[11], p[13]); op(p[11], p[12]); - dst[j] = (T)p[12]; - } - - if( limit == size.width ) - break; - -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE ) - { - WVT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn); - p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn); - p[k*5+4] = vop.wload(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); 
vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); - vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); - vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); - vop.store(dst+j, p[12]); - } -#endif - for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE ) - { - VT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn); - p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn); - p[k*5+4] = vop.load(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); 
vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); - vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); - vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); - vop.store(dst+j, p[12]); - } - - limit = size.width; - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } -} +namespace cv { #ifdef HAVE_OPENCL @@ -1160,7 +275,6 @@ static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize) } } #endif -} void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) { @@ -1194,49 +308,10 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) return; #endif - bool useSortNet = ksize == 3 || (ksize == 5 -#if !(CV_SIMD) - && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 ) -#endif - ); - - Mat src; - if( useSortNet ) - { - if( dst.data != src0.data ) - src = src0; - else - src0.copyTo(src); - - if( src.depth() == CV_8U ) - medianBlur_SortNet( src, dst, ksize ); - else if( src.depth() == CV_16U ) - medianBlur_SortNet( src, dst, ksize ); - else if( 
src.depth() == CV_16S ) - medianBlur_SortNet( src, dst, ksize ); - else if( src.depth() == CV_32F ) - medianBlur_SortNet( src, dst, ksize ); - else - CV_Error(CV_StsUnsupportedFormat, ""); - - return; - } - else - { - cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE|BORDER_ISOLATED); - - int cn = src0.channels(); - CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) ); - - double img_size_mp = (double)(src0.total())/(1 << 20); - if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)* - (CV_SIMD ? 1 : 3)) - medianBlur_8u_Om( src, dst, ksize ); - else - medianBlur_8u_O1( src, dst, ksize ); - } + CV_CPU_DISPATCH(medianBlur, (src0, dst, ksize), + CV_CPU_DISPATCH_MODES_ALL); } -} +} // namespace /* End of file. */ diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index c98cd9215a..c3203f2a07 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -46,9 +46,11 @@ #include #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" +#ifdef _MSC_VER +#pragma warning(disable: 4244) // warning C4244: 'argument': conversion from 'int' to 'ushort', possible loss of data + // triggered on intrinsic code from medianBlur_8u_O1() +#endif /* * This file includes the code, contributed by Simon Perreault @@ -71,12 +73,18 @@ Median Filter \****************************************************************************************/ -namespace cv -{ +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY static void medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) { + CV_INSTRUMENT_REGION(); + typedef ushort HT; /** @@ -330,9 +338,6 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) } } } -#if CV_SIMD - vx_cleanup(); -#endif } #undef 
HOP @@ -342,6 +347,8 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) static void medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m ) { + CV_INSTRUMENT_REGION(); + #define N 16 int zone0[4][N]; int zone1[4][N*N]; @@ -671,6 +678,8 @@ template static void medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) { + CV_INSTRUMENT_REGION(); + typedef typename Op::value_type T; typedef typename Op::arg_type WT; typedef typename VecOp::arg_type VT; @@ -770,9 +779,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) limit = size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif } else if( m == 5 ) { @@ -934,266 +940,15 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) limit = size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif } } -#ifdef HAVE_OPENCL +} // namespace anon -#define DIVUP(total, grain) ((total + grain - 1) / (grain)) - -static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m) -{ - size_t localsize[2] = { 16, 16 }; - size_t globalsize[2]; - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) ) - return false; - - Size imgSize = _src.size(); - bool useOptimized = (1 == cn) && - (size_t)imgSize.width >= localsize[0] * 8 && - (size_t)imgSize.height >= localsize[1] * 8 && - imgSize.width % 4 == 0 && - imgSize.height % 4 == 0 && - (ocl::Device::getDefault().isIntel()); - - cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ; - cv::String kdefs = useOptimized ? 
- format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type), - ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn) - : - format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ; - - ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() ); - - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(src.size(), type); - UMat dst = _dst.getUMat(); - - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); - - if( useOptimized ) - { - globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0]; - globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1]; - } - else - { - globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0]; - globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1]; - } - - return k.run(2, globalsize, localsize, false); -} - -#undef DIVUP - -#endif - -#ifdef HAVE_OPENVX -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 1280 * 720; } -} -static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize) -{ - if (_src.type() != CV_8UC1 || _dst.type() != CV_8U -#ifndef VX_VERSION_1_1 - || ksize != 3 -#endif - ) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ( -#ifdef VX_VERSION_1_1 - ksize != 3 ? 
ovx::skipSmallImages(src.cols, src.rows) : -#endif - ovx::skipSmallImages(src.cols, src.rows) - ) - return false; - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); -#ifdef VX_VERSION_1_1 - if ((vx_size)ksize > ctx.nonlinearMaxDimension()) - return false; -#endif - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(VX_BORDER_REPLICATE); -#ifdef VX_VERSION_1_1 - if (ksize == 3) -#endif - { - ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib)); - } -#ifdef VX_VERSION_1_1 - else - { - ivx::Matrix mtx; - if(ksize == 5) - mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize); - else - { - vx_size supportedSize; - ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize))); - if ((vx_size)ksize > supportedSize) - { - ctx.setImmediateBorder(prevBorder); - return false; - } - Mat mask(ksize, ksize, CV_8UC1, Scalar(255)); - mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize); - mtx.copyFrom(mask); - } - ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib)); - } -#endif - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; -} -#endif - -#ifdef HAVE_IPP -static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize) -{ - 
CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Degradations for big kernel - if(ksize > 7) - return false; -#endif - - { - int bufSize; - IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize); - IppDataType ippType = ippiGetDataType(src0.type()); - int channels = src0.channels(); - IppAutoBuffer buffer; - - if(src0.isSubmatrix()) - return false; - - Mat src; - if(dst.data != src0.data) - src = src0; - else - src0.copyTo(src); - - if(ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, ippType, channels, &bufSize) < 0) - return false; - - buffer.allocate(bufSize); - - switch(ippType) - { - case ipp8u: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp16u: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp16s: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C1R, src.ptr(), (int)src.step, 
dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp32f: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_32f_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - default: - return false; - } - } -} -#endif -} - -void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) +void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize) { CV_INSTRUMENT_REGION(); - CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 )); - - if( ksize <= 1 || _src0.empty() ) - { - _src0.copyTo(_dst); - return; - } - - CV_OCL_RUN(_dst.isUMat(), - ocl_medianFilter(_src0,_dst, ksize)) - - Mat src0 = _src0.getMat(); - _dst.create( src0.size(), src0.type() ); - Mat dst = _dst.getMat(); - - CALL_HAL(medianBlur, cv_hal_medianBlur, src0.data, src0.step, dst.data, dst.step, src0.cols, src0.rows, src0.depth(), - src0.channels(), ksize); - - CV_OVX_RUN(true, - openvx_medianFilter(_src0, _dst, ksize)) - - CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize)); - -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize)) - return; -#endif - bool useSortNet = ksize == 3 || (ksize == 5 #if !(CV_SIMD) && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 ) @@ -1223,6 +978,7 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) } else { + // TODO AVX guard (external call) cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE|BORDER_ISOLATED); int cn = src0.channels(); 
@@ -1237,6 +993,6 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) } } -} - -/* End of file. */ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From b99c9145bfbc4fca4d8358e988fe1322f807df88 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 19:20:24 +0000 Subject: [PATCH 18/21] imgproc: dispatch smooth --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/fixedpoint.inl.hpp | 5 +- modules/imgproc/src/smooth.dispatch.cpp | 1990 +---------------------- modules/imgproc/src/smooth.simd.hpp | 541 +----- 4 files changed, 49 insertions(+), 2488 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index c149edb9b3..d28d6b9046 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -5,4 +5,5 @@ ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp index a1a75a29e1..40b1c3faa1 100644 --- a/modules/imgproc/src/fixedpoint.inl.hpp +++ b/modules/imgproc/src/fixedpoint.inl.hpp @@ -9,10 +9,7 @@ #ifndef _CV_FIXEDPOINT_HPP_ #define _CV_FIXEDPOINT_HPP_ -#include "opencv2/core/softfloat.hpp" - -namespace -{ +namespace { class fixedpoint64 { diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp index 909ffa919c..4e514eb8b8 100644 --- a/modules/imgproc/src/smooth.dispatch.cpp +++ b/modules/imgproc/src/smooth.dispatch.cpp @@ -52,13 +52,22 @@ #include "filter.hpp" +#include "opencv2/core/softfloat.hpp" + +namespace cv { #include "fixedpoint.inl.hpp" +} + +#include "smooth.simd.hpp" +#include "smooth.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE 
based on CMakeLists.txt content + +namespace cv { /****************************************************************************************\ Gaussian Blur \****************************************************************************************/ -cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) +Mat getGaussianKernel(int n, double sigma, int ktype) { CV_Assert(n > 0); const int SMALL_GAUSSIAN_SIZE = 7; @@ -112,8 +121,6 @@ cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) return kernel; } -namespace cv { - template static std::vector getFixedpointGaussianKernel( int n, double sigma ) { @@ -161,1964 +168,6 @@ static std::vector getFixedpointGaussianKernel( int n, double sigma ) return kernel; }; -template -void hlineSmooth1N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int) -{ - for (int i = 0; i < len*cn; i++, src++, dst++) - *dst = (*m) * (*src); -} -template <> -void hlineSmooth1N(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int) -{ - int lencn = len*cn; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul = vx_setall_u16(*((uint16_t*)m)); - for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_mul_wrap(v_mul, vx_load_expand(src + i))); -#endif - for (; i < lencn; i++) - dst[i] = m[0] * src[i]; -} -template -void hlineSmooth1N1(const ET* src, int cn, const FT*, int, FT* dst, int len, int) -{ - for (int i = 0; i < len*cn; i++, src++, dst++) - *dst = *src; -} -template <> -void hlineSmooth1N1(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int) -{ - int lencn = len*cn; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ) - v_store((uint16_t*)dst + i, v_shl<8>(vx_load_expand(src + i))); -#endif - for (; i < lencn; i++) - dst[i] = src[i]; -} -template -void hlineSmooth3N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if 
(len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? m[0] + m[1] + m[2] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[2] * src[cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[0] * src[src_idx*cn + k]; - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[2] * src[src_idx + k]; - } - } -} -template <> -void hlineSmooth3N(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
m[0] + m[1] + m[2] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[2] * src[cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[0] * src[src_idx*cn + k]; - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1) + - v_mul_wrap(vx_load_expand(src + cn), v_mul2)); -#endif - for (; i < lencn; i++, src++, dst++) - *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[2] * src[src_idx + k]; - } - } -} -template -void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - if(borderType != BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = FT(src[k]); - else - for (int k = 0; k < cn; k++) - dst[k] = FT(src[k])>>1; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k])>>1) + (FT(src[cn + k])>>2); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values 
are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (FT(src[src_idx*cn + k])>>2); - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1); - - // Point that fall right from border - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k - cn])>>2) + (FT(src[k])>>1); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (FT(src[src_idx + k])>>2); - } - } -} -template <> -void hlineSmooth3N121(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType != BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = ufixedpoint16(src[k]); - else - for (int k = 0; k < cn; k++) - dst[k] = ufixedpoint16(src[k]) >> 1; - } - else - { - // Point that fall left from border - for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k])>>1) + (ufixedpoint16(src[cn + k])>>2); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (ufixedpoint16(src[src_idx*cn + k])>>2); - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << 6); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = (uint16_t(src[-cn]) + uint16_t(src[cn]) + (uint16_t(src[0]) << 1)) << 6; - - // Point that fall right from border - for (int 
k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k - cn])>>2) + (ufixedpoint16(src[k])>>1); - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + (ufixedpoint16(src[src_idx + k])>>2); - } - } -} -template -void hlineSmooth3Naba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? (m[0]<<1) + m[1] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k] + m[0] * src[src_idx*cn + k]; - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k]; - } - - src += cn; dst += cn; - for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = m[1] * src[0] + m[0] * src[-cn] + m[0] * src[cn]; - - // Point that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[k - cn] + m[0] * src[src_idx + k]; - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - } - } -} -template <> -void hlineSmooth3Naba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
(m[0]<<1) + m[1] : m[1]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else - { - // Point that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = borderInterpolate(-1, len, borderType); - for (int k = 0; k < cn; k++) - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[src_idx*cn + k])); - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[1] * src[k] + m[0] * src[cn + k]; - } - - src += cn; dst += cn; - int i = cn, lencn = (len - 1)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul0) + - v_mul_wrap(vx_load_expand(src), v_mul1)); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])); - - // Point that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; - for (int k = 0; k < cn; k++) - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[src_idx + k])); - } - else - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k - cn] + m[1] * src[k]; - } - } -} -template -void hlineSmooth5N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? 
m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k+cn]; - dst[k+cn] = m[1] * src[k] + m[2] * src[k+cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k ] = m[1] * src[k + idxm1] + m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2*cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2*cn]; - dst[k + 2*cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2*cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k ] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2*cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2*cn] + m[0] * src[k + idxm1] + m[4] * src[k + idxp1]; - dst[k + 2*cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2*cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[cn + k] + m[4] * src[2*cn + k]; - dst[k + cn] = m[1] * src[k] + m[2] * 
src[cn + k] + m[3] * src[2*cn + k] + m[4] * src[3*cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2*cn; dst += 2*cn; - for (int i = 2*cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2*cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len+1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[4] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[3] * src[idxp1 + k] + m[4] * src[idxp2 + k]; - } - } - } -} -template <> -void hlineSmooth5N(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[1] * src[k + idxm1] + m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2 * cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2 * cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[k + cn] + m[4] * src[k + 2 * cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[3] * src[k + 2 * cn] + m[0] * src[k + idxm1] + m[4] * src[k + idxp1]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn] + m[3] * src[k + idxp1] + m[4] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[3] * src[cn + k] + m[4] * src[2 * cn + k]; - dst[k + cn] = m[1] * 
src[k] + m[2] * src[cn + k] + m[3] * src[2 * cn + k] + m[4] * src[3 * cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2*cn, lencn = (len - 2)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - v_uint16 v_mul3 = vx_setall_u16(_m[3]); - v_uint16 v_mul4 = vx_setall_u16(_m[4]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2) + - v_mul_wrap(vx_load_expand(src + cn), v_mul3) + - v_mul_wrap(vx_load_expand(src + 2 * cn), v_mul4)); -#endif - for (; i < lencn; i++, src++, dst++) - *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2 * cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[4] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[3] * src[idxp1 + k] + m[4] * 
src[idxp2 + k]; - } - } - } -} -template -void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k])>>3)*(uint8_t)3; - else - for (int k = 0; k < cn; k++) - dst[k] = src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2); - dst[k + cn] = (FT(src[k]) >> 2) + (FT(src[k + cn])>>4)*(uint8_t)6; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + idxm1])>>2) + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp2])>>4); - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k])>>4); - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxm1])>>2) + (FT(src[k + 2 * cn])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + 
(FT(src[k + 2 * cn])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp1])>>4); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k])>>4) + (FT(src[k + idxp2])>>4); - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[cn + k])>>2) + (FT(src[2 * cn + k])>>4); - dst[k + cn] = (FT(src[cn + k])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[2 * cn + k])>>2) + (FT(src[3 * cn + k])>>4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (FT(src[idxm2 + k])>>4) + (FT(src[idxm1 + k])>>2); - dst[k + cn] = dst[k + cn] + (FT(src[idxm1 + k])>>4); - } - } - - src += 2 * cn; dst += 2 * cn; - for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = (FT(src[0])>>4)*(uint8_t)6 + (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[-2 * cn])>>4) + (FT(src[2 * cn])>>4); - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k - cn])>>2) + (FT(src[k + cn])>>2) + (FT(src[k - 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k - cn])>>4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (FT(src[idxp1 + k])>>4); - dst[k + cn] = dst[k + cn] + (FT(src[idxp1 + k])>>2) + (FT(src[idxp2 + k])>>4); - } - } - } -} -template <> -void hlineSmooth5N14641(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* 
dst, int len, int borderType) -{ - if (len == 1) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k])>>3) * (uint8_t)3; - else - { - for (int k = 0; k < cn; k++) - dst[k] = src[k]; - } - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2); - dst[k + cn] = (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k]) >> 4); - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; 
k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp1]) >> 4); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[cn + k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 4); - dst[k + cn] = (ufixedpoint16(src[cn + k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 2) + (ufixedpoint16(src[3 * cn + k]) >> 4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (ufixedpoint16(src[idxm2 + k]) >> 4) + (ufixedpoint16(src[idxm1 + k]) >> 2); - dst[k + cn] = dst[k + cn] + (ufixedpoint16(src[idxm1 + k]) >> 4); - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 v_6 = vx_setall_u16(6); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, (v_mul_wrap(vx_load_expand(src), v_6) + ((vx_load_expand(src - cn) + vx_load_expand(src + cn)) << 2) + vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn)) << 4); -#endif - for (; i < lencn; i++, src++, dst++) - *((uint16_t*)dst) = 
(uint16_t(src[0]) * 6 + ((uint16_t(src[-cn]) + uint16_t(src[cn])) << 2) + uint16_t(src[-2 * cn]) + uint16_t(src[2 * cn])) << 4; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k - cn]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k - 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k - cn]) >> 4); - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + (ufixedpoint16(src[idxp1 + k]) >> 4); - dst[k + cn] = dst[k + cn] + (ufixedpoint16(src[idxp1 + k]) >> 2) + (ufixedpoint16(src[idxp2 + k]) >> 4); - } - } - } -} -template -void hlineSmooth5Nabcba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) -{ - if (len == 1) - { - FT msum = borderType != BORDER_CONSTANT ? 
((m[0] + m[1])<<1) + m[2] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[1] * src[k + idxm1] + m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + idxp1] + m[0] * src[k + idxm2]; - dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn] + m[0] * src[k + idxm1] + m[0] * src[k + idxp1]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; - } - } - } - else - { - // Points that fall left from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; - dst[k + cn] = m[1] * src[k] + 
m[2] * src[cn + k] + m[1] * src[2 * cn + k] + m[0] * src[3 * cn + k]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; - dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) - *dst = m[0] * src[-2 * cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2 * cn]; - - // Points that fall right from border - for (int k = 0; k < cn; k++) - { - dst[k] = m[0] * src[k - 2 * cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - dst[k] = dst[k] + m[0] * src[idxp1 + k]; - dst[k + cn] = dst[k + cn] + m[1] * src[idxp1 + k] + m[0] * src[idxp2 + k]; - } - } - } -} -template <> -void hlineSmooth5Nabcba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) -{ - if (len == 1) - { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
((m[0] + m[1]) << 1) + m[2] : m[2]; - for (int k = 0; k < cn; k++) - dst[k] = msum * src[k]; - } - else if (len == 2) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn]; - dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(2, len, borderType)*cn; - int idxp2 = borderInterpolate(3, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxp1]) + (uint16_t)(src[k + idxm2])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + cn]; - } - } - } - else if (len == 3) - { - if (borderType == BORDER_CONSTANT) - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 2 * cn])) + ((uint16_t*)m)[2] * src[k + cn]; - dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; - } - else - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - int idxp1 = borderInterpolate(3, len, borderType)*cn; - int idxp2 = borderInterpolate(4, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxm1])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + 2 * cn]) + (uint16_t)(src[k + idxm2])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[2] * src[k + cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 
2 * cn])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp1])); - ((uint16_t*)dst)[k + 2 * cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + 2 * cn]; - } - } - } - else - { - // Points that fall left from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxm2 = borderInterpolate(-2, len, borderType)*cn; - int idxm1 = borderInterpolate(-1, len, borderType)*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[idxm1 + k])) + ((uint16_t*)m)[0] * ((uint16_t)(src[2 * cn + k]) + (uint16_t)(src[idxm2 + k])); - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 * cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * ((uint16_t)(src[3 * cn + k]) + (uint16_t)(src[idxm1 + k])); - } - } - else - { - for (int k = 0; k < cn; k++) - { - dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 * cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * src[3 * cn + k]; - } - } - - src += 2 * cn; dst += 2 * cn; - int i = 2 * cn, lencn = (len - 2)*cn; -#if CV_SIMD - const uint16_t* _m = (const uint16_t*)m; - const int VECSZ = v_uint16::nlanes; - v_uint16 v_mul0 = vx_setall_u16(_m[0]); - v_uint16 v_mul1 = vx_setall_u16(_m[1]); - v_uint16 v_mul2 = vx_setall_u16(_m[2]); - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - v_store((uint16_t*)dst, v_mul_wrap(vx_load_expand(src - 2 * cn) + vx_load_expand(src + 2 * cn), v_mul0) + - v_mul_wrap(vx_load_expand(src - cn) + vx_load_expand(src + cn), v_mul1) + - v_mul_wrap(vx_load_expand(src), v_mul2)); -#endif - for (; i < lencn; i++, 
src++, dst++) - *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0]; - - // Points that fall right from border - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; - int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - 2 * cn]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; - ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[idxp2 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[2] * src[k + cn]; - } - } - else - { - for (int k = 0; k < cn; k++) - { - ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * src[k - 2 * cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; - dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; - } - } - } -} -template -void hlineSmooth(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift-i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + 
post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - for (; i < (len - post_shift + 1)*cn; i++, src++, dst++) - { - *dst = m[0] * src[0]; - for (int j = 1; j < n; j++) - *dst = *dst + m[j] * src[j*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template <> -void hlineSmooth(const uint8_t* src, int cn, const ufixedpoint16* m, int n, ufixedpoint16* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i 
+ post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i+=VECSZ, src+=VECSZ, dst+=VECSZ) - { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src), vx_setall_u16(*((uint16_t*)m))); - for (int j = 1; j < n; j++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn), vx_setall_u16(*((uint16_t*)(m + j)))); - v_store((uint16_t*)dst, v_res0); - } -#endif - for (; i < lencn; i++, src++, dst++) - { - *dst = m[0] * src[0]; - for (int j = 1; j < n; j++) - *dst = *dst + m[j] * src[j*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template -void hlineSmoothONa_yzy_a(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int 
j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - for (; i < (len - post_shift + 1)*cn; i++, src++, dst++) - { - *dst = m[pre_shift] * src[pre_shift*cn]; - for (int j = 0; j < pre_shift; j++) - *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n-1-j)*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template <> -void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, const ufixedpoint16* m, int n, ufixedpoint16* dst, int len, int borderType) -{ - int pre_shift = n / 2; - int post_shift = n - pre_shift; - int i = 0; - for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[pre_shift - i] * src[k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - int j, mid; - for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) - for 
(int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT) - for (; j < i + post_shift; j++, mid++) - { - int src_idx = borderInterpolate(j, len, borderType); - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; - } - } - i *= cn; - int lencn = (len - post_shift + 1)*cn; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ) - { - v_uint16 v_res0 = v_mul_wrap(vx_load_expand(src + pre_shift * cn), vx_setall_u16(*((uint16_t*)(m + pre_shift)))); - for (int j = 0; j < pre_shift; j ++) - v_res0 += v_mul_wrap(vx_load_expand(src + j * cn) + vx_load_expand(src + (n - 1 - j)*cn), vx_setall_u16(*((uint16_t*)(m + j)))); - v_store((uint16_t*)dst, v_res0); - } -#endif - for (; i < lencn; i++, src++, dst++) - { - *dst = m[pre_shift] * src[pre_shift*cn]; - for (int j = 0; j < pre_shift; j++) - *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn]; - } - i /= cn; - for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border - { - for (int k = 0; k < cn; k++) - dst[k] = m[0] * src[k]; - int j = 1; - for (; j < len - i; j++) - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[j*cn + k]; - if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - for (; j < n; j++) - { - int src_idx = borderInterpolate(i + j, len, borderType) - i; - for (int k = 0; k < cn; k++) - dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; - } - } -} -template -void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - const FT* src0 = src[0]; - for (int i = 0; i < len; i++) - dst[i] = *m * src0[i]; -} -template <> -void vlineSmooth1N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - const ufixedpoint16* src0 = src[0]; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - v_uint16 
v_mul = vx_setall_u16(*((uint16_t*)m)<<1); - for (; i <= len - VECSZ; i += VECSZ) - v_rshr_pack_store<1>(dst + i, v_mul_hi(vx_load((uint16_t*)src0 + i), v_mul)); -#endif - for (; i < len; i++) - dst[i] = m[0] * src0[i]; -} -template -void vlineSmooth1N1(const FT* const * src, const FT*, int, ET* dst, int len) -{ - const FT* src0 = src[0]; - for (int i = 0; i < len; i++) - dst[i] = src0[i]; -} -template <> -void vlineSmooth1N1(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - const ufixedpoint16* src0 = src[0]; - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= len - VECSZ; i += VECSZ) - v_rshr_pack_store<8>(dst + i, vx_load((uint16_t*)(src0 + i))); -#endif - for (; i < len; i++) - dst[i] = src0[i]; -} -template -void vlineSmooth3N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i]; -} -template <> -void vlineSmooth3N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - v_int16 v_mul2 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 2)))); - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - 
v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01); - - v_int32 v_resj0, v_resj1; - const int16_t* src2 = (const int16_t*)src[2] + i; - v_src00 = vx_load(src2); - v_src01 = vx_load(src2 + VECSZ); - v_src02 = vx_load(src2 + 2*VECSZ); - v_src03 = vx_load(src2 + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } -#endif - for (; i < len; i++) - 
dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i]; -} -template -void vlineSmooth3N121(const FT* const * src, const FT*, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = (FT::WT(src[0][i]) >> 2) + (FT::WT(src[2][i]) >> 2) + (FT::WT(src[1][i]) >> 1); -} -template <> -void vlineSmooth3N121(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - for (; i <= len - 2*VECSZ; i += 2*VECSZ) - { - v_uint32 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; - v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01); - v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03); - v_expand(vx_load((uint16_t*)(src[1]) + i), v_src10, v_src11); - v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); - v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); - v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), - v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); - } -#endif - for (; i < len; i++) - dst[i] = (((uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[2]))[i]) + ((uint32_t)(((uint16_t*)(src[1]))[i]) << 1)) + (1 << 9)) >> 10; -} -template -void vlineSmooth5N(const FT* const * src, const FT* m, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i]; -} -template <> -void vlineSmooth5N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - const int VECSZ = v_uint16::nlanes; - if (len >= 4 * VECSZ) - { - ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * 
ufixedpoint16((uint8_t)128) }; - v_int32 v_128_4 = vx_setall_s32(*((int32_t*)val)); - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int16 v_mul01 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - v_int16 v_mul23 = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m + 2)))); - v_int16 v_mul4 = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + 4)))); - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 v_res6 = v_dotprod(v_tmp0, v_mul01); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul01); - - const int16_t* src2 = (const int16_t*)src[2] + i; - const int16_t* src3 = (const int16_t*)src[3] + i; - v_src00 = vx_load(src2); - v_src01 = vx_load(src2 + VECSZ); - v_src02 = vx_load(src2 + 2*VECSZ); - v_src03 = vx_load(src2 + 3*VECSZ); - v_src10 = vx_load(src3); - v_src11 = vx_load(src3 + VECSZ); - v_src12 = vx_load(src3 + 2*VECSZ); - v_src13 = 
vx_load(src3 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul23); - v_res1 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul23); - v_res3 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul23); - v_res5 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul23); - v_res7 += v_dotprod(v_tmp1, v_mul23); - - v_int32 v_resj0, v_resj1; - const int16_t* src4 = (const int16_t*)src[4] + i; - v_src00 = vx_load(src4); - v_src01 = vx_load(src4 + VECSZ); - v_src02 = vx_load(src4 + 2*VECSZ); - v_src03 = vx_load(src4 + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } - } -#endif - for (; i < len; i++) - dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i]; -} -template -void vlineSmooth5N14641(const FT* const * 
src, const FT*, int, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - dst[i] = (FT::WT(src[2][i])*(uint8_t)6 + ((FT::WT(src[1][i]) + FT::WT(src[3][i]))<<2) + FT::WT(src[0][i]) + FT::WT(src[4][i])) >> 4; -} -template <> -void vlineSmooth5N14641(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - v_uint32 v_6 = vx_setall_u32(6); - const int VECSZ = v_uint16::nlanes; - for (; i <= len - 2*VECSZ; i += 2*VECSZ) - { - v_uint32 v_src00, v_src10, v_src20, v_src30, v_src40; - v_uint32 v_src01, v_src11, v_src21, v_src31, v_src41; - v_uint32 v_src02, v_src12, v_src22, v_src32, v_src42; - v_uint32 v_src03, v_src13, v_src23, v_src33, v_src43; - v_expand(vx_load((uint16_t*)(src[0]) + i), v_src00, v_src01); - v_expand(vx_load((uint16_t*)(src[0]) + i + VECSZ), v_src02, v_src03); - v_expand(vx_load((uint16_t*)(src[1]) + i), v_src10, v_src11); - v_expand(vx_load((uint16_t*)(src[1]) + i + VECSZ), v_src12, v_src13); - v_expand(vx_load((uint16_t*)(src[2]) + i), v_src20, v_src21); - v_expand(vx_load((uint16_t*)(src[2]) + i + VECSZ), v_src22, v_src23); - v_expand(vx_load((uint16_t*)(src[3]) + i), v_src30, v_src31); - v_expand(vx_load((uint16_t*)(src[3]) + i + VECSZ), v_src32, v_src33); - v_expand(vx_load((uint16_t*)(src[4]) + i), v_src40, v_src41); - v_expand(vx_load((uint16_t*)(src[4]) + i + VECSZ), v_src42, v_src43); - v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), - v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, - v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); - } -#endif - for (; i < len; i++) - dst[i] = ((uint32_t)(((uint16_t*)(src[2]))[i]) * 6 + - (((uint32_t)(((uint16_t*)(src[1]))[i]) + (uint32_t)(((uint16_t*)(src[3]))[i])) << 2) + - (uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[4]))[i]) + (1 << 11)) >> 12; -} -template -void 
vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len) -{ - for (int i = 0; i < len; i++) - { - typename FT::WT val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - val = val + m[j] * src[j][i]; - dst[i] = val; - } -} -template <> -void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint16 msum = m[0] + m[1]; - for (int j = 2; j < n; j++) - msum = msum + m[j]; - ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; - v_int16 v_tmp0, v_tmp1; - - v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)m))); - - const int16_t* src0 = (const int16_t*)src[0] + i; - const int16_t* src1 = (const int16_t*)src[1] + i; - v_src00 = vx_load(src0); - v_src01 = vx_load(src0 + VECSZ); - v_src02 = vx_load(src0 + 2*VECSZ); - v_src03 = vx_load(src0 + 3*VECSZ); - v_src10 = vx_load(src1); - v_src11 = vx_load(src1 + VECSZ); - v_src12 = vx_load(src1 + 2*VECSZ); - v_src13 = vx_load(src1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_int32 v_res0 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res1 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_int32 v_res2 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res3 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_int32 v_res4 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res5 = v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_int32 
v_res6 = v_dotprod(v_tmp0, v_mul); - v_int32 v_res7 = v_dotprod(v_tmp1, v_mul); - - int j = 2; - for (; j < n - 1; j+=2) - { - v_mul = v_reinterpret_as_s16(vx_setall_u32(*((uint32_t*)(m+j)))); - - const int16_t* srcj0 = (const int16_t*)src[j] + i; - const int16_t* srcj1 = (const int16_t*)src[j + 1] + i; - v_src00 = vx_load(srcj0); - v_src01 = vx_load(srcj0 + VECSZ); - v_src02 = vx_load(srcj0 + 2*VECSZ); - v_src03 = vx_load(srcj0 + 3*VECSZ); - v_src10 = vx_load(srcj1); - v_src11 = vx_load(srcj1 + VECSZ); - v_src12 = vx_load(srcj1 + 2*VECSZ); - v_src13 = vx_load(srcj1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul); - v_res3 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); - v_res4 += v_dotprod(v_tmp0, v_mul); - v_res5 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); - v_res6 += v_dotprod(v_tmp0, v_mul); - v_res7 += v_dotprod(v_tmp1, v_mul); - } - if(j < n) - { - v_int32 v_resj0, v_resj1; - v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j)))); - const int16_t* srcj = (const int16_t*)src[j] + i; - v_src00 = vx_load(srcj); - v_src01 = vx_load(srcj + VECSZ); - v_src02 = vx_load(srcj + 2*VECSZ); - v_src03 = vx_load(srcj + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1); - v_res2 += v_resj0; - v_res3 += v_resj1; - v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1); - v_res4 += v_resj0; - v_res5 += v_resj1; - v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1); - v_res6 += v_resj0; - v_res7 += v_resj1; - } - v_res0 += v_128_4; - v_res1 += 
v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); - } -#endif - for (; i < len; i++) - { - ufixedpoint32 val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - { - val = val + m[j] * src[j][i]; - } - dst[i] = val; - } -} -template -void vlineSmoothONa_yzy_a(const FT* const * src, const FT* m, int n, ET* dst, int len) -{ - int pre_shift = n / 2; - for (int i = 0; i < len; i++) - { - typename FT::WT val = m[pre_shift] * src[pre_shift][i]; - for (int j = 0; j < pre_shift; j++) - val = val + m[j] * src[j][i] + m[j] * src[(n - 1 - j)][i]; - dst[i] = val; - } -} -template <> -void vlineSmoothONa_yzy_a(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) -{ - int i = 0; -#if CV_SIMD - int pre_shift = n / 2; - static const v_int16 v_128 = v_reinterpret_as_s16(vx_setall_u16((uint16_t)1 << 15)); - v_int32 v_128_4 = vx_setall_s32(128 << 16); - const int VECSZ = v_uint16::nlanes; - if (len >= VECSZ) - { - ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1]; - for (int j = 1; j < pre_shift; j++) - msum = msum + m[j] + m[n - 1 - j]; - ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; - v_128_4 = vx_setall_s32(*((int32_t*)val)); - } - for (; i <= len - 4*VECSZ; i += 4*VECSZ) - { - v_int16 v_src00, v_src10, v_src20, v_src30, v_src01, v_src11, v_src21, v_src31; - v_int32 v_res0, v_res1, v_res2, v_res3, v_res4, v_res5, v_res6, v_res7; - v_int16 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7; - - v_int16 v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + pre_shift)))); - const int16_t* srcp = (const int16_t*)src[pre_shift] + i; - v_src00 = 
vx_load(srcp); - v_src10 = vx_load(srcp + VECSZ); - v_src20 = vx_load(srcp + 2*VECSZ); - v_src30 = vx_load(srcp + 3*VECSZ); - v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_res0, v_res1); - v_mul_expand(v_add_wrap(v_src10, v_128), v_mul, v_res2, v_res3); - v_mul_expand(v_add_wrap(v_src20, v_128), v_mul, v_res4, v_res5); - v_mul_expand(v_add_wrap(v_src30, v_128), v_mul, v_res6, v_res7); - - int j = 0; - for (; j < pre_shift; j++) - { - v_mul = v_reinterpret_as_s16(vx_setall_u16(*((uint16_t*)(m + j)))); - - const int16_t* srcj0 = (const int16_t*)src[j] + i; - const int16_t* srcj1 = (const int16_t*)src[n - 1 - j] + i; - v_src00 = vx_load(srcj0); - v_src10 = vx_load(srcj0 + VECSZ); - v_src20 = vx_load(srcj0 + 2*VECSZ); - v_src30 = vx_load(srcj0 + 3*VECSZ); - v_src01 = vx_load(srcj1); - v_src11 = vx_load(srcj1 + VECSZ); - v_src21 = vx_load(srcj1 + 2*VECSZ); - v_src31 = vx_load(srcj1 + 3*VECSZ); - v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); - v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3); - v_res2 += v_dotprod(v_tmp2, v_mul); - v_res3 += v_dotprod(v_tmp3, v_mul); - v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5); - v_res4 += v_dotprod(v_tmp4, v_mul); - v_res5 += v_dotprod(v_tmp5, v_mul); - v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7); - v_res6 += v_dotprod(v_tmp6, v_mul); - v_res7 += v_dotprod(v_tmp7, v_mul); - } - - v_res0 += v_128_4; - v_res1 += v_128_4; - v_res2 += v_128_4; - v_res3 += v_128_4; - v_res4 += v_128_4; - v_res5 += v_128_4; - v_res6 += v_128_4; - v_res7 += v_128_4; - - v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); - v_store(dst + i + 2*VECSZ, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), - v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, 
v_res7)))); - } -#endif - for (; i < len; i++) - { - ufixedpoint32 val = m[0] * src[0][i]; - for (int j = 1; j < n; j++) - { - val = val + m[j] * src[j][i]; - } - dst[i] = val; - } -} -template -class fixedSmoothInvoker : public ParallelLoopBody -{ -public: - fixedSmoothInvoker(const ET* _src, size_t _src_stride, ET* _dst, size_t _dst_stride, - int _width, int _height, int _cn, const FT* _kx, int _kxlen, const FT* _ky, int _kylen, int _borderType) : ParallelLoopBody(), - src(_src), dst(_dst), src_stride(_src_stride), dst_stride(_dst_stride), - width(_width), height(_height), cn(_cn), kx(_kx), ky(_ky), kxlen(_kxlen), kylen(_kylen), borderType(_borderType) - { - if (kxlen == 1) - { - if (kx[0] == FT::one()) - hlineSmoothFunc = hlineSmooth1N1; - else - hlineSmoothFunc = hlineSmooth1N; - } - else if (kxlen == 3) - { - if (kx[0] == (FT::one()>>2)&&kx[1] == (FT::one()>>1)&&kx[2] == (FT::one()>>2)) - hlineSmoothFunc = hlineSmooth3N121; - else if ((kx[0] - kx[2]).isZero()) - hlineSmoothFunc = hlineSmooth3Naba; - else - hlineSmoothFunc = hlineSmooth3N; - } - else if (kxlen == 5) - { - if (kx[2] == (FT::one()*(uint8_t)3>>3) && - kx[1] == (FT::one()>>2) && kx[3] == (FT::one()>>2) && - kx[0] == (FT::one()>>4) && kx[4] == (FT::one()>>4)) - hlineSmoothFunc = hlineSmooth5N14641; - else if (kx[0] == kx[4] && kx[1] == kx[3]) - hlineSmoothFunc = hlineSmooth5Nabcba; - else - hlineSmoothFunc = hlineSmooth5N; - } - else if (kxlen % 2 == 1) - { - hlineSmoothFunc = hlineSmoothONa_yzy_a; - for (int i = 0; i < kxlen / 2; i++) - if (!(kx[i] == kx[kxlen - 1 - i])) - { - hlineSmoothFunc = hlineSmooth; - break; - } - } - else - hlineSmoothFunc = hlineSmooth; - if (kylen == 1) - { - if (ky[0] == FT::one()) - vlineSmoothFunc = vlineSmooth1N1; - else - vlineSmoothFunc = vlineSmooth1N; - } - else if (kylen == 3) - { - if (ky[0] == (FT::one() >> 2) && ky[1] == (FT::one() >> 1) && ky[2] == (FT::one() >> 2)) - vlineSmoothFunc = vlineSmooth3N121; - else - vlineSmoothFunc = vlineSmooth3N; - } - else if 
(kylen == 5) - { - if (ky[2] == (FT::one() * (uint8_t)3 >> 3) && - ky[1] == (FT::one() >> 2) && ky[3] == (FT::one() >> 2) && - ky[0] == (FT::one() >> 4) && ky[4] == (FT::one() >> 4)) - vlineSmoothFunc = vlineSmooth5N14641; - else - vlineSmoothFunc = vlineSmooth5N; - } - else if (kylen % 2 == 1) - { - vlineSmoothFunc = vlineSmoothONa_yzy_a; - for (int i = 0; i < kylen / 2; i++) - if (!(ky[i] == ky[kylen - 1 - i])) - { - vlineSmoothFunc = vlineSmooth; - break; - } - } - else - vlineSmoothFunc = vlineSmooth; - } - virtual void operator() (const Range& range) const CV_OVERRIDE - { - AutoBuffer _buf(width*cn*kylen); - FT* buf = _buf.data(); - AutoBuffer _ptrs(kylen*2); - FT** ptrs = _ptrs.data(); - - if (kylen == 1) - { - ptrs[0] = buf; - for (int i = range.start; i < range.end; i++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[0], width, borderType); - vlineSmoothFunc(ptrs, ky, kylen, dst + i * dst_stride, width*cn); - } - } - else if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped - { - int pre_shift = kylen / 2; - int post_shift = kylen - pre_shift - 1; - // First line evaluation - int idst = range.start; - int ifrom = max(0, idst - pre_shift); - int ito = idst + post_shift + 1; - int i = ifrom; - int bufline = 0; - for (; i < min(ito, height); i++, bufline++) - { - ptrs[bufline+kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - for (; i < ito; i++, bufline++) - { - int src_idx = borderInterpolate(i, height, borderType); - if (src_idx < ifrom) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - else - { - ptrs[bufline + kylen] = ptrs[bufline] = ptrs[src_idx - ifrom]; - } - } - for (int j = idst - pre_shift; j < 0; j++) - { - int src_idx = borderInterpolate(j, height, 
borderType); - if (src_idx >= ito) - { - ptrs[2*kylen + j] = ptrs[kylen + j] = buf + (kylen + j) * width*cn; - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[kylen + j], width, borderType); - } - else - { - ptrs[2*kylen + j] = ptrs[kylen + j] = ptrs[src_idx]; - } - } - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); idst++; - - // border mode dependent part evaluation - // i points to last src row to evaluate in convolution - bufline %= kylen; ito = min(height, range.end + post_shift); - for (; i < min(kylen, ito); i++, idst++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - // Points inside the border - for (; i < ito; i++, idst++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - // Points that could fall below border - for (; i < range.end + post_shift; i++, idst++) - { - int src_idx = borderInterpolate(i, height, borderType); - if ((i - src_idx) > kylen) - hlineSmoothFunc(src + src_idx * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - else - ptrs[bufline + kylen] = ptrs[bufline] = ptrs[(bufline + kylen - (i - src_idx)) % kylen]; - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - } - else - { - int pre_shift = kylen / 2; - int post_shift = kylen - pre_shift - 1; - // First line evaluation - int idst = range.start; - int ifrom = idst - pre_shift; - int ito = min(idst + post_shift + 1, height); - int i = max(0, ifrom); - int bufline = 0; - for (; i < ito; i++, bufline++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - 
hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - } - - if (bufline == 1) - vlineSmooth1N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else if (bufline == 3) - vlineSmooth3N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else if (bufline == 5) - vlineSmooth5N(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky - min(ifrom, 0), bufline, dst + idst*dst_stride, width*cn); - idst++; - - // border mode dependent part evaluation - // i points to last src row to evaluate in convolution - bufline %= kylen; ito = min(height, range.end + post_shift); - for (; i < min(kylen, ito); i++, idst++) - { - ptrs[bufline + kylen] = ptrs[bufline] = buf + bufline * width*cn; - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline++; - if (bufline == 3) - vlineSmooth3N(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - else if (bufline == 5) - vlineSmooth5N(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky + kylen - bufline, i + 1, dst + idst*dst_stride, width*cn); - bufline %= kylen; - } - // Points inside the border - if (i - max(0, ifrom) >= kylen) - { - for (; i < ito; i++, idst++) - { - hlineSmoothFunc(src + i * src_stride, cn, kx, kxlen, ptrs[bufline], width, borderType); - bufline = (bufline + 1) % kylen; - vlineSmoothFunc(ptrs + bufline, ky, kylen, dst + idst*dst_stride, width*cn); - } - - // Points that could fall below border - // i points to first src row to evaluate in convolution - bufline = (bufline + 1) % kylen; - for (i = idst - pre_shift; i < range.end - pre_shift; i++, idst++, bufline++) - if (height - i == 3) - vlineSmooth3N(ptrs + bufline, ky, height - i, dst + idst*dst_stride, width*cn); - else if (height - i == 5) - vlineSmooth5N(ptrs + bufline, ky, height - i, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs 
+ bufline, ky, height - i, dst + idst*dst_stride, width*cn); - } - else - { - // i points to first src row to evaluate in convolution - for (i = idst - pre_shift; i < min(range.end - pre_shift, 0); i++, idst++) - if (height == 3) - vlineSmooth3N(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - else if (height == 5) - vlineSmooth5N(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs, ky - i, height, dst + idst*dst_stride, width*cn); - for (; i < range.end - pre_shift; i++, idst++) - if (height - i == 3) - vlineSmooth3N(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - else if (height - i == 5) - vlineSmooth5N(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - else - vlineSmooth(ptrs + i - max(0, ifrom), ky, height - i, dst + idst*dst_stride, width*cn); - } - } - } -private: - const ET* src; - ET* dst; - size_t src_stride, dst_stride; - int width, height, cn; - const FT *kx, *ky; - int kxlen, kylen; - int borderType; - void(*hlineSmoothFunc)(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType); - void(*vlineSmoothFunc)(const FT* const * src, const FT* m, int n, ET* dst, int len); - - fixedSmoothInvoker(const fixedSmoothInvoker&); - fixedSmoothInvoker& operator=(const fixedSmoothInvoker&); -}; - static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } @@ -2149,9 +198,7 @@ static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); } -} - -cv::Ptr cv::createGaussianFilter( int type, Size ksize, +Ptr createGaussianFilter( int type, Size ksize, double sigma1, double sigma2, int borderType ) { @@ -2161,8 +208,6 @@ cv::Ptr cv::createGaussianFilter( int type, Size ksize, return 
createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); } -namespace cv -{ #ifdef HAVE_OPENCL static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, @@ -2431,11 +476,10 @@ static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, #endif } #endif -} -void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType ) +void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, + int borderType) { CV_INSTRUMENT_REGION(); @@ -2497,14 +541,16 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); if (src.data == dst.data) src = src.clone(); - fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); - parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); + CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType), + CV_CPU_DISPATCH_MODES_ALL); return; } sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } +} // namespace + ////////////////////////////////////////////////////////////////////////////////////////// CV_IMPL void diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp index 909ffa919c..4f52bc0d80 100644 --- a/modules/imgproc/src/smooth.simd.hpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -46,120 +46,28 @@ #include #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" - -#include "opencv2/core/openvx/ovx_defs.hpp" #include "filter.hpp" -#include "fixedpoint.inl.hpp" - -/****************************************************************************************\ - Gaussian 
Blur -\****************************************************************************************/ - -cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) -{ - CV_Assert(n > 0); - const int SMALL_GAUSSIAN_SIZE = 7; - static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = - { - {1.f}, - {0.25f, 0.5f, 0.25f}, - {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, - {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f} - }; - - const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ? - small_gaussian_tab[n>>1] : 0; - - CV_Assert( ktype == CV_32F || ktype == CV_64F ); - Mat kernel(n, 1, ktype); - float* cf = kernel.ptr(); - double* cd = kernel.ptr(); - - double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8; - double scale2X = -0.5/(sigmaX*sigmaX); - double sum = 0; - - int i; - for( i = 0; i < n; i++ ) - { - double x = i - (n-1)*0.5; - double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x); - if( ktype == CV_32F ) - { - cf[i] = (float)t; - sum += cf[i]; - } - else - { - cd[i] = t; - sum += cd[i]; - } - } - - CV_DbgAssert(fabs(sum) > 0); - sum = 1./sum; - for( i = 0; i < n; i++ ) - { - if( ktype == CV_32F ) - cf[i] = (float)(cf[i]*sum); - else - cd[i] *= sum; - } - - return kernel; -} +#include "opencv2/core/softfloat.hpp" namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType); -template -static std::vector getFixedpointGaussianKernel( int n, double sigma ) -{ - if (sigma <= 0) - { - if(n == 1) - return std::vector(1, softdouble(1.0)); - else if(n == 3) - { - T v3[] = { softdouble(0.25), softdouble(0.5), softdouble(0.25) }; - return std::vector(v3, v3 + 3); - } - else if(n == 5) - { - T v5[] = { softdouble(0.0625), softdouble(0.25), softdouble(0.375), softdouble(0.25), 
softdouble(0.0625) }; - return std::vector(v5, v5 + 5); - } - else if(n == 7) - { - T v7[] = { softdouble(0.03125), softdouble(0.109375), softdouble(0.21875), softdouble(0.28125), softdouble(0.21875), softdouble(0.109375), softdouble(0.03125) }; - return std::vector(v7, v7 + 7); - } - } +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#if defined(CV_CPU_BASELINE_MODE) +// included in dispatch.cpp +#else +#include "fixedpoint.inl.hpp" +#endif - softdouble sigmaX = sigma > 0 ? softdouble(sigma) : mulAdd(softdouble(n),softdouble(0.15),softdouble(0.35));// softdouble(((n-1)*0.5 - 1)*0.3 + 0.8) - softdouble scale2X = softdouble(-0.5*0.25)/(sigmaX*sigmaX); - std::vector values(n); - softdouble sum(0.); - for(int i = 0, x = 1 - n; i < n; i++, x+=2 ) - { - // x = i - (n - 1)*0.5 - // t = std::exp(scale2X*x*x) - values[i] = exp(softdouble(x*x)*scale2X); - sum += values[i]; - } - sum = softdouble::one()/sum; - - std::vector kernel(n); - for(int i = 0; i < n; i++ ) - { - kernel[i] = values[i] * sum; - } - - return kernel; -}; +namespace { template void hlineSmooth1N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int) @@ -2119,418 +2027,27 @@ private: fixedSmoothInvoker& operator=(const fixedSmoothInvoker&); }; -static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } -template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } +} // namespace anon -template -static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, - double sigma1, double sigma2 ) -{ - int depth = CV_MAT_DEPTH(type); - if( sigma2 <= 0 ) - sigma2 = sigma1; - - // automatic detection of kernel size from sigma - if( ksize.width <= 0 && sigma1 > 0 ) - ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1; - if( ksize.height <= 0 && sigma2 > 0 ) - ksize.height = cvRound(sigma2*(depth == CV_8U ? 
3 : 4)*2 + 1)|1; - - CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && - ksize.height > 0 && ksize.height % 2 == 1 ); - - sigma1 = std::max( sigma1, 0. ); - sigma2 = std::max( sigma2, 0. ); - - getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F), kx ); - if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) - ky = kx; - else - getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); -} - -} - -cv::Ptr cv::createGaussianFilter( int type, Size ksize, - double sigma1, double sigma2, - int borderType ) -{ - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); -} - -namespace cv -{ -#ifdef HAVE_OPENCL - -static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, - InputArray _kernelX, InputArray _kernelY, int borderType) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - ((ksize.width == 5 && (_src.cols() % 4 == 0)) || - (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) ) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = _kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - - if (ksize.width == 3) - { - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - } - else if (ksize.width == 5) - { - globalsize[0] = size.width / 4; - globalsize[1] = size.height / 1; - } - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - char build_opts[1024]; - 
sprintf(build_opts, "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED], - ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); - - ocl::Kernel kernel; - - if (ksize.width == 3) - kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); - else if (ksize.width == 5) - kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts); - - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? 
NULL : localsize, false); -} - -#endif - -#ifdef HAVE_OPENVX - -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 320 * 240; } -} -static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType) -{ - if (sigma2 <= 0) - sigma2 = sigma1; - // automatic detection of kernel size from sigma - if (ksize.width <= 0 && sigma1 > 0) - ksize.width = cvRound(sigma1*6 + 1) | 1; - if (ksize.height <= 0 && sigma2 > 0) - ksize.height = cvRound(sigma2*6 + 1) | 1; - - if (_src.type() != CV_8UC1 || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3) - return false; - - sigma1 = std::max(sigma1, 0.); - sigma2 = std::max(sigma2, 0.); - - if (!(sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) || !(sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - 
ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - return true; -} - -#endif - -#ifdef HAVE_IPP -// IW 2017u2 has bug which doesn't allow use of partial inMem with tiling -#if IPP_DISABLE_GAUSSIANBLUR_PARALLEL -#define IPP_GAUSSIANBLUR_PARALLEL 0 -#else -#define IPP_GAUSSIANBLUR_PARALLEL 1 -#endif - -#ifdef HAVE_IPP_IW - -class ipp_gaussianBlurParallel: public ParallelLoopBody -{ -public: - ipp_gaussianBlurParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, int kernelSize, float sigma, ::ipp::IwiBorderType &border, bool *pOk): - m_src(src), m_dst(dst), m_kernelSize(kernelSize), m_sigma(sigma), m_border(border), m_pOk(pOk) { - *m_pOk = true; - } - ~ipp_gaussianBlurParallel() - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - CV_INSTRUMENT_REGION_IPP(); - - if(!*m_pOk) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, m_src, m_dst, m_kernelSize, m_sigma, ::ipp::IwDefault(), m_border, tile); - } - catch(const ::ipp::IwException &) - { - *m_pOk = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - int m_kernelSize; - float m_sigma; - ::ipp::IwiBorderType &m_border; - - volatile bool *m_pOk; - const ipp_gaussianBlurParallel& operator= (const ipp_gaussianBlurParallel&); -}; - -#endif - -static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType ) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__)) - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); 
CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; // bug on ia32 -#else - if(sigma1 != sigma2) - return false; - - if(sigma1 < FLT_EPSILON) - return false; - - if(ksize.width != ksize.height) - return false; - - // Acquire data and begin processing - try - { - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize)); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_GAUSSIANBLUR_PARALLEL && threads > 1) { - bool ok; - ipp_gaussianBlurParallel invoker(iwSrc, iwDst, ksize.width, (float) sigma1, ippBorder, &ok); - - if(!ok) - return false; - const Range range(0, (int) iwDst.m_size.height); - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, iwSrc, iwDst, ksize.width, sigma1, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#endif -#else - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; -#endif -} -#endif -} - -void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType ) +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType) { CV_INSTRUMENT_REGION(); - int type = _src.type(); - Size size = _src.size(); - _dst.create( size, type ); - - if( (borderType & ~BORDER_ISOLATED) != BORDER_CONSTANT && - ((borderType & BORDER_ISOLATED) != 0 || !_src.getMat().isSubmatrix()) ) + CV_Assert(src.depth() == CV_8U && ((borderType & BORDER_ISOLATED) || 
!src.isSubmatrix())); + fixedSmoothInvoker invoker( + src.ptr(), src.step1(), + dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), + (const ufixedpoint16*)fkx, fkx_size, (const ufixedpoint16*)fky, fky_size, + borderType & ~BORDER_ISOLATED); { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - if( ksize.width == 1 && ksize.height == 1 ) - { - _src.copyTo(_dst); - return; - } - - bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && - ((ksize.width == 3 && ksize.height == 3) || - (ksize.width == 5 && ksize.height == 5)) && - _src.rows() > ksize.height && _src.cols() > ksize.width); - CV_UNUSED(useOpenCL); - - int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType & BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - sigma1, sigma2, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) - - CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); - - if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) - { - std::vector fkx, fky; - createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); - if (src.data == dst.data) - src = src.clone(); - 
fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); + // TODO AVX guard (external call) parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); - return; } - - sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } -////////////////////////////////////////////////////////////////////////////////////////// - -CV_IMPL void -cvSmooth( const void* srcarr, void* dstarr, int smooth_type, - int param1, int param2, double param3, double param4 ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0; - - CV_Assert( dst.size() == src.size() && - (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) ); - - if( param2 <= 0 ) - param2 = param1; - - if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE ) - cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1), - smooth_type == CV_BLUR, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_GAUSSIAN ) - cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_MEDIAN ) - cv::medianBlur( src, dst, param1 ); - else - cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE ); - - if( dst.data != dst0.data ) - CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" ); -} - -/* End of file. 
*/ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From ce3c92eb1f4c38989ad6622e62fd1fb3a2cb3140 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 19:58:48 +0000 Subject: [PATCH 19/21] imgproc: dispatch bilateral_filter --- modules/imgproc/CMakeLists.txt | 1 + .../imgproc/src/bilateral_filter.dispatch.cpp | 975 +----------------- modules/imgproc/src/bilateral_filter.simd.hpp | 394 +------ 3 files changed, 51 insertions(+), 1319 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d28d6b9046..9731694e59 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,5 +1,6 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(bilateral_filter SSE2 AVX2) ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp index e9181f2182..a27ebb18f5 100644 --- a/modules/imgproc/src/bilateral_filter.dispatch.cpp +++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp @@ -48,493 +48,14 @@ #include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_imgproc.hpp" +#include "bilateral_filter.simd.hpp" +#include "bilateral_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + /****************************************************************************************\ Bilateral Filtering \****************************************************************************************/ -namespace cv -{ - -class BilateralFilter_8u_Invoker : - public ParallelLoopBody -{ -public: - BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk, - int* _space_ofs, float *_space_weight, float *_color_weight) : - temp(&_temp), dest(&_dest), 
radius(_radius), - maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, cn = dest->channels(), k; - Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const uchar* sptr = temp->ptr(i+radius) + radius*cn; - uchar* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); - k = 0; - for(; k <= maxk-4; k+=4) - { - const uchar* ksptr0 = sptr + space_ofs[k]; - const uchar* ksptr1 = sptr + space_ofs[k+1]; - const uchar* ksptr2 = sptr + space_ofs[k+2]; - const uchar* ksptr3 = sptr + space_ofs[k+3]; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_uint32 rval = vx_load_expand_q(sptr + j); - - v_uint32 val = vx_load_expand_q(ksptr0 + j); - v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)); - - val = vx_load_expand_q(ksptr1 + j); - w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - val = vx_load_expand_q(ksptr2 + j); - w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - val = vx_load_expand_q(ksptr3 + 
j); - w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - v_wsum += w; - v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum + j, v_sum); - } -#endif -#if CV_SIMD128 - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++) - { -#if CV_SIMD128 - v_uint32x4 rval = v_setall_u32(sptr[j]); - v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); - wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w); -#else - int rval = sptr[j]; - - int val = ksptr0[j]; - float w = space_weight[k] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr1[j]; - w = space_weight[k+1] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr2[j]; - w = space_weight[k+2] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; - - val = ksptr3[j]; - w = space_weight[k+3] * color_weight[std::abs(val - rval)]; - wsum[j] += w; - sum[j] += val * w; -#endif - } - } - for(; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_uint32 val = vx_load_expand_q(ksptr + j); - v_float32 w = kweight * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, vx_load_expand_q(sptr + j)))); - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - int val = ksptr[j]; - float w = space_weight[k] * color_weight[std::abs(val - sptr[j])]; - wsum[j] += w; - sum[j] += val * w; - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - 
2*v_float32::nlanes; j += 2*v_float32::nlanes) - v_pack_u_store(dptr + j, v_pack(v_round(vx_load_aligned(sum + j ) / vx_load_aligned(wsum + j )), - v_round(vx_load_aligned(sum + j + v_float32::nlanes) / vx_load_aligned(wsum + j + v_float32::nlanes)))); -#endif - for (; j < size.width; j++) - { - // overflow is not possible here => there is no need to use cv::saturate_cast - CV_DbgAssert(fabs(wsum[j]) > 0); - dptr[j] = (uchar)cvRound(sum[j]/wsum[j]); - } - } - else - { - assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); - k = 0; - for(; k <= maxk-4; k+=4) - { - const uchar* ksptr0 = sptr + space_ofs[k]; - const uchar* ksptr1 = sptr + space_ofs[k+1]; - const uchar* ksptr2 = sptr + space_ofs[k+2]; - const uchar* ksptr3 = sptr + space_ofs[k+3]; - const uchar* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes, - ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes) - { - v_uint8 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(rsptr, rb, rg, rr); - - v_load_deinterleave(ksptr0, kb, kg, kr); - v_uint16 val0, val1, val2, val3, val4; - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_uint32 vall, valh; - v_expand(val0, vall, 
valh); - v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * 
v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); - - v_load_deinterleave(ksptr1, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - - v_load_deinterleave(ksptr2, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 
= kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - - v_load_deinterleave(ksptr3, kb, kg, kr); - v_expand(v_absdiff(kb, rb), val0, val1); - v_expand(v_absdiff(kg, rg), val2, val3); - val0 += val2; val1 += val3; - v_expand(v_absdiff(kr, rr), val2, val3); - val0 += val2; val1 += val3; - - v_expand(val0, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_expand(kb, val0, val2); - v_expand(val0, vall, valh); - v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_expand(kg, val0, val3); - v_expand(val0, vall, valh); - v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_expand(kr, val0, val4); - v_expand(val0, vall, valh); - v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); - 
v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - - v_expand(val1, vall, valh); - w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); - w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); - v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); - v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); - v_expand(val2, vall, valh); - v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); - v_expand(val3, vall, valh); - v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); - v_expand(val4, vall, valh); - v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); - v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); - } -#endif -#if CV_SIMD128 - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) - { -#if CV_SIMD128 - v_uint32x4 rb = v_setall_u32(rsptr[0]); - v_uint32x4 rg = v_setall_u32(rsptr[1]); - v_uint32x4 rr = v_setall_u32(rsptr[2]); - v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); - v_uint32x4 g(ksptr0[1], ksptr1[1], 
ksptr2[1], ksptr3[1]); - v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr))); - wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w); - sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w); - sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w); -#else - int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - - int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; - float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; - w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; - w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - - b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; - w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; -#endif - } - } - for(; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - const uchar* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, ksptr += 3*v_uint8::nlanes, rsptr += 3*v_uint8::nlanes) - { - v_uint8 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_uint16 b_l, b_h, g_l, g_h, r_l, r_h; - v_expand(v_absdiff(kb, rb), b_l, b_h); - v_expand(v_absdiff(kg, rg), g_l, g_h); - v_expand(v_absdiff(kr, rr), r_l, r_h); - - v_uint32 val0, val1, val2, 
val3; - v_expand(b_l + g_l + r_l, val0, val1); - v_expand(b_h + g_h + r_h, val2, val3); - - v_expand(kb, b_l, b_h); - v_expand(kg, g_l, g_h); - v_expand(kr, r_l, r_h); - - v_float32 w0 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val0)); - v_float32 w1 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val1)); - v_float32 w2 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val2)); - v_float32 w3 = kweight * v_lut(color_weight, v_reinterpret_as_s32(val3)); - v_store_aligned(wsum + j , w0 + vx_load_aligned(wsum + j)); - v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); - v_store_aligned(wsum + j + 2*v_float32::nlanes, w2 + vx_load_aligned(wsum + j + 2*v_float32::nlanes)); - v_store_aligned(wsum + j + 3*v_float32::nlanes, w3 + vx_load_aligned(wsum + j + 3*v_float32::nlanes)); - v_expand(b_l, val0, val1); - v_expand(b_h, val2, val3); - v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); - v_store_aligned(sum_b + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_b + j + 2*v_float32::nlanes))); - v_store_aligned(sum_b + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_b + j + 3*v_float32::nlanes))); - v_expand(g_l, val0, val1); - v_expand(g_h, val2, val3); - v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); - v_store_aligned(sum_g + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_g + j + 2*v_float32::nlanes))); - v_store_aligned(sum_g + j + 
3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_g + j + 3*v_float32::nlanes))); - v_expand(r_l, val0, val1); - v_expand(r_h, val2, val3); - v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(val0)), w0, vx_load_aligned(sum_r + j))); - v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val1)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); - v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val2)), w2, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); - v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(val3)), w3, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); - } -#endif - for(; j < size.width; j++, ksptr += 3, rsptr += 3) - { - int b = ksptr[0], g = ksptr[1], r = ksptr[2]; - float w = space_weight[k]*color_weight[std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])]; - wsum[j] += w; - sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; - } - } - j = 0; -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - for(; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, dptr += 3*v_uint8::nlanes) - { - v_float32 w0 = v_one / vx_load_aligned(wsum + j); - v_float32 w1 = v_one / vx_load_aligned(wsum + j + v_float32::nlanes); - v_float32 w2 = v_one / vx_load_aligned(wsum + j + 2*v_float32::nlanes); - v_float32 w3 = v_one / vx_load_aligned(wsum + j + 3*v_float32::nlanes); - - v_store_interleave(dptr, v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_b + j)), - v_round(w1 * vx_load_aligned(sum_b + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_b + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_b + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_g + j)), - v_round(w1 * vx_load_aligned(sum_g + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_g + j + 2*v_float32::nlanes)), - v_round(w3 
* vx_load_aligned(sum_g + j + 3*v_float32::nlanes)))), - v_pack_u(v_pack(v_round(w0 * vx_load_aligned(sum_r + j)), - v_round(w1 * vx_load_aligned(sum_r + j + v_float32::nlanes))), - v_pack(v_round(w2 * vx_load_aligned(sum_r + j + 2*v_float32::nlanes)), - v_round(w3 * vx_load_aligned(sum_r + j + 3*v_float32::nlanes))))); - } -#endif - for(; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) > 0); - wsum[j] = 1.f/wsum[j]; - *(dptr++) = (uchar)cvRound(sum_b[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_g[j]*wsum[j]); - *(dptr++) = (uchar)cvRound(sum_r[j]*wsum[j]); - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - const Mat *temp; - Mat *dest; - int radius, maxk, *space_ofs; - float *space_weight, *color_weight; -}; +namespace cv { #ifdef HAVE_OPENCL @@ -542,6 +63,7 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, double sigma_color, double sigma_space, int borderType) { + CV_INSTRUMENT_REGION(); #ifdef __ANDROID__ if (ocl::Device::getDefault().isNVidia()) return false; @@ -628,16 +150,18 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; return k.run(2, globalsize, NULL, false); } - #endif + + static void bilateralFilter_8u( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ) { + CV_INSTRUMENT_REGION(); + int cn = src.channels(); int i, j, maxk, radius; - Size size = src.size(); CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); @@ -686,479 +210,18 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, } } - BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + CV_CPU_DISPATCH(bilateralFilterInvoker_8u, (dst, temp, radius, maxk, space_ofs, space_weight, color_weight), + CV_CPU_DISPATCH_MODES_ALL); } -class 
BilateralFilter_32f_Invoker : - public ParallelLoopBody -{ -public: - - BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs, - const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) : - cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs), - temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT) - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - int i, j, k; - Size size = dest->size(); - - for( i = range.start; i < range.end; i++ ) - { - const float* sptr = temp->ptr(i+radius) + radius*cn; - float* dptr = dest->ptr(i); - - if( cn == 1 ) - { - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH) + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - k = 0; - for(; k <= maxk - 4; k+=4) - { - const float* ksptr0 = sptr + space_ofs[k]; - const float* ksptr1 = sptr + space_ofs[k + 1]; - const float* ksptr2 = sptr + space_ofs[k + 2]; - const float* ksptr3 = sptr + space_ofs[k + 3]; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 rval = vx_load(sptr + j); - - v_float32 val = vx_load(ksptr0 + j); - v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_float32 v_wsum = 
vx_load_aligned(wsum + j) + w; - v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j)); - - val = vx_load(ksptr1 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - val = vx_load(ksptr2 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - val = vx_load(ksptr3 + j); - knan = v_not_nan(val); - alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum = v_muladd(val & knan, w, v_sum); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum + j, v_sum); - } -#endif -#if CV_SIMD128 - v_float32x4 v_one4 = v_setall_f32(1.f); - v_float32x4 sindex4 = v_setall_f32(scale_index); - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++) - { -#if CV_SIMD128 - v_float32x4 rval = v_setall_f32(sptr[j]); - v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); - v_float32x4 knan = v_not_nan(val); - v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan; - v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; - wsum[j] += v_reduce_sum(w); - sum[j] += v_reduce_sum((val & knan) * w); -#else - float rval = sptr[j]; - - float val = ksptr0[j]; - float alpha = std::abs(val - rval) * 
scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr1[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr2[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - - val = ksptr3[j]; - alpha = std::abs(val - rval) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } -#endif - } - } - for(; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 val = vx_load(ksptr + j); - v_float32 rval = vx_load(sptr + j); - v_float32 knan = v_not_nan(val); - v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j))); - } -#endif - for (; j < size.width; j++) - { - float val = ksptr[j]; - float rval = sptr[j]; - float alpha = std::abs(val - rval) * scale_index; - int idx = cvFloor(alpha); - 
alpha -= idx; - if (!cvIsNaN(val)) - { - float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum[j] += val * w; - } - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - { - v_float32 v_val = vx_load(sptr + j); - v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val)))); - } -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) >= 0); - dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f); - } - } - else - { - CV_Assert( cn == 3 ); - AutoBuffer buf(alignSize(size.width, CV_SIMD_WIDTH)*3 + size.width + CV_SIMD_WIDTH - 1); - memset(buf.data(), 0, buf.size() * sizeof(float)); - float *sum_b = alignPtr(buf.data(), CV_SIMD_WIDTH); - float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); - float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); - float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); -#if CV_SIMD - v_float32 v_one = vx_setall_f32(1.f); - v_float32 sindex = vx_setall_f32(scale_index); -#endif - k = 0; - for (; k <= maxk-4; k+=4) - { - const float* ksptr0 = sptr + space_ofs[k]; - const float* ksptr1 = sptr + space_ofs[k+1]; - const float* ksptr2 = sptr + space_ofs[k+2]; - const float* ksptr3 = sptr + space_ofs[k+3]; - const float* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight0 = vx_setall_f32(space_weight[k]); - v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); - v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); - v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes, - ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes) - { - v_float32 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(rsptr, rb, rg, rr); 
- - v_load_deinterleave(ksptr0, kb, kg, kr); - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_float32 v_wsum = vx_load_aligned(wsum + j) + w; - v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)); - v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)); - v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)); - - v_load_deinterleave(ksptr1, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_load_deinterleave(ksptr2, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_load_deinterleave(ksptr3, kb, kg, kr); - knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & 
v_not_nan(rg) & v_not_nan(rr) & knan; - idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_wsum += w; - v_sum_b = v_muladd(kb & knan, w, v_sum_b); - v_sum_g = v_muladd(kg & knan, w, v_sum_g); - v_sum_r = v_muladd(kr & knan, w, v_sum_r); - - v_store_aligned(wsum + j, v_wsum); - v_store_aligned(sum_b + j, v_sum_b); - v_store_aligned(sum_g + j, v_sum_g); - v_store_aligned(sum_r + j, v_sum_r); - } -#endif -#if CV_SIMD128 - v_float32x4 v_one4 = v_setall_f32(1.f); - v_float32x4 sindex4 = v_setall_f32(scale_index); - v_float32x4 kweight4 = v_load(space_weight + k); -#endif - for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) - { -#if CV_SIMD128 - v_float32x4 rb = v_setall_f32(rsptr[0]); - v_float32x4 rg = v_setall_f32(rsptr[1]); - v_float32x4 rr = v_setall_f32(rsptr[2]); - v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); - v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); - v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); - v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32x4 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; - wsum[j] += v_reduce_sum(w); - sum_b[j] += v_reduce_sum((kb & knan) * w); - sum_g[j] += v_reduce_sum((kg & knan) * w); - sum_r[j] += v_reduce_sum((kr & knan) * w); -#else - float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); - - float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; - bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - int idx = 
cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - - b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; - v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k+3] * (r_NAN ? 
1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } -#endif - } - } - for (; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - const float* rsptr = sptr; - j = 0; -#if CV_SIMD - v_float32 kweight = vx_setall_f32(space_weight[k]); - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, ksptr += 3*v_float32::nlanes, rsptr += 3*v_float32::nlanes) - { - v_float32 kb, kg, kr, rb, rg, rr; - v_load_deinterleave(ksptr, kb, kg, kr); - v_load_deinterleave(rsptr, rb, rg, rr); - - v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); - v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; - v_int32 idx = v_trunc(alpha); - alpha -= v_cvt_f32(idx); - - v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; - v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j))); - v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j))); - v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j))); - } -#endif - for (; j < size.width; j++, ksptr += 3, rsptr += 3) - { - float b = ksptr[0], g = ksptr[1], r = ksptr[2]; - bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); - float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; - bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); - float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; - int idx = cvFloor(alpha); - alpha -= idx; - if (!v_NAN) - { - float w = space_weight[k] * (r_NAN ? 
1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); - wsum[j] += w; - sum_b[j] += b*w; - sum_g[j] += g*w; - sum_r[j] += r*w; - } - } - } - j = 0; -#if CV_SIMD - for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes) - { - v_float32 b, g, r; - v_load_deinterleave(sptr, b, g, r); - v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r); - v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask)); - v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w); - } -#endif - for (; j < size.width; j++) - { - CV_DbgAssert(fabs(wsum[j]) >= 0); - float b = *(sptr++); - float g = *(sptr++); - float r = *(sptr++); - if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r)) - { - wsum[j] = 1.f / wsum[j]; - *(dptr++) = sum_b[j] * wsum[j]; - *(dptr++) = sum_g[j] * wsum[j]; - *(dptr++) = sum_r[j] * wsum[j]; - } - else - { - wsum[j] = 1.f / (wsum[j] + 1.f); - *(dptr++) = (sum_b[j] + b) * wsum[j]; - *(dptr++) = (sum_g[j] + g) * wsum[j]; - *(dptr++) = (sum_r[j] + r) * wsum[j]; - } - } - } - } -#if CV_SIMD - vx_cleanup(); -#endif - } - -private: - int cn, radius, maxk, *space_ofs; - const Mat* temp; - Mat *dest; - float scale_index, *space_weight, *expLUT; -}; - - static void bilateralFilter_32f( const Mat& src, Mat& dst, int d, double sigma_color, double sigma_space, int borderType ) { + CV_INSTRUMENT_REGION(); + int cn = src.channels(); int i, j, maxk, radius; double minValSrc=-1, maxValSrc=1; @@ -1166,7 +229,6 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, int kExpNumBins = 0; float lastExpVal = 1.f; float len, scale_index; - Size size = src.size(); CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); @@ -1236,9 +298,8 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, } // parallel_for usage - - BilateralFilter_32f_Invoker body(cn, radius, 
maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + CV_CPU_DISPATCH(bilateralFilterInvoker_32f, (cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT), + CV_CPU_DISPATCH_MODES_ALL); } #ifdef HAVE_IPP @@ -1339,9 +400,7 @@ static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, do } #endif -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, +void bilateralFilter( InputArray _src, OutputArray _dst, int d, double sigmaColor, double sigmaSpace, int borderType ) { @@ -1365,4 +424,4 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, "Bilateral filtering is only implemented for 8u and 32f images" ); } -/* End of file. */ +} // namespace diff --git a/modules/imgproc/src/bilateral_filter.simd.hpp b/modules/imgproc/src/bilateral_filter.simd.hpp index e9181f2182..65abcd4e40 100644 --- a/modules/imgproc/src/bilateral_filter.simd.hpp +++ b/modules/imgproc/src/bilateral_filter.simd.hpp @@ -43,18 +43,25 @@ #include "precomp.hpp" -#include - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" /****************************************************************************************\ Bilateral Filtering \****************************************************************************************/ -namespace cv -{ +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight); +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT); +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace { class BilateralFilter_8u_Invoker : public ParallelLoopBody { @@ -68,6 +75,8 @@ public: virtual void operator() (const Range& range) const 
CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, cn = dest->channels(), k; Size size = dest->size(); @@ -536,161 +545,20 @@ private: float *space_weight, *color_weight; }; -#ifdef HAVE_OPENCL +} // namespace anon -static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, - double sigma_color, double sigma_space, - int borderType) +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight) { -#ifdef __ANDROID__ - if (ocl::Device::getDefault().isNVidia()) - return false; -#endif - - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int i, j, maxk, radius; - - if (depth != CV_8U || cn > 4) - return false; - - if (sigma_color <= 0) - sigma_color = 1; - if (sigma_space <= 0) - sigma_space = 1; - - double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); - double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); - - if ( d <= 0 ) - radius = cvRound(sigma_space * 1.5); - else - radius = d / 2; - radius = MAX(radius, 1); - d = radius * 2 + 1; - - UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; - if (src.u == dst.u) - return false; - - copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - std::vector _space_weight(d * d); - std::vector _space_ofs(d * d); - float * const space_weight = &_space_weight[0]; - int * const space_ofs = &_space_ofs[0]; - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i * i + (double)j * j); - if ( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); - space_ofs[maxk++] = (int)(i * temp.step + j * cn); - } - - char cvt[3][40]; - String cnstr = cn > 1 ? 
format("%d", cn) : ""; - String kernelName("bilateral"); - size_t sizeDiv = 1; - if ((ocl::Device::getDefault().isIntel()) && - (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) - { - //Intel GPU - if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. - { - kernelName = "bilateral_float4"; - sizeDiv = 4; - } - } - ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" - " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", - radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), - ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); - if (k.empty()) - return false; - - Mat mspace_weight(1, d * d, CV_32FC1, space_weight); - Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); - UMat ucolor_weight, uspace_weight, uspace_ofs; - - mspace_weight.copyTo(uspace_weight); - mspace_ofs.copyTo(uspace_ofs); - - k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(uspace_weight), - ocl::KernelArg::PtrReadOnly(uspace_ofs)); - - size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; - return k.run(2, globalsize, NULL, false); -} - -#endif -static void -bilateralFilter_8u( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - Size size = src.size(); - - CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 
) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - std::vector _color_weight(cn*256); - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* color_weight = &_color_weight[0]; - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // initialize color-related bilateral filter coefficients - - for( i = 0; i < 256*cn; i++ ) - color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - { - j = -radius; - - for( ; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*temp.step + j*cn); - } - } - + CV_INSTRUMENT_REGION(); BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } +namespace { + class BilateralFilter_32f_Invoker : public ParallelLoopBody { @@ -705,6 +573,8 @@ public: virtual void operator() (const Range& range) const CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, k; Size size = dest->size(); @@ -1153,216 +1023,18 @@ private: float scale_index, *space_weight, *expLUT; }; +} // namespace anon -static void -bilateralFilter_32f( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) -{ - int cn = src.channels(); - int i, j, maxk, radius; - double minValSrc=-1, maxValSrc=1; - const int kExpNumBinsPerChannel = 1 << 12; - int kExpNumBins = 0; - float lastExpVal = 1.f; - float len, scale_index; - Size size = src.size(); - - CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && 
src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - // compute the min/max range for the input image (even if multichannel) - - minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - src.copyTo(dst); - return; - } - - // temporary copy of the image with borders for easy processing - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - // allocate lookup tables - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // assign a length which is slightly more than needed - len = (float)(maxValSrc - minValSrc) * cn; - kExpNumBins = kExpNumBinsPerChannel * cn; - std::vector _expLUT(kExpNumBins+2); - float* expLUT = &_expLUT[0]; - - scale_index = kExpNumBins/len; - - // initialize the exp LUT - for( i = 0; i < kExpNumBins+2; i++ ) - { - if( lastExpVal > 0.f ) - { - double val = i / scale_index; - expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); - lastExpVal = expLUT[i]; - } - else - expLUT[i] = 0.f; - } - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius || ( i == 0 && j == 0 ) ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); - } - - // parallel_for usage - - BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, 
dst.total()/(double)(1<<16)); -} - -#ifdef HAVE_IPP -#define IPP_BILATERAL_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_bilateralFilterParallel: public ParallelLoopBody -{ -public: - ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): - src(_src), dst(_dst) - { - pOk = _ok; - - radius = _radius; - valSquareSigma = _valSquareSigma; - posSquareSigma = _posSquareSigma; - borderType = _borderType; - - *pOk = true; - } - ~ipp_bilateralFilterParallel() {} - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - if(*pOk == false) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); - } - catch(const ::ipp::IwException &) - { - *pOk = false; - return; - } - } -private: - ::ipp::IwiImage &src; - ::ipp::IwiImage &dst; - - int radius; - Ipp32f valSquareSigma; - Ipp32f posSquareSigma; - ::ipp::IwiBorderType borderType; - - bool *pOk; - const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); -}; -#endif - -static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); - Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); - Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); - - // Acquire data and begin processing - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize(radius); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = 
ippiSuggestThreadsNum(iwDst, 2); - if(IPP_BILATERAL_PARALLEL && threads > 1) { - bool ok = true; - Range range(0, (int)iwDst.m_size.height); - ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); - if(!ok) - return false; - - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); - return false; -#endif -} -#endif - -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, - double sigmaColor, double sigmaSpace, - int borderType ) +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT) { CV_INSTRUMENT_REGION(); - _dst.create( _src.size(), _src.type() ); - - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) - - Mat src = _src.getMat(), dst = _dst.getMat(); - - CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); - - if( src.depth() == CV_8U ) - bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else if( src.depth() == CV_32F ) - bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else - CV_Error( CV_StsUnsupportedFormat, - "Bilateral filtering is only implemented for 8u and 32f images" ); + BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } -/* End of file. 
*/ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 5a01227aa1ccf971d17484ad1e1cc7c73aafd1dc Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 9 Mar 2019 20:19:05 +0000 Subject: [PATCH 20/21] imgproc: dispatch box_filter --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/box_filter.dispatch.cpp | 1307 +------------------ modules/imgproc/src/box_filter.simd.hpp | 548 +------- 3 files changed, 78 insertions(+), 1778 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 9731694e59..d60fa7c58f 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,6 +1,7 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) ocv_add_dispatched_file(bilateral_filter SSE2 AVX2) +ocv_add_dispatched_file(box_filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) diff --git a/modules/imgproc/src/box_filter.dispatch.cpp b/modules/imgproc/src/box_filter.dispatch.cpp index 14f266258f..154ccfd09e 100644 --- a/modules/imgproc/src/box_filter.dispatch.cpp +++ b/modules/imgproc/src/box_filter.dispatch.cpp @@ -50,1119 +50,11 @@ #include "opencv2/core/openvx/ovx_defs.hpp" -namespace cv -{ +#include "box_filter.simd.hpp" +#include "box_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -/****************************************************************************************\ - Box Filter -\****************************************************************************************/ -template -struct RowSum : - public BaseRowFilter -{ - RowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - 
int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - if( ksize == 3 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; - } - } - else if( ksize == 5 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; - } - } - else if( cn == 1 ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i++ ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i++ ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+1] = s; - } - } - else if( cn == 3 ) - { - ST s0 = 0, s1 = 0, s2 = 0; - for( i = 0; i < ksz_cn; i += 3 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - for( i = 0; i < width; i += 3 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - D[i+3] = s0; - D[i+4] = s1; - D[i+5] = s2; - } - } - else if( cn == 4 ) - { - ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; - for( i = 0; i < ksz_cn; i += 4 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - s3 += (ST)S[i+3]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - D[3] = s3; - for( i = 0; i < width; i += 4 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; - D[i+4] = s0; - D[i+5] = s1; - D[i+6] = s2; - D[i+7] = s3; - } - } - else - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i += cn ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+cn] = s; - } - } - } -}; - - -template -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void 
operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - ST* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(ST)); - - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ST* Sp = (const ST*)src[0]; - - for( i = 0; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ST* Sp = (const ST*)src[0]; - const ST* Sm = (const ST*)src[1-ksize]; - T* D = (T*)dst; - if( haveScale ) - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast(s0*_scale); - D[i+1] = saturate_cast(s1*_scale); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast(s0); - D[i+1] = saturate_cast(s1); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - 
sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); - v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); - - v_uint16 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - - v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, 
v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : -public BaseColumnFilter -{ - enum { SHIFT = 23 }; - - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - divDelta = 0; - divScale = 1; - if( scale != 1 ) - { - int d = cvRound(1./scale); - double scalef = ((double)(1 << SHIFT))/d; - divScale = cvFloor(scalef); - scalef -= divScale; - divDelta = d/2; - if( scalef < 0.5 ) - divDelta++; - else - 
divScale++; - } - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const int ds = divScale; - const int dd = divDelta; - ushort* SUM; - const bool haveScale = scale != 1; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(SUM[0])); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - int i = 0; -#if CV_SIMD - for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - const ushort* Sm = (const ushort*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_uint32 _ds4 = vx_setall_u32((unsigned)ds); - v_uint16 _dd8 = vx_setall_u16((ushort)dd); - - for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes ) - { - v_uint16 _sm0 = vx_load(Sm + i); - v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes); - - v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i)); - v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes)); - - v_uint32 _s00, _s01, _s10, _s11; - - v_expand(_s0 + _dd8, _s00, _s01); - v_expand(_s1 + _dd8, _s10, _s11); - - _s00 = v_shr(_s00*_ds4); - _s01 = v_shr(_s01*_ds4); - _s10 = v_shr(_s10*_ds4); - _s11 = v_shr(_s11*_ds4); - - v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), 
v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16::nlanes, _s1); - } -#if CV_SIMD_WIDTH > 16 - v_uint32x4 ds4 = v_setall_u32((unsigned)ds); - v_uint16x8 dd8 = v_setall_u16((ushort)dd); - - for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) - { - v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); - - v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); - - v_uint32x4 _s00, _s01, _s10, _s11; - - v_expand(_s0 + dd8, _s00, _s01); - v_expand(_s1 + dd8, _s10, _s11); - - _s00 = v_shr(_s00*ds4); - _s01 = v_shr(_s01*ds4); - _s10 = v_shr(_s10*ds4); - _s11 = v_shr(_s11*ds4); - - v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + v_uint16x8::nlanes, _s1); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (uchar)((s0 + dd)*ds >> SHIFT); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - else - { - int i = 0; - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - int divDelta; - int divScale; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, 
uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - i = 0; -#if CV_SIMD - for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - short* D = (short*)dst; - if( haveScale ) - { - i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); - v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); - v_store(D + i, 
v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - i = 0; -#if CV_SIMD - for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - 
for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - ushort* D = (ushort*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); - v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = 
SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for( 
; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - int* D = (int*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = s0; - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int 
sumCount; - std::vector sum; -}; - - -template<> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD - for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) - { - v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } -#endif -#endif - - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int * Sp = (const int*)src[0]; - const int * Sm = (const int*)src[1-ksize]; - float* D = (float*)dst; - if( haveScale ) - { - int i = 0; - -#if CV_SIMD - v_float32 _v_scale = vx_setall_f32((float)_scale); - for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0) * _v_scale); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0) * v_scale); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = 
SUM[i] + Sp[i]; - D[i] = (float)(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; - -#if CV_SIMD - for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) - { - v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0)); - v_store(SUM + i, v_s0 - vx_load(Sm + i)); - } -#if CV_SIMD_WIDTH > 16 - for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_store(D + i, v_cvt_f32(v_s0)); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } -#endif -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } -#if CV_SIMD - vx_cleanup(); -#endif - } - - double scale; - int sumCount; - std::vector sum; -}; +namespace cv { #ifdef HAVE_OPENCL @@ -1396,109 +288,34 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, #endif +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(getRowSumFilter, (srcType, sumType, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) { - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32S && ddepth 
== CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); + CV_CPU_DISPATCH(getColumnSumFilter, (sumType, dstType, ksize, anchor, scale), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) { - int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_16U ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32S && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - - CV_Error_( 
CV_StsNotImplemented, - ("Unsupported combination of sum format (=%d), and destination format (=%d)", - sumType, dstType)); -} - - -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) -{ - int sdepth = CV_MAT_DEPTH(srcType); - int cn = CV_MAT_CN(srcType), sumType = CV_64F; - if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && - ksize.width*ksize.height <= 256 ) - sumType = CV_16U; - else if( sdepth <= CV_32S && (!normalize || - ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : - sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) - sumType = CV_32S; - sumType = CV_MAKETYPE( sumType, cn ); - - Ptr rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1); - - return makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); + CV_CPU_DISPATCH(createBoxFilter, (srcType, dstType, ksize, anchor, normalize, borderType), + CV_CPU_DISPATCH_MODES_ALL); } #ifdef HAVE_OPENVX -namespace cv -{ namespace ovx { template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } } @@ -1570,12 +387,9 @@ namespace cv return true; } -} #endif #if defined(HAVE_IPP) -namespace cv -{ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) { #ifdef HAVE_IPP_IW @@ -1620,13 +434,12 @@ static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool nor return false; #endif } -} #endif -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) +void boxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) { CV_INSTRUMENT_REGION(); @@ -1674,8 +487,8 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, } -void cv::blur( InputArray src, 
OutputArray dst, - Size ksize, Point anchor, int borderType ) +void blur(InputArray src, OutputArray dst, + Size ksize, Point anchor, int borderType) { CV_INSTRUMENT_REGION(); @@ -1687,77 +500,17 @@ void cv::blur( InputArray src, OutputArray dst, Squared Box Filter \****************************************************************************************/ -namespace cv -{ - -template -struct SqrRowSum : - public BaseRowFilter -{ - SqrRowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - { - ST val = (ST)S[i]; - s += val*val; - } - D[0] = s; - for( i = 0; i < width; i += cn ) - { - ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; - s += val1*val1 - val0*val0; - D[i+cn] = s; - } - } - } -}; - static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) { - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + CV_INSTRUMENT_REGION(); - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); + CV_CPU_DISPATCH(getSqrRowSumFilter, (srcType, sumType, ksize, 
anchor), + CV_CPU_DISPATCH_MODES_ALL); } -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) +void sqrBoxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) { CV_INSTRUMENT_REGION(); @@ -1801,4 +554,4 @@ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, f->apply( src, dst, wsz, ofs ); } -/* End of file. */ +} // namespace diff --git a/modules/imgproc/src/box_filter.simd.hpp b/modules/imgproc/src/box_filter.simd.hpp index 14f266258f..4eadee8ec5 100644 --- a/modules/imgproc/src/box_filter.simd.hpp +++ b/modules/imgproc/src/box_filter.simd.hpp @@ -42,21 +42,25 @@ //M*/ #include "precomp.hpp" - -#include - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor); +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale); +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType); -namespace cv -{ +Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY /****************************************************************************************\ Box Filter \****************************************************************************************/ +namespace { template struct RowSum : public BaseRowFilter @@ -70,6 +74,8 @@ struct RowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -183,6 +189,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) 
CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; ST* SUM; bool haveScale = scale != 1; @@ -281,6 +289,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -408,9 +418,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -452,6 +459,8 @@ public BaseColumnFilter virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const int ds = divScale; const int dd = divDelta; ushort* SUM; @@ -586,9 +595,6 @@ public BaseColumnFilter } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -616,6 +622,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; int* SUM; bool haveScale = scale != 1; @@ -739,9 +747,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -767,6 +772,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -888,9 +895,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -915,6 +919,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -1022,9 +1028,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1050,6 +1053,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool 
haveScale = scale != 1; double _scale = scale; @@ -1154,9 +1159,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1164,243 +1166,13 @@ struct ColumnSum : std::vector sum; }; -#ifdef HAVE_OPENCL +} // namespace anon -static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize ) + +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor) { - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + CV_INSTRUMENT_REGION(); - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && - (anchor.x == 1) && (anchor.y == 1) && - (ksize.width == 3) && (ksize.height == 3)) ) - return false; - - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - char build_opts[1024]; - sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? 
"-D NORMALIZE" : ""); - - ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); -} - -static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (ddepth < 0) - ddepth = sdepth; - - if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - int computeUnits = ocl::Device::getDefault().maxComputeUnits(); - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), - wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { 
(size_t)size.width, (size_t)size.height }; - size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - - size_t maxWorkItemSizes[32]; - ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); - int tryWorkItems = (int)maxWorkItemSizes[0]; - - ocl::Kernel kernel; - - if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 
1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = roundUp(globalsize[0], wgRound); - - char build_options[1024], cvt[2][40]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - normalize ? " -D NORMALIZE" : "", sqr ? 
" -D SQR" : "", - ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV - ); - - - if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) - return false; - - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? 
" -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; - - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) - return false; - - tryWorkItems = (int)kernelWorkGroupSize; - } - } - - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, localsize, false); -} - -#endif - -} - - -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1434,9 +1206,10 @@ cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksi } -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); CV_Assert( CV_MAT_CN(sumType) 
== CV_MAT_CN(dstType) ); @@ -1474,9 +1247,11 @@ cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, i } -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType); int cn = CV_MAT_CN(srcType), sumType = CV_64F; if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && @@ -1496,199 +1271,12 @@ cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ks srcType, dstType, sumType, borderType ); } -#ifdef HAVE_OPENVX -namespace cv -{ - namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } - } - static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType) - { - if (ddepth < 0) - ddepth = CV_8UC1; - if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3 || - (anchor.x >= 0 && anchor.x != 1) || - (anchor.y >= 0 && anchor.y != 1) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - _dst.create(src.size(), CV_8UC1); - Mat dst = _dst.getMat(); - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = 
ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; - } -} -#endif - -#if defined(HAVE_IPP) -namespace cv -{ -static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Problem with SSE42 optimization for 16s and some 8u modes - if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) - return false; - - // Other optimizations has some degradations too - if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) - return false; -#endif - - if(!normalize) - return false; - - if(!ippiCheckAnchor(anchor, ksize)) - return false; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiSize iwKSize = ippiGetSize(ksize); - ::ipp::IwiBorderSize borderSize(iwKSize); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); - 
} - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); - return false; -#endif -} -} -#endif - - -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && - (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || - borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), - ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - Mat src = _src.getMat(); - int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); - if( ddepth < 0 ) - ddepth = sdepth; - _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); - Mat dst = _dst.getMat(); - if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) - { - if( src.rows == 1 ) - ksize.height = 1; - if( src.cols == 1 ) - ksize.width = 1; - } - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType&BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) - - CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); - - borderType = (borderType&~BORDER_ISOLATED); - - Ptr f = createBoxFilter( src.type(), dst.type(), - ksize, anchor, normalize, borderType ); - - f->apply( src, dst, wsz, ofs ); -} - - -void cv::blur( InputArray src, OutputArray dst, - Size 
ksize, Point anchor, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - boxFilter( src, dst, -1, ksize, anchor, true, borderType ); -} /****************************************************************************************\ Squared Box Filter \****************************************************************************************/ - -namespace cv -{ +namespace { template struct SqrRowSum : @@ -1703,6 +1291,8 @@ struct SqrRowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -1727,7 +1317,9 @@ struct SqrRowSum : } }; -static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +} // namespace anon + +Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1753,52 +1345,6 @@ static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize srcType, sumType)); } -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); - Size size = _src.size(); - - if( ddepth < 0 ) - ddepth = sdepth < CV_32F ? 
CV_32F : CV_64F; - - if( borderType != BORDER_CONSTANT && normalize ) - { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) - - int sumDepth = CV_64F; - if( sdepth == CV_8U ) - sumDepth = CV_32S; - int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); - - Mat src = _src.getMat(); - _dst.create( size, dstType ); - Mat dst = _dst.getMat(); - - Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, - normalize ? 1./(ksize.width*ksize.height) : 1); - - Ptr f = makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); - Point ofs; - Size wsz(src.cols, src.rows); - src.locateROI( wsz, ofs ); - - f->apply( src, dst, wsz, ofs ); -} - -/* End of file. */ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace From 2c07c6718fd48902c23c4012f5d57cdc2c0faa59 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 11 Mar 2019 12:37:17 +0000 Subject: [PATCH 21/21] imgproc: dispatch morph --- modules/imgproc/CMakeLists.txt | 1 + modules/imgproc/src/morph.dispatch.cpp | 794 +------------- modules/imgproc/src/morph.simd.hpp | 1327 +----------------------- 3 files changed, 68 insertions(+), 2054 deletions(-) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index d60fa7c58f..0c7b3268df 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -7,5 +7,6 @@ ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP 
java python js) diff --git a/modules/imgproc/src/morph.dispatch.cpp b/modules/imgproc/src/morph.dispatch.cpp index c18e5c8066..326bc66593 100644 --- a/modules/imgproc/src/morph.dispatch.cpp +++ b/modules/imgproc/src/morph.dispatch.cpp @@ -48,779 +48,49 @@ #include "opencv2/core/hal/intrin.hpp" #include +#include "morph.simd.hpp" +#include "morph.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + + /****************************************************************************************\ Basic Morphological Operations: Erosion & Dilation \****************************************************************************************/ -using namespace std; - -namespace cv -{ - -template struct MinOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct MaxOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -#undef CV_MIN_8U -#undef CV_MAX_8U -#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) -#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) - -template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, b); } - -struct MorphRowNoVec -{ - MorphRowNoVec(int, int) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct MorphColumnNoVec -{ - MorphColumnNoVec(int, int) {} - int operator()(const uchar**, uchar*, int, int, int) const { return 0; } -}; - -struct MorphNoVec -{ - int operator()(uchar**, int, uchar*, int) const { return 0; } -}; - -#if CV_SIMD - -template struct MorphRowVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) 
{} - int operator()(const uchar* src, uchar* dst, int width, int cn) const - { - int i, k, _ksize = ksize*cn; - width *= cn; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); - vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); - for (k = cn; k < _ksize; k += cn) - { - s0 = updateOp(s0, vx_load((const stype*)src + i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - v_store((stype*)dst + i + 2*vtype::nlanes, s2); - v_store((stype*)dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - for( k = cn; k < _ksize; k += cn ) - { - s0 = updateOp(s0, vx_load((const stype*)src + i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s = vx_load((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load((const stype*)src + i + k)); - v_store((stype*)dst + i, s); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s = vx_load_low((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load_low((const stype*)src + i + k)); - v_store_low((stype*)dst + i, s); - i += vtype::nlanes/2; - } - - return i - i % cn; - } - - int ksize, anchor; -}; - - -template struct MorphColumnVec -{ - typedef typename VecUpdate::vtype 
vtype; - typedef typename vtype::lane_type stype; - MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const - { - int i = 0, k, _ksize = ksize; - VecUpdate updateOp; - - for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); - - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - dststep /= sizeof(dst[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[1] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[1] + i; - vtype s0 = 
vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - - v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - - v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); - v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i))); - i += vtype::nlanes/2; - } - } - - for( ; count > 0; count--, dst += dststep, src++ ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); 
- v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - } - - return i; - } - - int ksize, anchor; -}; - - -template struct MorphVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - int operator()(uchar** _src, int nz, uchar* _dst, int width) const - { - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - int i, k; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - vtype s2 = vx_load(sptr + 2*vtype::nlanes); - vtype s3 = vx_load(sptr + 3*vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) 
- { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - return i; - } -}; - -template struct VMin -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } -}; -template struct VMax -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } -}; - -typedef MorphRowVec > ErodeRowVec8u; -typedef MorphRowVec > DilateRowVec8u; -typedef MorphRowVec > ErodeRowVec16u; -typedef MorphRowVec > DilateRowVec16u; -typedef MorphRowVec > ErodeRowVec16s; -typedef MorphRowVec > DilateRowVec16s; -typedef MorphRowVec > ErodeRowVec32f; -typedef MorphRowVec > DilateRowVec32f; - -typedef MorphColumnVec > ErodeColumnVec8u; -typedef MorphColumnVec > DilateColumnVec8u; -typedef MorphColumnVec > ErodeColumnVec16u; -typedef MorphColumnVec > DilateColumnVec16u; -typedef MorphColumnVec > ErodeColumnVec16s; -typedef MorphColumnVec > DilateColumnVec16s; -typedef MorphColumnVec > ErodeColumnVec32f; -typedef MorphColumnVec > DilateColumnVec32f; - -typedef MorphVec > ErodeVec8u; -typedef MorphVec > DilateVec8u; -typedef MorphVec > ErodeVec16u; -typedef MorphVec > DilateVec16u; -typedef MorphVec > ErodeVec16s; -typedef MorphVec > DilateVec16s; -typedef MorphVec > ErodeVec32f; -typedef MorphVec > DilateVec32f; - -#else - -typedef 
MorphRowNoVec ErodeRowVec8u; -typedef MorphRowNoVec DilateRowVec8u; - -typedef MorphColumnNoVec ErodeColumnVec8u; -typedef MorphColumnNoVec DilateColumnVec8u; - -typedef MorphRowNoVec ErodeRowVec16u; -typedef MorphRowNoVec DilateRowVec16u; -typedef MorphRowNoVec ErodeRowVec16s; -typedef MorphRowNoVec DilateRowVec16s; -typedef MorphRowNoVec ErodeRowVec32f; -typedef MorphRowNoVec DilateRowVec32f; - -typedef MorphColumnNoVec ErodeColumnVec16u; -typedef MorphColumnNoVec DilateColumnVec16u; -typedef MorphColumnNoVec ErodeColumnVec16s; -typedef MorphColumnNoVec DilateColumnVec16s; -typedef MorphColumnNoVec ErodeColumnVec32f; -typedef MorphColumnNoVec DilateColumnVec32f; - -typedef MorphNoVec ErodeVec8u; -typedef MorphNoVec DilateVec8u; -typedef MorphNoVec ErodeVec16u; -typedef MorphNoVec DilateVec16u; -typedef MorphNoVec ErodeVec16s; -typedef MorphNoVec DilateVec16s; -typedef MorphNoVec ErodeVec32f; -typedef MorphNoVec DilateVec32f; - -#endif - -typedef MorphRowNoVec ErodeRowVec64f; -typedef MorphRowNoVec DilateRowVec64f; -typedef MorphColumnNoVec ErodeColumnVec64f; -typedef MorphColumnNoVec DilateColumnVec64f; -typedef MorphNoVec ErodeVec64f; -typedef MorphNoVec DilateVec64f; - - -template struct MorphRowFilter : public BaseRowFilter -{ - typedef typename Op::rtype T; - - MorphRowFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int i, j, k, _ksize = ksize*cn; - const T* S = (const T*)src; - Op op; - T* D = (T*)dst; - - if( _ksize == cn ) - { - for( i = 0; i < width*cn; i++ ) - D[i] = S[i]; - return; - } - - int i0 = vecOp(src, dst, width, cn); - width *= cn; - - for( k = 0; k < cn; k++, S++, D++ ) - { - for( i = i0; i <= width - cn*2; i += cn*2 ) - { - const T* s = S + i; - T m = s[cn]; - for( j = cn*2; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = op(m, s[0]); - D[i+cn] = op(m, s[j]); - } - - for( ; i < width; i += cn ) - { 
- const T* s = S + i; - T m = s[0]; - for( j = cn; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = m; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphColumnFilter : public BaseColumnFilter -{ - typedef typename Op::rtype T; - - MorphColumnFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i, k, _ksize = ksize; - const T** src = (const T**)_src; - T* D = (T*)dst; - Op op; - - int i0 = vecOp(_src, dst, dststep, count, width); - dststep /= sizeof(D[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, D += dststep*2, src += 2 ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[1] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - sptr = src[0] + i; - D[i] = op(s0, sptr[0]); - D[i+1] = op(s1, sptr[1]); - D[i+2] = op(s2, sptr[2]); - D[i+3] = op(s3, sptr[3]); - - sptr = src[k] + i; - D[i+dststep] = op(s0, sptr[0]); - D[i+dststep+1] = op(s1, sptr[1]); - D[i+dststep+2] = op(s2, sptr[2]); - D[i+dststep+3] = op(s3, sptr[3]); - } - #endif - for( ; i < width; i++ ) - { - T s0 = src[1][i]; - - for( k = 2; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - - D[i] = op(s0, src[0][i]); - D[i+dststep] = op(s0, src[k][i]); - } - } - - for( ; count > 0; count--, D += dststep, src++ ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - 
T s0 = src[0][i]; - for( k = 1; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - D[i] = s0; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphFilter : BaseFilter -{ - typedef typename Op::rtype T; - - MorphFilter( const Mat& _kernel, Point _anchor ) - { - anchor = _anchor; - ksize = _kernel.size(); - CV_Assert( _kernel.type() == CV_8U ); - - std::vector coeffs; // we do not really the values of non-zero - // kernel elements, just their locations - preprocess2DKernel( _kernel, coords, coeffs ); - ptrs.resize( coords.size() ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE - { - const Point* pt = &coords[0]; - const T** kp = (const T**)&ptrs[0]; - int i, k, nz = (int)coords.size(); - Op op; - - width *= cn; - for( ; count > 0; count--, dst += dststep, src++ ) - { - T* D = (T*)dst; - - for( k = 0; k < nz; k++ ) - kp[k] = (const T*)src[pt[k].y] + pt[k].x*cn; - - i = vecOp(&ptrs[0], nz, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = kp[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < nz; k++ ) - { - sptr = kp[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - T s0 = kp[0][i]; - for( k = 1; k < nz; k++ ) - s0 = op(s0, kp[k][i]); - D[i] = s0; - } - } - } - - std::vector coords; - std::vector ptrs; - VecOp vecOp; -}; - -} +namespace cv { /////////////////////////////////// External Interface ///////////////////////////////////// -cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, 
- ErodeRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeRowVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateRowVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyRowFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, - ErodeColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeColumnVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeColumnVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateColumnVec32f> 
>(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateColumnVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyColumnFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) +Ptr getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) { + CV_INSTRUMENT_REGION(); + Mat kernel = _kernel.getMat(); - int depth = CV_MAT_DEPTH(type); - anchor = normalizeAnchor(anchor, kernel.size()); - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, ErodeVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, ErodeVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, ErodeVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, ErodeVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, ErodeVec64f> >(kernel, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, DilateVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, DilateVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, DilateVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, DilateVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, DilateVec64f> >(kernel, anchor); - } - - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyFilter, (op, type, kernel, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createMorphologyFilter( int op, int type, InputArray _kernel, - Point anchor, int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createMorphologyFilter( + int op, int type, InputArray _kernel, + Point anchor, int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat kernel = _kernel.getMat(); 
anchor = normalizeAnchor(anchor, kernel.size()); @@ -862,7 +132,7 @@ cv::Ptr cv::createMorphologyFilter( int op, int type, InputArr } -cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) +Mat getStructuringElement(int shape, Size ksize, Point anchor) { int i, j; int r = 0, c = 0; @@ -915,9 +185,6 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) return elem; } -namespace cv -{ - // ===== 1. replacement implementation static bool halMorph(int op, int src_type, int dst_type, @@ -1732,9 +999,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst, (src.isSubmatrix() && !isolated)); } -} - -void cv::erode( InputArray src, OutputArray dst, InputArray kernel, +void erode( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1744,7 +1009,7 @@ void cv::erode( InputArray src, OutputArray dst, InputArray kernel, } -void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, +void dilate( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1755,8 +1020,6 @@ void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, #ifdef HAVE_OPENCL -namespace cv { - static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue) @@ -1813,13 +1076,11 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, return true; } -} #endif #define IPP_DISABLE_MORPH_ADV 1 #ifdef HAVE_IPP #if !IPP_DISABLE_MORPH_ADV -namespace cv { static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, int iterations, @@ -1884,11 +1145,10 @@ static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, return false; #endif } -} #endif #endif -void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, +void 
morphologyEx( InputArray _src, OutputArray _dst, int op, InputArray _kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1985,6 +1245,8 @@ void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, } } +} // namespace cv + CV_IMPL IplConvKernel * cvCreateStructuringElementEx( int cols, int rows, int anchorX, int anchorY, diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp index c18e5c8066..9b3023f8f0 100644 --- a/modules/imgproc/src/morph.simd.hpp +++ b/modules/imgproc/src/morph.simd.hpp @@ -42,21 +42,22 @@ #include "precomp.hpp" #include -#include "opencl_kernels_imgproc.hpp" -#include -#include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" -#include /****************************************************************************************\ Basic Morphological Operations: Erosion & Dilation \****************************************************************************************/ -using namespace std; +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor); -namespace cv -{ +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +namespace { template struct MinOp { typedef T type1; @@ -73,6 +74,9 @@ template struct MaxOp T operator ()(const T a, const T b) const { return std::max(a, b); } }; + +#if !defined(CV_SIMD) // min/max operation are usually fast enough (without using of control flow 'if' statements) + #undef CV_MIN_8U #undef CV_MAX_8U #define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) @@ -81,6 +85,10 @@ template struct MaxOp template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, b); 
} +#endif + + + struct MorphRowNoVec { MorphRowNoVec(int, int) {} @@ -107,6 +115,8 @@ template struct MorphRowVec MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar* src, uchar* dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i, k, _ksize = ksize*cn; width *= cn; VecUpdate updateOp; @@ -173,6 +183,8 @@ template struct MorphColumnVec MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = ksize; VecUpdate updateOp; @@ -332,6 +344,8 @@ template struct MorphVec typedef typename vtype::lane_type stype; int operator()(uchar** _src, int nz, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + const stype** src = (const stype**)_src; stype* dst = (stype*)_dst; int i, k; @@ -483,6 +497,8 @@ template struct MorphRowFilter : public BaseRowFilter void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, k, _ksize = ksize*cn; const T* S = (const T*)src; Op op; @@ -537,6 +553,8 @@ template struct MorphColumnFilter : public BaseColumnFilt void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, k, _ksize = ksize; const T** src = (const T**)_src; T* D = (T*)dst; @@ -638,6 +656,8 @@ template struct MorphFilter : BaseFilter void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const Point* pt = &coords[0]; const T** kp = (const T**)&ptrs[0]; int i, k, nz = (int)coords.size(); @@ -684,12 +704,14 @@ template struct MorphFilter : BaseFilter VecOp vecOp; }; -} +} // namespace anon /////////////////////////////////// External Interface ///////////////////////////////////// -cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksize, int anchor) 
+Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) { + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); if( anchor < 0 ) anchor = ksize/2; @@ -734,8 +756,10 @@ cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksiz CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } -cv::Ptr cv::getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) { + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); if( anchor < 0 ) anchor = ksize/2; @@ -780,10 +804,10 @@ cv::Ptr cv::getMorphologyColumnFilter(int op, int type, in CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } - -cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int depth = CV_MAT_DEPTH(type); anchor = normalizeAnchor(anchor, kernel.size()); CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); @@ -817,1279 +841,6 @@ cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _ke CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); } - -cv::Ptr cv::createMorphologyFilter( int op, int type, InputArray _kernel, - Point anchor, int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat kernel = _kernel.getMat(); - anchor = normalizeAnchor(anchor, kernel.size()); - - Ptr rowFilter; - Ptr columnFilter; - Ptr filter2D; - - if( countNonZero(kernel) == kernel.rows*kernel.cols ) - { - // rectangular structuring element - rowFilter = getMorphologyRowFilter(op, type, kernel.cols, anchor.x); - columnFilter = getMorphologyColumnFilter(op, type, kernel.rows, anchor.y); - } - else - filter2D = getMorphologyFilter(op, type, kernel, anchor); - - Scalar borderValue = _borderValue; - if( (_rowBorderType == BORDER_CONSTANT || _columnBorderType == 
BORDER_CONSTANT) && - borderValue == morphologyDefaultBorderValue() ) - { - int depth = CV_MAT_DEPTH(type); - CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_16S || - depth == CV_32F || depth == CV_64F ); - if( op == MORPH_ERODE ) - borderValue = Scalar::all( depth == CV_8U ? (double)UCHAR_MAX : - depth == CV_16U ? (double)USHRT_MAX : - depth == CV_16S ? (double)SHRT_MAX : - depth == CV_32F ? (double)FLT_MAX : DBL_MAX); - else - borderValue = Scalar::all( depth == CV_8U || depth == CV_16U ? - 0. : - depth == CV_16S ? (double)SHRT_MIN : - depth == CV_32F ? (double)-FLT_MAX : -DBL_MAX); - } - - return makePtr(filter2D, rowFilter, columnFilter, - type, type, type, _rowBorderType, _columnBorderType, borderValue ); -} - - -cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) -{ - int i, j; - int r = 0, c = 0; - double inv_r2 = 0; - - CV_Assert( shape == MORPH_RECT || shape == MORPH_CROSS || shape == MORPH_ELLIPSE ); - - anchor = normalizeAnchor(anchor, ksize); - - if( ksize == Size(1,1) ) - shape = MORPH_RECT; - - if( shape == MORPH_ELLIPSE ) - { - r = ksize.height/2; - c = ksize.width/2; - inv_r2 = r ? 1./((double)r*r) : 0; - } - - Mat elem(ksize, CV_8U); - - for( i = 0; i < ksize.height; i++ ) - { - uchar* ptr = elem.ptr(i); - int j1 = 0, j2 = 0; - - if( shape == MORPH_RECT || (shape == MORPH_CROSS && i == anchor.y) ) - j2 = ksize.width; - else if( shape == MORPH_CROSS ) - j1 = anchor.x, j2 = j1 + 1; - else - { - int dy = i - r; - if( std::abs(dy) <= r ) - { - int dx = saturate_cast(c*std::sqrt((r*r - dy*dy)*inv_r2)); - j1 = std::max( c - dx, 0 ); - j2 = std::min( c + dx + 1, ksize.width ); - } - } - - for( j = 0; j < j1; j++ ) - ptr[j] = 0; - for( ; j < j2; j++ ) - ptr[j] = 1; - for( ; j < ksize.width; j++ ) - ptr[j] = 0; - } - - return elem; -} - -namespace cv -{ - -// ===== 1. 
replacement implementation - -static bool halMorph(int op, int src_type, int dst_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations, bool isSubmatrix) -{ - cvhalFilter2D * ctx; - int res = cv_hal_morphInit(&ctx, op, src_type, dst_type, width, height, - kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, - anchor_x, anchor_y, - borderType, borderValue, - iterations, isSubmatrix, src_data == dst_data); - if (res != CV_HAL_ERROR_OK) - return false; - - res = cv_hal_morph(ctx, src_data, src_step, dst_data, dst_step, width, height, - roi_width, roi_height, - roi_x, roi_y, - roi_width2, roi_height2, - roi_x2, roi_y2); - bool success = (res == CV_HAL_ERROR_OK); - - res = cv_hal_morphFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - - return success; -} - -// ===== 2. IPP implementation -#ifdef HAVE_IPP -#ifdef HAVE_IPP_IW -static inline IwiMorphologyType ippiGetMorphologyType(int morphOp) -{ - return morphOp == MORPH_ERODE ? iwiMorphErode : - morphOp == MORPH_DILATE ? iwiMorphDilate : - morphOp == MORPH_OPEN ? iwiMorphOpen : - morphOp == MORPH_CLOSE ? iwiMorphClose : - morphOp == MORPH_GRADIENT ? iwiMorphGradient : - morphOp == MORPH_TOPHAT ? iwiMorphTophat : - morphOp == MORPH_BLACKHAT ? 
iwiMorphBlackhat : (IwiMorphologyType)-1; -} #endif - -static bool ippMorph(int op, int src_type, int dst_type, - const uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations, bool isSubmatrix) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201800 - // Problem with SSE42 optimizations performance - if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42) - return false; - - // Different mask flipping - if(op == MORPH_GRADIENT) - return false; - - // Integer overflow bug - if(src_step >= IPP_MAX_32S || - src_step*height >= IPP_MAX_32S) - return false; -#endif - -#if IPP_VERSION_X100 < 201801 - // Problem with AVX512 optimizations performance - if(cv::ipp::getIppTopFeatures()&ippCPUID_AVX512F) - return false; - - // Multiple iterations on small mask is not effective in current integration - // Implace imitation for 3x3 kernel is not efficient - // Advanced morphology for small mask introduces degradations - if((iterations > 1 || src_data == dst_data || (op != MORPH_ERODE && op != MORPH_DILATE)) && kernel_width*kernel_height < 25) - return false; - - // Skip even mask sizes for advanced morphology since they can produce out of spec writes - if((op != MORPH_ERODE && op != MORPH_DILATE) && (!(kernel_width&1) || !(kernel_height&1))) - return false; -#endif - - IppAutoBuffer kernelTempBuffer; - ::ipp::IwiBorderSize iwBorderSize; - ::ipp::IwiBorderSize iwBorderSize2; - ::ipp::IwiBorderType iwBorderType; - ::ipp::IwiBorderType iwBorderType2; - ::ipp::IwiImage iwMask; - ::ipp::IwiImage iwInter; - ::ipp::IwiSize initSize(width, height); - ::ipp::IwiSize kernelSize(kernel_width, kernel_height); - IppDataType type 
= ippiGetDataType(CV_MAT_DEPTH(src_type)); - int channels = CV_MAT_CN(src_type); - IwiMorphologyType morphType = ippiGetMorphologyType(op); - - CV_UNUSED(isSubmatrix); - - if((int)morphType < 0) - return false; - - if(iterations > 1 && morphType != iwiMorphErode && morphType != iwiMorphDilate) - return false; - - if(src_type != dst_type) - return false; - - if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) - return false; - - try - { - ::ipp::IwiImage iwSrc(initSize, type, channels, ::ipp::IwiBorderSize(roi_x, roi_y, roi_width-roi_x-width, roi_height-roi_y-height), (void*)src_data, src_step); - ::ipp::IwiImage iwDst(initSize, type, channels, ::ipp::IwiBorderSize(roi_x2, roi_y2, roi_width2-roi_x2-width, roi_height2-roi_y2-height), (void*)dst_data, dst_step); - - iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); - if(!iwBorderType) - return false; - if(iterations > 1) - { - // Check dst border for second and later iterations - iwBorderSize2 = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType2 = ippiGetBorder(iwDst, borderType, iwBorderSize2); - if(!iwBorderType2) - return false; - } - - if(morphType != iwiMorphErode && morphType != iwiMorphDilate && morphType != iwiMorphGradient) - { - // For now complex morphology support only InMem around all sides. This will be improved later. 
- if((iwBorderType&ippBorderInMem) && (iwBorderType&ippBorderInMem) != ippBorderInMem) - return false; - - if((iwBorderType&ippBorderInMem) == ippBorderInMem) - { - iwBorderType &= ~ippBorderInMem; - iwBorderType &= ippBorderFirstStageInMem; - } - } - - if(iwBorderType.StripFlags() == ippBorderConst) - { - if(Vec(borderValue) == morphologyDefaultBorderValue()) - iwBorderType.SetType(ippBorderDefault); - else - iwBorderType.m_value = ::ipp::IwValueFloat(borderValue[0], borderValue[1], borderValue[2], borderValue[3]); - } - - iwMask.Init(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, kernel_data, kernel_step); - - ::ipp::IwiImage iwMaskLoc = iwMask; - if(morphType == iwiMorphDilate) - { - iwMaskLoc.Alloc(iwMask.m_size, iwMask.m_dataType, iwMask.m_channels); - ::ipp::iwiMirror(iwMask, iwMaskLoc, ippAxsBoth); - iwMask = iwMaskLoc; - } - - if(iterations > 1) - { - // OpenCV uses in mem border from dst for two and more iterations, so we need to keep this border in intermediate image - iwInter.Alloc(initSize, type, channels, iwBorderSize2); - - ::ipp::IwiImage *pSwap[2] = {&iwInter, &iwDst}; - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwInter, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - - // Copy border only - { - if(iwBorderSize2.top) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, -iwBorderSize2.top, iwDst.m_size.width+iwBorderSize2.left+iwBorderSize2.right, iwBorderSize2.top); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.bottom) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, iwDst.m_size.height, iwDst.m_size.width+iwBorderSize2.left+iwBorderSize2.right, iwBorderSize2.bottom); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.left) - { - ::ipp::IwiRoi borderRoi(-iwBorderSize2.left, 
0, iwBorderSize2.left, iwDst.m_size.height); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - if(iwBorderSize2.right) - { - ::ipp::IwiRoi borderRoi(iwDst.m_size.width, 0, iwBorderSize2.left, iwDst.m_size.height); - ::ipp::IwiImage iwInterRoi = iwInter.GetRoiImage(borderRoi); - ::ipp::iwiCopy(iwDst.GetRoiImage(borderRoi), iwInterRoi); - } - } - - iwBorderType2.SetType(iwBorderType); - for(int i = 0; i < iterations-1; i++) - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, *pSwap[i&0x1], *pSwap[(i+1)&0x1], morphType, iwMask, ::ipp::IwDefault(), iwBorderType2); - if(iterations&0x1) - CV_INSTRUMENT_FUN_IPP(::ipp::iwiCopy, iwInter, iwDst); - } - else - { - if(src_data == dst_data) - { - iwInter.Alloc(initSize, type, channels); - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwInter, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiCopy, iwInter, iwDst); - } - else - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterMorphology, iwSrc, iwDst, morphType, iwMask, ::ipp::IwDefault(), iwBorderType); - } - } - catch(const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(op); CV_UNUSED(src_type); CV_UNUSED(dst_type); CV_UNUSED(src_data); CV_UNUSED(src_step); CV_UNUSED(dst_data); - CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(roi_width); CV_UNUSED(roi_height); - CV_UNUSED(roi_x); CV_UNUSED(roi_y); CV_UNUSED(roi_width2); CV_UNUSED(roi_height2); CV_UNUSED(roi_x2); CV_UNUSED(roi_y2); - CV_UNUSED(kernel_type); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); CV_UNUSED(kernel_width); CV_UNUSED(kernel_height); - CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(borderType); CV_UNUSED(borderValue); CV_UNUSED(iterations); - CV_UNUSED(isSubmatrix); - return false; -#endif -}; - -#endif // HAVE_IPP - -// ===== 3. 
Fallback implementation - -static void ocvMorph(int op, int src_type, int dst_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations) -{ - Mat kernel(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - Point anchor(anchor_x, anchor_y); - Vec borderVal(borderValue); - Ptr f = createMorphologyFilter(op, src_type, kernel, anchor, borderType, borderType, borderVal); - Mat src(Size(width, height), src_type, src_data, src_step); - Mat dst(Size(width, height), dst_type, dst_data, dst_step); - { - Point ofs(roi_x, roi_y); - Size wsz(roi_width, roi_height); - f->apply( src, dst, wsz, ofs ); - } - { - Point ofs(roi_x2, roi_y2); - Size wsz(roi_width2, roi_height2); - for( int i = 1; i < iterations; i++ ) - f->apply( dst, dst, wsz, ofs ); - } -} - - -// ===== HAL interface implementation - -namespace hal { - - -CV_DEPRECATED Ptr Morph::create(int , int , int , int , int , - int , uchar * , size_t , - int , int , - int , int , - int , const double *, - int , bool , bool ) { return Ptr(); } - - -void morph(int op, int src_type, int dst_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int roi_width, int roi_height, int roi_x, int roi_y, - int roi_width2, int roi_height2, int roi_x2, int roi_y2, - int kernel_type, uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, int anchor_x, int anchor_y, - int borderType, const double borderValue[4], int iterations, bool isSubmatrix) -{ - { - bool res = halMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height, - roi_width, roi_height, roi_x, roi_y, - 
roi_width2, roi_height2, roi_x2, roi_y2, - kernel_type, kernel_data, kernel_step, - kernel_width, kernel_height, anchor_x, anchor_y, - borderType, borderValue, iterations, isSubmatrix); - if (res) - return; - } - - CV_IPP_RUN_FAST(ippMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height, - roi_width, roi_height, roi_x, roi_y, - roi_width2, roi_height2, roi_x2, roi_y2, - kernel_type, kernel_data, kernel_step, - kernel_width, kernel_height, anchor_x, anchor_y, - borderType, borderValue, iterations, isSubmatrix)); - - ocvMorph(op, src_type, dst_type, src_data, src_step, dst_data, dst_step, width, height, - roi_width, roi_height, roi_x, roi_y, - roi_width2, roi_height2, roi_x2, roi_y2, - kernel_type, kernel_data, kernel_step, - kernel_width, kernel_height, anchor_x, anchor_y, - borderType, borderValue, iterations); -} - -} // cv::hal - -#ifdef HAVE_OPENCL - -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -static bool ocl_morph3x3_8UC1( InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, - int op, int actual_op = -1, InputArray _extraMat = noArray()) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - Size ksize = _kernel.size(); - - Mat kernel8u; - String processing; - - bool haveExtraMat = !_extraMat.empty(); - CV_Assert(actual_op <= 3 || haveExtraMat); - - _kernel.getMat().convertTo(kernel8u, CV_8U); - for (int y = 0; y < kernel8u.rows; ++y) - for (int x = 0; x < kernel8u.cols; ++x) - if (kernel8u.at(y, x) != 0) - processing += format("PROCESS(%d,%d)", y, x); - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if (actual_op < 0) - actual_op = op; - - if (type != CV_8UC1 || - !((_src.offset() == 0) && (_src.step() % 4 == 0)) || - !((_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)) || - !(anchor.x == 1 && anchor.y == 1) || - !(ksize.width == 3 && ksize.height == 3)) - return false; - - Size size = _src.size(); - 
size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" }; - String opts = format("-D PROCESS_ELEM_=%s -D %s%s", processing.c_str(), op2str[op], - actual_op == op ? "" : cv::format(" -D %s", op2str[actual_op]).c_str()); - - ocl::Kernel k; - k.create("morph3x3_8UC1_cols16_rows2", cv::ocl::imgproc::morph3x3_oclsrc, opts); - - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(depth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - UMat extraMat = _extraMat.getUMat(); - - int idxArg = k.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = k.set(idxArg, (int)src.step); - idxArg = k.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = k.set(idxArg, (int)dst.step); - idxArg = k.set(idxArg, (int)dst.rows); - idxArg = k.set(idxArg, (int)dst.cols); - - if (haveExtraMat) - { - idxArg = k.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(extraMat)); - } - - return k.run(2, globalsize, (localsize[0] == 0) ? 
NULL : localsize, false); -} - -static bool ocl_morphSmall( InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, int borderType, - int op, int actual_op = -1, InputArray _extraMat = noArray()) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (cn > 4 || (!doubleSupport && depth == CV_64F) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - bool haveExtraMat = !_extraMat.empty(); - CV_Assert(actual_op <= 3 || haveExtraMat); - - Size ksize = _kernel.size(); - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = depth, wtype = type; - if (depth == CV_8U) - { - wdepth = CV_32S; - wtype = CV_MAKETYPE(wdepth, cn); - } - char cvt[2][40]; - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", - "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 
1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - if (actual_op < 0) - actual_op = op; - - // build processing - String processing; - Mat kernel8u; - _kernel.getMat().convertTo(kernel8u, CV_8U); - for (int y = 0; y < kernel8u.rows; ++y) - for (int x = 0; x < kernel8u.cols; ++x) - if (kernel8u.at(y, x) != 0) - processing += format("PROCESS(%d,%d)", y, x); - - - static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" }; - String opts = format("-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d -D DEPTH_%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=srcT -D dstT1=srcT1 -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D PROCESS_ELEM_=%s -D %s%s", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, depth, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(depth), - haveExtraMat ? 
ocl::typeToStr(wtype):"srcT",//to prevent overflow - WT - haveExtraMat ? ocl::typeToStr(wdepth):"srcT1",//to prevent overflow - WT1 - haveExtraMat ? ocl::convertTypeStr(depth, wdepth, cn, cvt[0]) : "noconvert",//to prevent overflow - src to WT - haveExtraMat ? ocl::convertTypeStr(wdepth, depth, cn, cvt[1]) : "noconvert",//to prevent overflow - WT to dst - ocl::typeToStr(CV_MAKE_TYPE(haveExtraMat ? wdepth : depth, pxLoadVecSize)), //PX_LOAD_FLOAT_VEC_CONV - processing.c_str(), op2str[op], - actual_op == op ? "" : cv::format(" -D %s", op2str[actual_op]).c_str()); - - ocl::Kernel kernel("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, opts); - if (kernel.empty()) - return false; - - _dst.create(size, type); - UMat dst = _dst.getUMat(); - - UMat source; - if(src.u != dst.u) - source = src; - else - { - Point ofs; - int cols = src.cols, rows = src.rows; - src.locateROI(wholeSize, ofs); - src.adjustROI(ofs.y, wholeSize.height - rows - ofs.y, ofs.x, wholeSize.width - cols - ofs.x); - src.copyTo(source); - - src.adjustROI(-ofs.y, -wholeSize.height + rows + ofs.y, -ofs.x, -wholeSize.width + cols + ofs.x); - source.adjustROI(-ofs.y, -wholeSize.height + rows + ofs.y, -ofs.x, -wholeSize.width + cols + ofs.x); - source.locateROI(wholeSize, ofs); - } - - UMat extraMat = _extraMat.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(source)); - idxArg = kernel.set(idxArg, (int)source.step); - int srcOffsetX = (int)((source.offset % source.step) / source.elemSize()); - int srcOffsetY = (int)(source.offset / source.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? 
srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - - if (haveExtraMat) - { - idxArg = kernel.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(extraMat)); - } - - return kernel.run(2, globalsize, NULL, false); -} - -static bool ocl_morphOp(InputArray _src, OutputArray _dst, InputArray _kernel, - Point anchor, int iterations, int op, int borderType, - const Scalar &, int actual_op = -1, InputArray _extraMat = noArray()) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - Mat kernel = _kernel.getMat(); - Size ksize = !kernel.empty() ? kernel.size() : Size(3, 3), ssize = _src.size(); - - bool doubleSupport = dev.doubleFPConfig() > 0; - if ((depth == CV_64F && !doubleSupport) || borderType != BORDER_CONSTANT) - return false; - - bool haveExtraMat = !_extraMat.empty(); - CV_Assert(actual_op <= 3 || haveExtraMat); - - if (kernel.empty()) - { - ksize = Size(1+iterations*2,1+iterations*2); - kernel = getStructuringElement(MORPH_RECT, ksize); - anchor = Point(iterations, iterations); - iterations = 1; - CV_DbgAssert(ksize == kernel.size()); - } - else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols ) - { - ksize = Size(ksize.width + (iterations-1)*(ksize.width-1), - ksize.height + (iterations-1)*(ksize.height-1)); - anchor = Point(anchor.x*iterations, anchor.y*iterations); - kernel = getStructuringElement(MORPH_RECT, ksize, anchor); - iterations = 1; - CV_DbgAssert(ksize == kernel.size()); - } - - static bool param_use_morph_special_kernels = utils::getConfigurationParameterBool("OPENCV_OPENCL_IMGPROC_MORPH_SPECIAL_KERNEL", -#ifndef __APPLE__ - true -#else - false -#endif - ); - - int esz = CV_ELEM_SIZE(type); - // try to use OpenCL kernel adopted for 
small morph kernel - if (param_use_morph_special_kernels && dev.isIntel() && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1)) && - (iterations == 1) - ) - { - if (ocl_morph3x3_8UC1(_src, _dst, kernel, anchor, op, actual_op, _extraMat)) - return true; - - if (ocl_morphSmall(_src, _dst, kernel, anchor, borderType, op, actual_op, _extraMat)) - return true; - } - - if (iterations == 0 || kernel.rows*kernel.cols == 1) - { - _src.copyTo(_dst); - return true; - } - -#ifdef __ANDROID__ - size_t localThreads[2] = { 16, 8 }; -#else - size_t localThreads[2] = { 16, 16 }; -#endif - size_t globalThreads[2] = { (size_t)ssize.width, (size_t)ssize.height }; - -#ifdef __APPLE__ - if( actual_op != MORPH_ERODE && actual_op != MORPH_DILATE ) - localThreads[0] = localThreads[1] = 4; -#endif - - if (localThreads[0]*localThreads[1] * 2 < (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1)) - return false; - -#ifdef __ANDROID__ - if (dev.isNVidia()) - return false; -#endif - - // build processing - String processing; - Mat kernel8u; - kernel.convertTo(kernel8u, CV_8U); - for (int y = 0; y < kernel8u.rows; ++y) - for (int x = 0; x < kernel8u.cols; ++x) - if (kernel8u.at(y, x) != 0) - processing += format("PROCESS(%d,%d)", y, x); - - static const char * const op2str[] = { "OP_ERODE", "OP_DILATE", NULL, NULL, "OP_GRADIENT", "OP_TOPHAT", "OP_BLACKHAT" }; - - char cvt[2][50]; - int wdepth = std::max(depth, CV_32F), scalarcn = cn == 3 ? 4 : cn; - - if (actual_op < 0) - actual_op = op; - - std::vector kernels(iterations); - for (int i = 0; i < iterations; i++) - { - int current_op = iterations == i + 1 ? 
actual_op : op; - String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s" - " -D PROCESS_ELEMS=%s -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s" - " -D convertToWT=%s -D convertToT=%s -D ST=%s%s", - anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op], - doubleSupport ? " -D DOUBLE_SUPPORT" : "", processing.c_str(), - ocl::typeToStr(type), depth, cn, ocl::typeToStr(depth), - ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), - ocl::typeToStr(CV_MAKE_TYPE(depth, scalarcn)), - current_op == op ? "" : cv::format(" -D %s", op2str[current_op]).c_str()); - - kernels[i].create("morph", ocl::imgproc::morph_oclsrc, buildOptions); - if (kernels[i].empty()) - return false; - } - - UMat src = _src.getUMat(), extraMat = _extraMat.getUMat(); - _dst.create(src.size(), src.type()); - UMat dst = _dst.getUMat(); - - if (iterations == 1 && src.u != dst.u) - { - Size wholesize; - Point ofs; - src.locateROI(wholesize, ofs); - int wholecols = wholesize.width, wholerows = wholesize.height; - - if (haveExtraMat) - kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), - ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows, - ocl::KernelArg::ReadOnlyNoSize(extraMat)); - else - kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), - ofs.x, ofs.y, src.cols, src.rows, wholecols, wholerows); - - return kernels[0].run(2, globalThreads, localThreads, false); - } - - for (int i = 0; i < iterations; i++) - { - UMat source; - Size wholesize; - Point ofs; - - if (i == 0) - { - int cols = src.cols, rows = src.rows; - src.locateROI(wholesize, ofs); - src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x); - if(src.u != dst.u) - source = src; - else - src.copyTo(source); - - src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x); - 
source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x); - } - else - { - int cols = dst.cols, rows = dst.rows; - dst.locateROI(wholesize, ofs); - dst.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x); - dst.copyTo(source); - dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x); - source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x); - } - source.locateROI(wholesize, ofs); - - if (haveExtraMat && iterations == i + 1) - kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst), - ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height, - ocl::KernelArg::ReadOnlyNoSize(extraMat)); - else - kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst), - ofs.x, ofs.y, source.cols, source.rows, wholesize.width, wholesize.height); - - if (!kernels[i].run(2, globalThreads, localThreads, false)) - return false; - } - - return true; -} - -#endif - -static void morphOp( int op, InputArray _src, OutputArray _dst, - InputArray _kernel, - Point anchor, int iterations, - int borderType, const Scalar& borderValue ) -{ - CV_INSTRUMENT_REGION(); - - Mat kernel = _kernel.getMat(); - Size ksize = !kernel.empty() ? 
kernel.size() : Size(3,3); - anchor = normalizeAnchor(anchor, ksize); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 && - borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() && - (op == MORPH_ERODE || op == MORPH_DILATE) && - anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1, - ocl_morphOp(_src, _dst, kernel, anchor, iterations, op, borderType, borderValue) ) - - if (iterations == 0 || kernel.rows*kernel.cols == 1) - { - _src.copyTo(_dst); - return; - } - - if (kernel.empty()) - { - kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2)); - anchor = Point(iterations, iterations); - iterations = 1; - } - else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols ) - { - anchor = Point(anchor.x*iterations, anchor.y*iterations); - kernel = getStructuringElement(MORPH_RECT, - Size(ksize.width + (iterations-1)*(ksize.width-1), - ksize.height + (iterations-1)*(ksize.height-1)), - anchor); - iterations = 1; - } - - Mat src = _src.getMat(); - _dst.create( src.size(), src.type() ); - Mat dst = _dst.getMat(); - - Point s_ofs; - Size s_wsz(src.cols, src.rows); - Point d_ofs; - Size d_wsz(dst.cols, dst.rows); - bool isolated = (borderType&BORDER_ISOLATED)?true:false; - borderType = (borderType&~BORDER_ISOLATED); - - if(!isolated) - { - src.locateROI(s_wsz, s_ofs); - dst.locateROI(d_wsz, d_ofs); - } - - hal::morph(op, src.type(), dst.type(), - src.data, src.step, - dst.data, dst.step, - src.cols, src.rows, - s_wsz.width, s_wsz.height, s_ofs.x, s_ofs.y, - d_wsz.width, d_wsz.height, d_ofs.x, d_ofs.y, - kernel.type(), kernel.data, kernel.step, kernel.cols, kernel.rows, anchor.x, anchor.y, - borderType, borderValue.val, iterations, - (src.isSubmatrix() && !isolated)); -} - -} - -void cv::erode( InputArray src, OutputArray dst, InputArray kernel, - Point anchor, int iterations, - int borderType, const Scalar& borderValue ) -{ - CV_INSTRUMENT_REGION(); - - morphOp( 
MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue ); -} - - -void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, - Point anchor, int iterations, - int borderType, const Scalar& borderValue ) -{ - CV_INSTRUMENT_REGION(); - - morphOp( MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue ); -} - -#ifdef HAVE_OPENCL - -namespace cv { - -static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, - InputArray kernel, Point anchor, int iterations, - int borderType, const Scalar& borderValue) -{ - _dst.createSameSize(_src, _src.type()); - bool submat = _dst.isSubmatrix(); - UMat temp; - _OutputArray _temp = submat ? _dst : _OutputArray(temp); - - switch( op ) - { - case MORPH_ERODE: - if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue )) - return false; - break; - case MORPH_DILATE: - if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue )) - return false; - break; - case MORPH_OPEN: - if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue )) - return false; - if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue )) - return false; - break; - case MORPH_CLOSE: - if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue )) - return false; - if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue )) - return false; - break; - case MORPH_GRADIENT: - if (!ocl_morphOp( _src, temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue )) - return false; - if (!ocl_morphOp( _src, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_GRADIENT, temp )) - return false; - break; - case MORPH_TOPHAT: - if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue )) - return false; - if 
(!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue, MORPH_TOPHAT, _src )) - return false; - break; - case MORPH_BLACKHAT: - if (!ocl_morphOp( _src, _temp, kernel, anchor, iterations, MORPH_DILATE, borderType, borderValue )) - return false; - if (!ocl_morphOp( _temp, _dst, kernel, anchor, iterations, MORPH_ERODE, borderType, borderValue, MORPH_BLACKHAT, _src )) - return false; - break; - default: - CV_Error( CV_StsBadArg, "unknown morphological operation" ); - } - - return true; -} - -} -#endif - -#define IPP_DISABLE_MORPH_ADV 1 -#ifdef HAVE_IPP -#if !IPP_DISABLE_MORPH_ADV -namespace cv { -static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, - InputArray _kernel, - Point anchor, int iterations, - int borderType, const Scalar& borderValue) -{ -#if defined HAVE_IPP_IW - Mat kernel = _kernel.getMat(); - Size ksize = !kernel.empty() ? kernel.size() : Size(3,3); - anchor = normalizeAnchor(anchor, ksize); - - if (iterations == 0 || kernel.rows*kernel.cols == 1) - { - _src.copyTo(_dst); - return true; - } - - if (kernel.empty()) - { - kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2)); - anchor = Point(iterations, iterations); - iterations = 1; - } - else if( iterations > 1 && countNonZero(kernel) == kernel.rows*kernel.cols ) - { - anchor = Point(anchor.x*iterations, anchor.y*iterations); - kernel = getStructuringElement(MORPH_RECT, - Size(ksize.width + (iterations-1)*(ksize.width-1), - ksize.height + (iterations-1)*(ksize.height-1)), - anchor); - iterations = 1; - } - - Mat src = _src.getMat(); - _dst.create( src.size(), src.type() ); - Mat dst = _dst.getMat(); - - Point s_ofs; - Size s_wsz(src.cols, src.rows); - Point d_ofs; - Size d_wsz(dst.cols, dst.rows); - bool isolated = (borderType&BORDER_ISOLATED)?true:false; - borderType = (borderType&~BORDER_ISOLATED); - - if(!isolated) - { - src.locateROI(s_wsz, s_ofs); - dst.locateROI(d_wsz, d_ofs); - } - - return ippMorph(op, 
src.type(), dst.type(), - src.data, src.step, - dst.data, dst.step, - src.cols, src.rows, - s_wsz.width, s_wsz.height, s_ofs.x, s_ofs.y, - d_wsz.width, d_wsz.height, d_ofs.x, d_ofs.y, - kernel.type(), kernel.data, kernel.step, kernel.cols, kernel.rows, anchor.x, anchor.y, - borderType, borderValue.val, iterations, - (src.isSubmatrix() && !isolated)); -#else - CV_UNUSED(op); CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(_kernel); CV_UNUSED(anchor); - CV_UNUSED(iterations); CV_UNUSED(borderType); CV_UNUSED(borderValue); - return false; -#endif -} -} -#endif -#endif - -void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, - InputArray _kernel, Point anchor, int iterations, - int borderType, const Scalar& borderValue ) -{ - CV_INSTRUMENT_REGION(); - - Mat kernel = _kernel.getMat(); - if (kernel.empty()) - { - kernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1)); - } -#ifdef HAVE_OPENCL - Size ksize = kernel.size(); - anchor = normalizeAnchor(anchor, ksize); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 && - anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1 && - borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue(), - ocl_morphologyEx(_src, _dst, op, kernel, anchor, iterations, borderType, borderValue)) -#endif - - Mat src = _src.getMat(), temp; - _dst.create(src.size(), src.type()); - Mat dst = _dst.getMat(); - -#if !IPP_DISABLE_MORPH_ADV - CV_IPP_RUN_FAST(ipp_morphologyEx(op, src, dst, kernel, anchor, iterations, borderType, borderValue)); -#endif - - switch( op ) - { - case MORPH_ERODE: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue ); - break; - case MORPH_DILATE: - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue ); - break; - case MORPH_OPEN: - erode( src, dst, kernel, anchor, iterations, borderType, borderValue ); - dilate( dst, dst, kernel, anchor, iterations, borderType, borderValue ); - break; - case MORPH_CLOSE: - dilate( 
src, dst, kernel, anchor, iterations, borderType, borderValue ); - erode( dst, dst, kernel, anchor, iterations, borderType, borderValue ); - break; - case MORPH_GRADIENT: - erode( src, temp, kernel, anchor, iterations, borderType, borderValue ); - dilate( src, dst, kernel, anchor, iterations, borderType, borderValue ); - dst -= temp; - break; - case MORPH_TOPHAT: - if( src.data != dst.data ) - temp = dst; - erode( src, temp, kernel, anchor, iterations, borderType, borderValue ); - dilate( temp, temp, kernel, anchor, iterations, borderType, borderValue ); - dst = src - temp; - break; - case MORPH_BLACKHAT: - if( src.data != dst.data ) - temp = dst; - dilate( src, temp, kernel, anchor, iterations, borderType, borderValue ); - erode( temp, temp, kernel, anchor, iterations, borderType, borderValue ); - dst = temp - src; - break; - case MORPH_HITMISS: - CV_Assert(src.type() == CV_8UC1); - if(countNonZero(kernel) <=0) - { - src.copyTo(dst); - break; - } - { - Mat k1, k2, e1, e2; - k1 = (kernel == 1); - k2 = (kernel == -1); - - if (countNonZero(k1) <= 0) - e1 = Mat(src.size(), src.type(), Scalar(255)); - else - erode(src, e1, k1, anchor, iterations, borderType, borderValue); - - if (countNonZero(k2) <= 0) - e2 = Mat(src.size(), src.type(), Scalar(255)); - else - { - Mat src_complement; - bitwise_not(src, src_complement); - erode(src_complement, e2, k2, anchor, iterations, borderType, borderValue); - } - dst = e1 & e2; - } - break; - default: - CV_Error( CV_StsBadArg, "unknown morphological operation" ); - } -} - -CV_IMPL IplConvKernel * -cvCreateStructuringElementEx( int cols, int rows, - int anchorX, int anchorY, - int shape, int *values ) -{ - cv::Size ksize = cv::Size(cols, rows); - cv::Point anchor = cv::Point(anchorX, anchorY); - CV_Assert( cols > 0 && rows > 0 && anchor.inside(cv::Rect(0,0,cols,rows)) && - (shape != CV_SHAPE_CUSTOM || values != 0)); - - int i, size = rows * cols; - int element_size = sizeof(IplConvKernel) + size*sizeof(int); - IplConvKernel *element 
= (IplConvKernel*)cvAlloc(element_size + 32); - - element->nCols = cols; - element->nRows = rows; - element->anchorX = anchorX; - element->anchorY = anchorY; - element->nShiftR = shape < CV_SHAPE_ELLIPSE ? shape : CV_SHAPE_CUSTOM; - element->values = (int*)(element + 1); - - if( shape == CV_SHAPE_CUSTOM ) - { - for( i = 0; i < size; i++ ) - element->values[i] = values[i]; - } - else - { - cv::Mat elem = cv::getStructuringElement(shape, ksize, anchor); - for( i = 0; i < size; i++ ) - element->values[i] = elem.ptr()[i]; - } - - return element; -} - - -CV_IMPL void -cvReleaseStructuringElement( IplConvKernel ** element ) -{ - if( !element ) - CV_Error( CV_StsNullPtr, "" ); - cvFree( element ); -} - - -static void convertConvKernel( const IplConvKernel* src, cv::Mat& dst, cv::Point& anchor ) -{ - if(!src) - { - anchor = cv::Point(1,1); - dst.release(); - return; - } - anchor = cv::Point(src->anchorX, src->anchorY); - dst.create(src->nRows, src->nCols, CV_8U); - - int i, size = src->nRows*src->nCols; - for( i = 0; i < size; i++ ) - dst.ptr()[i] = (uchar)(src->values[i] != 0); -} - - -CV_IMPL void -cvErode( const CvArr* srcarr, CvArr* dstarr, IplConvKernel* element, int iterations ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel; - CV_Assert( src.size() == dst.size() && src.type() == dst.type() ); - cv::Point anchor; - convertConvKernel( element, kernel, anchor ); - cv::erode( src, dst, kernel, anchor, iterations, cv::BORDER_REPLICATE ); -} - - -CV_IMPL void -cvDilate( const CvArr* srcarr, CvArr* dstarr, IplConvKernel* element, int iterations ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel; - CV_Assert( src.size() == dst.size() && src.type() == dst.type() ); - cv::Point anchor; - convertConvKernel( element, kernel, anchor ); - cv::dilate( src, dst, kernel, anchor, iterations, cv::BORDER_REPLICATE ); -} - - -CV_IMPL void -cvMorphologyEx( const void* srcarr, void* dstarr, void*, - IplConvKernel* element, 
int op, int iterations ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), kernel; - CV_Assert( src.size() == dst.size() && src.type() == dst.type() ); - cv::Point anchor; - IplConvKernel* temp_element = NULL; - if (!element) - { - temp_element = cvCreateStructuringElementEx(3, 3, 1, 1, CV_SHAPE_RECT); - } else { - temp_element = element; - } - convertConvKernel( temp_element, kernel, anchor ); - if (!element) - { - cvReleaseStructuringElement(&temp_element); - } - cv::morphologyEx( src, dst, op, kernel, anchor, iterations, cv::BORDER_REPLICATE ); -} - -/* End of file. */ +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace