From fcfcd4cbced5e188ffd01758150e07b2210b25ab Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 11:46:49 +0400
Subject: [PATCH 1/9] refactored box filter

---
 .../gpubgsegm/include/opencv2/gpubgsegm.hpp   |   2 +-
 modules/gpubgsegm/src/gmg.cpp                 |   4 +-
 modules/gpufilters/CMakeLists.txt             |   2 +-
 .../gpufilters/include/opencv2/gpufilters.hpp |  70 +++++--
 modules/gpufilters/perf/perf_filters.cpp      |   4 +-
 modules/gpufilters/src/filtering.cpp          | 196 +++++++++++-------
 modules/gpufilters/src/precomp.hpp            |   7 +-
 modules/gpufilters/test/test_filters.cpp      |  15 +-
 8 files changed, 193 insertions(+), 107 deletions(-)

diff --git a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
index e7a29b5763..3fe62ec94b 100644
--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@@ -321,7 +321,7 @@ private:
     GpuMat colors_;
     GpuMat weights_;
 
-    Ptr<FilterEngine_GPU> boxFilter_;
+    Ptr<gpu::Filter> boxFilter_;
     GpuMat buf_;
 };
 
diff --git a/modules/gpubgsegm/src/gmg.cpp b/modules/gpubgsegm/src/gmg.cpp
index a38cbffaca..b97f0836f4 100644
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -100,7 +100,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
     nfeatures_.setTo(cv::Scalar::all(0));
 
     if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
+        boxFilter_ = cv::gpu::createBoxFilter(CV_8UC1, -1, cv::Size(smoothingRadius, smoothingRadius));
 
     loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
 }
@@ -141,7 +141,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
     // medianBlur
     if (smoothingRadius > 0)
     {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
+        boxFilter_->apply(fgmask, buf_, stream);
         int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
         double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
         cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
diff --git a/modules/gpufilters/CMakeLists.txt b/modules/gpufilters/CMakeLists.txt
index 18f6d7f7b6..640de8c115 100644
--- a/modules/gpufilters/CMakeLists.txt
+++ b/modules/gpufilters/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Filtering")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
 
-ocv_define_module(gpufilters opencv_imgproc OPTIONAL opencv_gpuarithm)
+ocv_define_module(gpufilters opencv_imgproc opencv_gpuarithm)
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 582c55d999..5cc2ac49aa 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -48,10 +48,61 @@
 #endif
 
 #include "opencv2/core/gpu.hpp"
-#include "opencv2/core/base.hpp"
+
+#if defined __GNUC__
+    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__
+    #define __OPENCV_GPUFILTERS_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif (defined WIN32 || defined _WIN32)
+    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__ __declspec(deprecated)
+    #define __OPENCV_GPUFILTERS_DEPR_AFTER__
+#else
+    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__
+    #define __OPENCV_GPUFILTERS_DEPR_AFTER__
+#endif
 
 namespace cv { namespace gpu {
 
+class CV_EXPORTS Filter : public Algorithm
+{
+public:
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box Filter
+
+//! smooths the image using the normalized box filter
+//! supports CV_8UC1, CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
+                                       int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void boxFilter(InputArray src, OutputArray dst, int dstType,
+                                                 Size ksize, Point anchor = Point(-1,-1),
+                                                 Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void boxFilter(InputArray src, OutputArray dst, int dstType, Size ksize, Point anchor, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createBoxFilter(src.type(), dstType, ksize, anchor);
+    f->apply(src, dst, stream);
+}
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void blur(InputArray src, OutputArray dst, Size ksize,
+                                            Point anchor = Point(-1,-1),
+                                            Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createBoxFilter(src.type(), -1, ksize, anchor);
+    f->apply(src, dst, stream);
+}
+
+
+
+
+
+
+
+
 /*!
 The Base Class for 1D or Row-wise Filters
 
@@ -128,13 +179,7 @@ CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType,
 //! supports only CV_8UC1 sum type and CV_32FC1 dst type
 CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
 
-//! returns 2D box filter
-//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
 
-//! returns box filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-    const Point& anchor = Point(-1,-1));
 
 //! returns 2D morphological filter
 //! only MORPH_ERODE and MORPH_DILATE are supported
@@ -205,15 +250,7 @@ CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const
 //! returns minimum filter
 CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
 
-//! smooths the image using the normalized box filter
-//! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
 
-//! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-{
-    boxFilter(src, dst, -1, ksize, anchor, stream);
-}
 
 //! erodes the image (applies the local minimum operator)
 CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
@@ -266,4 +303,7 @@ CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize
 
 }} // namespace cv { namespace gpu {
 
+#undef __OPENCV_GPUFILTERS_DEPR_BEFORE__
+#undef __OPENCV_GPUFILTERS_DEPR_AFTER__
+
 #endif /* __OPENCV_GPUFILTERS_HPP__ */
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 64cf4cc5db..35c4a94fbb 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -70,7 +70,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
+        cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(d_src.type(), -1, cv::Size(ksize, ksize));
+
+        TEST_CYCLE() blurFilter->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst, 1);
     }
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index d40293d4ac..35df05ec6f 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -47,13 +47,13 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
+Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
 Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -70,7 +70,6 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, doub
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 
-void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_no_cuda(); }
 void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
 void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
@@ -92,20 +91,135 @@ void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&)
 
 namespace
 {
-    inline void normalizeAnchor(int& anchor, int ksize)
+    void normalizeAnchor(int& anchor, int ksize)
     {
         if (anchor < 0)
             anchor = ksize >> 1;
 
-        CV_Assert(0 <= anchor && anchor < ksize);
+        CV_Assert( 0 <= anchor && anchor < ksize );
     }
 
-    inline void normalizeAnchor(Point& anchor, const Size& ksize)
+    void normalizeAnchor(Point& anchor, Size ksize)
     {
         normalizeAnchor(anchor.x, ksize.width);
         normalizeAnchor(anchor.y, ksize.height);
     }
+}
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box Filter
+
+namespace
+{
+    class NPPBoxFilter : public Filter
+    {
+    public:
+        NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppFilterBox_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
+                                            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
+
+        Size ksize_;
+        Point anchor_;
+        int type_;
+        nppFilterBox_t func_;
+        int borderMode_;
+        Scalar borderVal_;
+        GpuMat srcBorder_;
+    };
+
+    NPPBoxFilter::NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
+        ksize_(ksize), anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        static const nppFilterBox_t funcs[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
+
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+        CV_Assert( dstType == srcType );
+
+        normalizeAnchor(anchor_, ksize);
+
+        func_ = funcs[CV_MAT_CN(srcType)];
+    }
+
+    void NPPBoxFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                           dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
+{
+    if (dstType < 0)
+        dstType = srcType;
+
+    return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+namespace
+{
     inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
     {
         if (roi == Rect(0,0,-1,-1))
@@ -329,74 +443,6 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy
     return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Box Filter
-
-namespace
-{
-    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPBoxFilter : public BaseFilter_GPU
-    {
-        NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterBox_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPBoxFilter(ksize, anchor, nppFilterBox_callers[CV_MAT_CN(srcType)]));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
-{
-    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
-    return createFilter2D_GPU(boxFilter, srcType, dstType);
-}
-
-void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
-{
-    int sdepth = src.depth(), cn = src.channels();
-    if( ddepth < 0 )
-        ddepth = sdepth;
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
-
-    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
-    f->apply(src, dst, Rect(0,0,-1,-1), stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Morphology Filter
 
@@ -633,7 +679,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
         erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
         break;
 
-#ifdef HAVE_OPENCV_GPUARITHM
     case MORPH_GRADIENT:
         erode(src, buf2, kernel, buf1, anchor, iterations, stream);
         dilate(src, dst, kernel, buf1, anchor, iterations, stream);
@@ -651,7 +696,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
         erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
         gpu::subtract(buf2, src, dst, GpuMat(), -1, stream);
         break;
-#endif
 
     default:
         CV_Error(cv::Error::StsBadArg, "unknown morphological operation");
diff --git a/modules/gpufilters/src/precomp.hpp b/modules/gpufilters/src/precomp.hpp
index 3add0f2af1..c3d5e020d2 100644
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@@ -46,14 +46,9 @@
 #include <limits>
 
 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 
 #include "opencv2/core/private.gpu.hpp"
 
-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_GPUARITHM
-#  include "opencv2/gpuarithm.hpp"
-#endif
-
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 5adcd87a41..a63d92b3d7 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -70,13 +70,14 @@ namespace
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Blur
 
-PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
     int type;
     cv::Size ksize;
     cv::Point anchor;
+    int borderType;
     bool useRoi;
 
     virtual void SetUp()
@@ -86,7 +87,8 @@ PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, Use
         type = GET_PARAM(2);
         ksize = GET_PARAM(3);
         anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -96,13 +98,15 @@ GPU_TEST_P(Blur, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(src.type(), -1, ksize, anchor, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::blur(loadMat(src, useRoi), dst, ksize, anchor);
+    blurFilter->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
-    cv::blur(src, dst_gold, ksize, anchor);
+    cv::blur(src, dst_gold, ksize, anchor, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
@@ -111,6 +115,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
     testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
 /////////////////////////////////////////////////////////////////////////////////////////////////

From 1eedc6c42aaf7b5ac5cd5edca1b0a0367c7eb3f6 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 11:51:47 +0400
Subject: [PATCH 2/9] refactored Linear Filter

---
 .../gpufilters/include/opencv2/gpufilters.hpp |  41 ++-
 modules/gpufilters/perf/perf_filters.cpp      |  81 +++--
 modules/gpufilters/src/cuda/filter2d.cu       | 189 +++++------
 modules/gpufilters/src/filtering.cpp          | 319 ++++++++----------
 modules/gpufilters/test/test_filters.cpp      | 210 ++++++------
 samples/gpu/performance/tests.cpp             |   5 +-
 6 files changed, 416 insertions(+), 429 deletions(-)

diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 5cc2ac49aa..32d3403d53 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -96,6 +96,34 @@ inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stre
     f->apply(src, dst, stream);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
+
+//! non-separable linear 2D filter
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                            int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel,
+                                                Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT,
+                                                Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel, Point anchor, int borderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, anchor, borderType);
+    f->apply(src, dst, stream);
+}
+
+
+
+
+
+
+
+//! applies Laplacian operator to the image
+//! supports only ksize = 1 and ksize = 3
+CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+
+
 
 
 
@@ -194,13 +222,7 @@ CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, co
 CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
     const Point& anchor = Point(-1,-1), int iterations = 1);
 
-//! returns 2D filter with the specified kernel
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 
-//! returns the non-separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
 
 //! returns the primitive row filter with the specified kernel.
 //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
@@ -269,9 +291,6 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
 CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
                              Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
-//! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
 //! applies separable 2D linear filter to the image
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
                             Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
@@ -297,10 +316,6 @@ CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double
 CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
                              int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
-//! applies Laplacian operator to the image
-//! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
 }} // namespace cv { namespace gpu {
 
 #undef __OPENCV_GPUFILTERS_DEPR_BEFORE__
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 35c4a94fbb..3d3f587558 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -86,6 +86,51 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+
+        TEST_CYCLE() filter2D->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+
+
+
+
+
+
+
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -330,39 +375,3 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
         CPU_SANITY_CHECK(dst);
     }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    declare.in(kernel, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
diff --git a/modules/gpufilters/src/cuda/filter2d.cu b/modules/gpufilters/src/cuda/filter2d.cu
index 80c93c54ed..4e913124df 100644
--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@@ -48,111 +48,104 @@
 
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst,
+                             const float* __restrict__ kernel,
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY)
     {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
 
-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        sum_t res = VecTraits<sum_t>::all(0);
+        int kInd = 0;
+
+        for (int i = 0; i < kHeight; ++i)
         {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
-
-            for (int i = 0; i < kHeight; ++i)
-            {
-                for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
-            }
-
-            dst(y, x) = saturate_cast<D>(res);
+            for (int j = 0; j < kWidth; ++j)
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++];
         }
 
-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(16, 16); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    if (stream == 0) \
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
-
-        #undef IMPLEMENT_FILTER2D_TEX_READER
-
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                Filter2DCaller<T, D, BrdConstant>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
-                Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
-            };
-
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        dst(y, x) = saturate_cast<D>(res);
     }
+
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+            } \
+        }; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+        { \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+            { \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                dim3 block(16, 16); \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
+                cudaSafeCall( cudaGetLastError() ); \
+                if (stream == 0) \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+            } \
+        };
+
+    IMPLEMENT_FILTER2D_TEX_READER(uchar);
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(float);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+    #undef IMPLEMENT_FILTER2D_TEX_READER
+
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
+        };
+
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+    }
+
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 35df05ec6f..3135b599a7 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -49,6 +49,8 @@ using namespace cv::gpu;
 
 Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
+Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
 Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -57,8 +59,6 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) {
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -76,7 +76,6 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
@@ -188,6 +187,138 @@ Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point
     return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream);
+}}}
+
+namespace
+{
+    class LinearFilter : public Filter
+    {
+    public:
+        LinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef void (*filter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                                   int kWidth, int kHeight, int anchorX, int anchorY,
+                                   int borderMode, const float* borderValue, cudaStream_t stream);
+
+        GpuMat kernel_;
+        Point anchor_;
+        int type_;
+        filter2D_t func_;
+        int borderMode_;
+        Scalar_<float> borderVal_;
+    };
+
+    LinearFilter::LinearFilter(int srcType, int dstType, InputArray _kernel, Point anchor, int borderMode, Scalar borderVal) :
+        anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int scn = CV_MAT_CN(srcType);
+
+        Mat kernel = _kernel.getMat();
+
+        CV_Assert( sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F );
+        CV_Assert( scn == 1 || scn == 4 );
+        CV_Assert( dstType == srcType );
+        CV_Assert( kernel.channels() == 1 );
+        CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
+
+        Mat kernel32F;
+        kernel.convertTo(kernel32F, CV_32F);
+
+        kernel_ = gpu::createContinuous(kernel.size(), CV_32FC1);
+        kernel_.upload(kernel32F);
+
+        normalizeAnchor(anchor_, kernel.size());
+
+        switch (srcType)
+        {
+        case CV_8UC1:
+            func_ = cudev::filter2D<uchar, uchar>;
+            break;
+        case CV_8UC4:
+            func_ = cudev::filter2D<uchar4, uchar4>;
+            break;
+        case CV_16UC1:
+            func_ = cudev::filter2D<ushort, ushort>;
+            break;
+        case CV_16UC4:
+            func_ = cudev::filter2D<ushort4, ushort4>;
+            break;
+        case CV_32FC1:
+            func_ = cudev::filter2D<float, float>;
+            break;
+        case CV_32FC4:
+            func_ = cudev::filter2D<float4, float4>;
+            break;
+        }
+    }
+
+    void LinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        Point ofs;
+        Size wholeSize;
+        src.locateROI(wholeSize, ofs);
+
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart);
+
+        func_(srcWhole, ofs.x, ofs.y, dst, kernel_.ptr<float>(),
+              kernel_.cols, kernel_.rows, anchor_.x, anchor_.y,
+              borderMode_, borderVal_.val, StreamAccessor::getStream(_stream));
+    }
+}
+
+Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal)
+{
+    if (dstType < 0)
+        dstType = srcType;
+
+    return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
+}
+
+
+
+
+
+
+
+
+
+
+void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
+{
+    CV_Assert(ksize == 1 || ksize == 3);
+
+    static const int K[2][9] =
+    {
+        {0, 1, 0, 1, -4, 1, 0, 1, 0},
+        {2, 0, 2, 0, -8, 0, 2, 0, 2}
+    };
+    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
+    if (scale != 1)
+        kernel *= scale;
+
+    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, Point(-1,-1), borderType);
+    f->apply(src, dst, stream);
+}
+
 
 
 
@@ -702,172 +833,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear Filter
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
-
-    struct NPPLinearFilter : public BaseFilter_GPU
-    {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter2D_t func;
-    };
-
-    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                                   int borderMode, const float* borderValue, cudaStream_t stream);
-
-    struct GpuFilter2D : public BaseFilter_GPU
-    {
-        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
-            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
-        {
-        }
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-        {
-            using namespace cv::gpu::cudev::imgproc;
-
-            Point ofs;
-            Size wholeSize;
-            src.locateROI(wholeSize, ofs);
-            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-
-            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
-            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
-        }
-
-        gpuFilter2D_t func;
-        GpuMat kernel;
-        int brd_type;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    int sdepth = CV_MAT_DEPTH(srcType);
-    int scn = CV_MAT_CN(srcType);
-
-    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
-    CV_Assert(scn == 1 || scn == 4);
-    CV_Assert(dstType == srcType);
-    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
-
-    Size ksize = kernel.size();
-
-#if 0
-    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
-    {
-        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
-
-        GpuMat gpu_krnl;
-        int nDivisor;
-        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
-
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
-    }
-#endif
-
-    CV_Assert(ksize.width * ksize.height <= 16 * 16);
-
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl, CV_32F);
-
-    normalizeAnchor(anchor, ksize);
-
-    gpuFilter2D_t func = 0;
-
-    switch (srcType)
-    {
-    case CV_8UC1:
-        func = filter2D_gpu<uchar, uchar>;
-        break;
-    case CV_8UC4:
-        func = filter2D_gpu<uchar4, uchar4>;
-        break;
-    case CV_16UC1:
-        func = filter2D_gpu<ushort, ushort>;
-        break;
-    case CV_16UC4:
-        func = filter2D_gpu<ushort4, ushort4>;
-        break;
-    case CV_32FC1:
-        func = filter2D_gpu<float, float>;
-        break;
-    case CV_32FC4:
-        func = filter2D_gpu<float4, float4>;
-        break;
-    }
-
-    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, brd_type));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
-{
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
-
-    return createFilter2D_GPU(linearFilter, srcType, dstType);
-}
-
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
-{
-    if (ddepth < 0)
-        ddepth = src.depth();
-
-    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
-
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
-
-    dst.create(src.size(), dst_type);
-
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter
 
@@ -1208,22 +1173,6 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
     sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }
 
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    CV_Assert(ksize == 1 || ksize == 3);
-
-    static const int K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
-    if (scale != 1)
-        kernel *= scale;
-
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index a63d92b3d7..6d6da7e4b9 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -118,6 +118,121 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Filter2D, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
+
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+
+
+
+
+
+
+
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Laplacian, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
+
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
+    WHOLE_SUBMAT));
+
+
+
+
+
+
+
+
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -332,49 +447,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -527,56 +599,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
     testing::Values(Iterations(1), Iterations(2), Iterations(3)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 97eb7a82aa..f6ace6d771 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -961,10 +961,11 @@ TEST(filter2D)
             gpu::GpuMat d_src(src);
             gpu::GpuMat d_dst;
 
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            Ptr<gpu::Filter> filter2D = gpu::createLinearFilter(d_src.type(), -1, kernel);
+            filter2D->apply(d_src, d_dst);
 
             GPU_ON;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            filter2D->apply(d_src, d_dst);
             GPU_OFF;
         }
     }

From ee7eb1b807d1201bdd5472171c0fd4e74a99b185 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 12:02:59 +0400
Subject: [PATCH 3/9] refactored Laplacian filter

---
 .../gpufilters/include/opencv2/gpufilters.hpp | 18 +++--
 modules/gpufilters/perf/perf_filters.cpp      | 68 ++++++++++---------
 modules/gpufilters/src/filtering.cpp          | 30 ++++----
 modules/gpufilters/test/test_filters.cpp      | 16 +----
 4 files changed, 63 insertions(+), 69 deletions(-)

diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 32d3403d53..1133ae01d9 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -113,17 +113,23 @@ inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray ker
     f->apply(src, dst, stream);
 }
 
-
-
-
-
-
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
 
 //! applies Laplacian operator to the image
 //! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
+                                            int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Laplacian(InputArray src, OutputArray dst, int ddepth,
+                                                 int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT,
+                                                 Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
 
+inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createLaplacianFilter(src.type(), ddepth, ksize, scale, borderType);
+    f->apply(src, dst, stream);
+}
 
 
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 3d3f587558..efa99696a0 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -124,6 +124,41 @@ PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(d_src.type(), -1, ksize);
+
+        TEST_CYCLE() laplacian->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 
 
 
@@ -232,39 +267,6 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // Erode
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 3135b599a7..305ecec235 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -51,6 +51,8 @@ Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw
 
 Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
+Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
 Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -84,7 +86,6 @@ void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) {
 void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
 void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -293,30 +294,24 @@ Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray ker
     return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
 
-
-
-
-
-
-
-
-
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
+Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize, double scale, int borderMode, Scalar borderVal)
 {
-    CV_Assert(ksize == 1 || ksize == 3);
+    CV_Assert( ksize == 1 || ksize == 3 );
 
-    static const int K[2][9] =
+    static const float K[2][9] =
     {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
+        {0.0f, 1.0f, 0.0f, 1.0f, -4.0f, 1.0f, 0.0f, 1.0f, 0.0f},
+        {2.0f, 0.0f, 2.0f, 0.0f, -8.0f, 0.0f, 2.0f, 0.0f, 2.0f}
     };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
+
+    Mat kernel(3, 3, CV_32FC1, (void*)K[ksize == 3]);
     if (scale != 1)
         kernel *= scale;
 
-    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, Point(-1,-1), borderType);
-    f->apply(src, dst, stream);
+    return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
 
 
@@ -347,6 +342,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
 
 
 
+
 
 
 namespace
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 6d6da7e4b9..42018424d1 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -170,18 +170,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
-
-
-
-
-
-
-
-
-
-
-
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian
 
@@ -209,8 +197,10 @@ GPU_TEST_P(Laplacian, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(src.type(), -1, ksize.width);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
+    laplacian->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Laplacian(src, dst_gold, -1, ksize.width);

From 12ae11e2ff56ac9be36708b356601f1f62aba829 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 14:47:42 +0400
Subject: [PATCH 4/9] refactored Separable Linear Filters

---
 .../include/opencv2/gpufeatures2d.hpp         |   2 +-
 modules/gpufeatures2d/src/orb.cpp             |   4 +-
 .../gpufilters/include/opencv2/gpufilters.hpp | 146 ++--
 modules/gpufilters/perf/perf_filters.cpp      |  29 +-
 modules/gpufilters/src/filtering.cpp          | 777 +++++-------------
 modules/gpufilters/test/test_filters.cpp      | 109 ++-
 .../gpuimgproc/include/opencv2/gpuimgproc.hpp |   2 +-
 modules/gpuimgproc/src/canny.cpp              |   8 +-
 modules/gpuimgproc/src/corners.cpp            |  15 +-
 modules/superres/src/btv_l1_gpu.cpp           |   8 +-
 .../gpu-basics-similarity.cpp                 |  24 +-
 samples/gpu/performance/tests.cpp             |   6 +-
 12 files changed, 427 insertions(+), 703 deletions(-)

diff --git a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
index 0c821745f9..cc73da9d9e 100644
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@@ -351,7 +351,7 @@ private:
 
     FAST_GPU fastDetector_;
 
-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;
 
     GpuMat d_keypoints_;
 };
diff --git a/modules/gpufeatures2d/src/orb.cpp b/modules/gpufeatures2d/src/orb.cpp
index 495ca3f6ef..7cb1cbecc1 100644
--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg
 
     pattern_.upload(h_pattern);
 
-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
 
     blurForDescriptor = false;
 }
@@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
         {
             // preprocess the resized image
             ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
         }
 
         computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 1133ae01d9..bf7a8e928b 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -131,6 +131,83 @@ inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, do
     f->apply(src, dst, stream);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
+
+//! separable linear 2D filter
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY,
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                                   Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY, Point anchor, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createSeparableLinearFilter(src.type(), ddepth, kernelX, kernelY, anchor, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+//! the generalized Deriv operator
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! the Sobel operator
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! the vertical or horizontal Scharr operator
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+                                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                             Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createSobelFilter(src.type(), ddepth, dx, dy, ksize, scale, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale = 1,
+                                              int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createScharrFilter(src.type(), ddepth, dx, dy, scale, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+//! smooths the image using Gaussian filter
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
+                                            double sigma1, double sigma2 = 0,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void GaussianBlur(InputArray src, OutputArray dst, Size ksize,
+                                                    double sigma1, double sigma2 = 0,
+                                                    int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+
+
+
+
+
 
 
 
@@ -196,14 +273,7 @@ public:
     virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
 };
 
-//! returns the non-separable filter engine with the specified filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
 
-//! returns the separable filter engine with the specified filters
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
 
 //! returns horizontal 1D box filter
 //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -230,47 +300,7 @@ CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, co
 
 
 
-//! returns the primitive row filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
 
-//! returns the primitive column filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-
-//! returns filter engine for the generalized Sobel operator
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns the Gaussian filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
 //! returns maximum filter
 CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@@ -297,30 +327,8 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
 CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
                              Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
-//! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
 
-//! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
-//! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index efa99696a0..9c7dcb8b43 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -159,13 +159,6 @@ PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(C
     }
 }
 
-
-
-
-
-
-
-
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -184,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+
+        TEST_CYCLE() sobel->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -217,9 +211,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+
+        TEST_CYCLE() scharr->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -251,9 +246,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+
+        TEST_CYCLE() gauss->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -267,6 +263,13 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
+
+
+
+
+
+
+
 //////////////////////////////////////////////////////////////////////
 // Erode
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 305ecec235..c4de574408 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -53,22 +53,24 @@ Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar
 
 Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int, int, InputArray, InputArray, Point, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createDerivFilter(int, int, int, int, int, bool, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createSobelFilter(int, int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+
+
+
+
+
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 
@@ -78,14 +80,7 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
+
 
 #else
 
@@ -185,6 +180,8 @@ Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point
     if (dstType < 0)
         dstType = srcType;
 
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
     return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
@@ -291,6 +288,8 @@ Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray ker
     if (dstType < 0)
         dstType = srcType;
 
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
     return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
 }
 
@@ -314,7 +313,198 @@ Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize,
     return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
 
+namespace filter
+{
+    template <typename T, typename D>
+    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+
+    template <typename T, typename D>
+    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+}
+
+namespace
+{
+    class SeparableLinearFilter : public Filter
+    {
+    public:
+        SeparableLinearFilter(int srcType, int dstType,
+                              InputArray rowKernel, InputArray columnKernel,
+                              Point anchor, int rowBorderMode, int columnBorderMode);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+
+        int srcType_, bufType_, dstType_;
+        GpuMat rowKernel_, columnKernel_;
+        func_t rowFilter_, columnFilter_;
+        Point anchor_;
+        int rowBorderMode_, columnBorderMode_;
+
+        GpuMat buf_;
+    };
+
+    SeparableLinearFilter::SeparableLinearFilter(int srcType, int dstType,
+                                                 InputArray _rowKernel, InputArray _columnKernel,
+                                                 Point anchor, int rowBorderMode, int columnBorderMode) :
+        srcType_(srcType), dstType_(dstType), anchor_(anchor), rowBorderMode_(rowBorderMode), columnBorderMode_(columnBorderMode)
+    {
+        static const func_t rowFilterFuncs[7][4] =
+        {
+            {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
+            {0, 0, 0, 0},
+            {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
+            {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
+            {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
+            {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+            {0, 0, 0, 0}
+        };
+
+        static const func_t columnFilterFuncs[7][4] =
+        {
+            {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+            {0, 0, 0, 0},
+            {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
+            {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+            {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
+            {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+            {0, 0, 0, 0}
+        };
+
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int cn = CV_MAT_CN(srcType);
+        const int ddepth = CV_MAT_DEPTH(dstType);
+
+        Mat rowKernel = _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.getMat();
+
+        CV_Assert( sdepth <= CV_64F && cn <= 4 );
+        CV_Assert( rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
+        CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
+
+        Mat kernel32F;
+
+        rowKernel.convertTo(kernel32F, CV_32F);
+        rowKernel_.upload(kernel32F.reshape(1, 1));
+
+        columnKernel.convertTo(kernel32F, CV_32F);
+        columnKernel_.upload(kernel32F.reshape(1, 1));
+
+        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
+        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
+
+        normalizeAnchor(anchor_.x, rowKernel_.cols);
+        normalizeAnchor(anchor_.y, columnKernel_.cols);
+
+        bufType_ = CV_MAKE_TYPE(CV_32F, cn);
+
+        rowFilter_ = rowFilterFuncs[sdepth][cn - 1];
+        CV_Assert( rowFilter_ != 0 );
+
+        columnFilter_ = columnFilterFuncs[ddepth][cn - 1];
+        CV_Assert( columnFilter_ != 0 );
+    }
+
+    void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        ensureSizeIsEnough(src.size(), bufType_, buf_);
+
+        DeviceInfo devInfo;
+        const int cc = devInfo.major() * 10 + devInfo.minor();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
+        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
+    }
+}
+
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor, int rowBorderMode, int columnBorderMode)
+{
+    if (dstType < 0)
+        dstType = srcType;
+
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
+    if (columnBorderMode < 0)
+        columnBorderMode = rowBorderMode;
+
+    return new SeparableLinearFilter(srcType, dstType, rowKernel, columnKernel, anchor, rowBorderMode, columnBorderMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+Ptr<Filter> cv::gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize, double scale, int rowBorderMode, int columnBorderMode)
+{
+    Mat kx, ky;
+    getDerivKernels(kx, ky, dx, dy, ksize, normalize, CV_32F);
+
+    if (scale != 1)
+    {
+        // usually the smoothing part is the slowest to compute,
+        // so try to scale it instead of the faster differenciating part
+        if (dx == 0)
+            kx *= scale;
+        else
+            ky *= scale;
+    }
+
+    return gpu::createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1, -1), rowBorderMode, columnBorderMode);
+}
+
+Ptr<Filter> cv::gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize, double scale, int rowBorderMode, int columnBorderMode)
+{
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, ksize, false, scale, rowBorderMode, columnBorderMode);
+}
+
+Ptr<Filter> cv::gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale, int rowBorderMode, int columnBorderMode)
+{
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, -1, false, scale, rowBorderMode, columnBorderMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2, int rowBorderMode, int columnBorderMode)
+{
+    const int depth = CV_MAT_DEPTH(srcType);
+
+    if (sigma2 <= 0)
+        sigma2 = sigma1;
+
+    // automatic detection of kernel size from sigma
+    if (ksize.width <= 0 && sigma1 > 0)
+        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+    if (ksize.height <= 0 && sigma2 > 0)
+        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+
+    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
+
+    sigma1 = std::max(sigma1, 0.0);
+    sigma2 = std::max(sigma2, 0.0);
+
+    Mat kx = getGaussianKernel(ksize.width, sigma1, CV_32F);
+    Mat ky;
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
+        ky = kx;
+    else
+        ky = getGaussianKernel(ksize.height, sigma2, CV_32F);
+
+    return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
+}
 
 
 
@@ -377,128 +567,6 @@ namespace
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-namespace
-{
-    struct Filter2DEngine_GPU : public FilterEngine_GPU
-    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
-            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
-        {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-        int srcType, dstType;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-
-namespace
-{
-    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
-    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf;
-        }
-
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_,
-                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf_;
-        }
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            ensureSizeIsEnough(src_size, bufType, *pbuf);
-
-            normalizeROI(roi, ksize, anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-            GpuMat bufROI = (*pbuf)(roi);
-
-            (*rowFilter)(srcROI, bufROI, stream);
-            (*columnFilter)(bufROI, dstROI, stream);
-        }
-
-        Ptr<BaseRowFilter_GPU> rowFilter;
-        Ptr<BaseColumnFilter_GPU> columnFilter;
-
-        int srcType, bufType, dstType;
-
-        Size ksize;
-        Point anchor;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // 1D Sum Filter
 
@@ -829,433 +897,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Separable Linear Filter
-
-namespace filter
-{
-    template <typename T, typename D>
-    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-namespace
-{
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
-        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-
-    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    struct NppLinearRowFilter : public BaseRowFilter_GPU
-    {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearRowFilter : public BaseRowFilter_GPU
-    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        {0, 0, 0, 0},
-        {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-        {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
-        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
-    };
-
-    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_row_krnl;
-        int nDivisor;
-        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_row_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, npp_funcs[CV_MAT_CN(srcType)]));
-    }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int sdepth = CV_MAT_DEPTH(srcType);
-    const int cn = CV_MAT_CN(srcType);
-    CV_Assert( sdepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    const gpuFilter1D_t func = funcs[sdepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_row_krnl;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);
-
-    const int ksize = gpu_row_krnl.cols;
-    CV_Assert( ksize > 0 && ksize <= 32 );
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, borderType));
-}
-
-namespace
-{
-    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            if (ksize > 16 && cc < 20)
-                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");
-
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
-        {0, 0, 0, 0},
-        {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
-        {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
-        {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
-        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
-    };
-
-    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_col_krnl;
-        int nDivisor;
-        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_col_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, npp_funcs[CV_MAT_CN(bufType)]));
-    }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int ddepth = CV_MAT_DEPTH(dstType);
-    const int cn = CV_MAT_CN(dstType);
-    CV_Assert( ddepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    gpuFilter1D_t func = funcs[ddepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_col_krnl;
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);
-
-    const int ksize = gpu_col_krnl.cols;
-    CV_Assert(ksize > 0 && ksize <= 32);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, borderType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                          Point anchor, int rowBorderType, int columnBorderType)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                          Point anchor, int rowBorderType, int columnBorderType,
-                          Stream& stream)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Deriv Filter
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-
-    if (scale != 1)
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if (dx == 0)
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-
-    if( scale != 1 )
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Gaussian Filter
-
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    if (ksize.width == 1 && ksize.height == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    if (ksize.width == 1 && ksize.height == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
 
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 42018424d1..e052ede6cb 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -215,13 +215,74 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
     testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
     WHOLE_SUBMAT));
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter
 
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
 
+    int type;
 
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
 
+        cv::gpu::setDevice(devInfo.deviceID());
 
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
 
+GPU_TEST_P(SeparableLinearFilter, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(Size(ksize.width, 1), CV_32FC1, 0.0, 1.0);
+    cv::Mat columnKernel = randomMat(Size(ksize.height, 1), CV_32FC1, 0.0, 1.0);
 
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)),
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Sobel
@@ -265,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@@ -328,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@@ -387,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
     double sigma1 = randomDouble(0.1, 1.0);
     double sigma2 = randomDouble(0.1, 1.0);
 
-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);
 
-        cv::Mat dst_gold;
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    gauss->apply(loadMat(src, useRoi), dst);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
-    }
+    cv::Mat dst_gold;
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@@ -437,6 +489,15 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
+
+
+
+
+
+
+
+
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index cf1b8e6706..3fe9f82f4c 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
     GpuMat mag;
     GpuMat map;
     GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };
 
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
diff --git a/modules/gpuimgproc/src/canny.cpp b/modules/gpuimgproc/src/canny.cpp
index 8d361fe50f..9a33575648 100644
--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 
         if (apperture_size != 3)
         {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
         }
     }
 
@@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
     }
     else
     {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);
 
         calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
diff --git a/modules/gpuimgproc/src/corners.cpp b/modules/gpuimgproc/src/corners.cpp
index 44dc1505d4..824a3308ee 100644
--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@@ -70,6 +70,8 @@ namespace
 {
     void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
     {
+        (void) buf;
+
         double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
 
         if (ksize < 0)
@@ -83,16 +85,21 @@ namespace
         Dx.create(src.size(), CV_32F);
         Dy.create(src.size(), CV_32F);
 
+        Ptr<gpu::Filter> filterDx, filterDy;
+
         if (ksize > 0)
         {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
         }
         else
         {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
         }
+
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
     }
 }
 
diff --git a/modules/superres/src/btv_l1_gpu.cpp b/modules/superres/src/btv_l1_gpu.cpp
index 6813187c45..7b2ad73700 100644
--- a/modules/superres/src/btv_l1_gpu.cpp
+++ b/modules/superres/src/btv_l1_gpu.cpp
@@ -230,7 +230,7 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
-        std::vector<Ptr<FilterEngine_GPU> > filters_;
+        std::vector<Ptr<gpu::Filter> > filters_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
@@ -299,7 +299,7 @@ namespace
         {
             filters_.resize(src.size());
             for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                filters_[i] = gpu::createGaussianFilter(src[0].type(), -1, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
             curBlurKernelSize_ = blurKernelSize_;
             curBlurSigma_ = blurSigma_;
             curSrcType_ = src[0].type();
@@ -346,7 +346,7 @@ namespace
                 // a = M * Ih
                 gpu::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
                 // b = HM * Ih
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // c = DHF * Ih
                 gpu::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST, streams_[k]);
 
@@ -355,7 +355,7 @@ namespace
                 // a = Dt * diff
                 upscale(c_[k], a_[k], scale_, streams_[k]);
                 // b = HtDt * diff
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // diffTerm = MtHtDt * diff
                 gpu::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
             }
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 87b5255990..1815cc6dec 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -308,6 +308,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
     gpu::split(tmp2, vI2);
     Scalar mssim;
 
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
+
     for( int i = 0; i < gI1.channels(); ++i )
     {
         gpu::GpuMat I2_2, I1_2, I1_I2;
@@ -318,8 +320,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         /*************************** END INITS **********************************/
         gpu::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
-        gpu::GaussianBlur(vI1[i], mu1, Size(11, 11), 1.5);
-        gpu::GaussianBlur(vI2[i], mu2, Size(11, 11), 1.5);
+        gauss->apply(vI1[i], mu1);
+        gauss->apply(vI2[i], mu2);
 
         gpu::GpuMat mu1_2, mu2_2, mu1_mu2;
         gpu::multiply(mu1, mu1, mu1_2);
@@ -328,13 +330,13 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         gpu::GpuMat sigma1_2, sigma2_2, sigma12;
 
-        gpu::GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
+        gauss->apply(I1_2, sigma1_2);
         gpu::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
 
-        gpu::GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
+        gauss->apply(I2_2, sigma2_2);
         gpu::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
 
-        gpu::GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
+        gauss->apply(I1_I2, sigma12);
         gpu::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
 
         ///////////////////////////////// FORMULA ////////////////////////////////
@@ -375,7 +377,7 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
     gpu::split(b.t2, b.vI2, stream);
     Scalar mssim;
 
-    gpu::GpuMat buf;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
@@ -383,22 +385,22 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
         gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
         gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
-        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
-        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI1[i], b.mu1, stream);
+        gauss->apply(b.vI2[i], b.mu2, stream);
 
         gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
         gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
         gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
-        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_2, b.sigma1_2, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
 
-        gpu::GaussianBlur(b.I2_2, b.sigma2_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I2_2, b.sigma2_2, stream);
         gpu::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, gpu::GpuMat(), -1, stream);
         //b.sigma2_2 -= b.mu2_2;
 
-        gpu::GaussianBlur(b.I1_I2, b.sigma12, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_I2, b.sigma12, stream);
         gpu::subtract(b.sigma12, b.mu1_mu2, b.sigma12, gpu::GpuMat(), -1, stream);
         //b.sigma12 -= b.mu1_mu2;
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index f6ace6d771..cbe50a40f6 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -929,10 +929,12 @@ TEST(GaussianBlur)
         gpu::GpuMat d_dst(src.size(), src.type());
         gpu::GpuMat d_buf;
 
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(3, 3), 1);
+
+        gauss->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        gauss->apply(d_src, d_dst);
         GPU_OFF;
     }
 }

From 5720eaf35472b6b40985792c7a9c338b74ed1652 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 16:03:16 +0400
Subject: [PATCH 5/9] refactored Morphology Filters

---
 modules/gpubgsegm/src/fgd.cpp                 |  16 +-
 .../gpufilters/include/opencv2/gpufilters.hpp |  61 +-
 modules/gpufilters/perf/perf_filters.cpp      |  23 +-
 modules/gpufilters/src/filtering.cpp          | 559 +++++++++---------
 modules/gpufilters/test/test_filters.cpp      |  21 +-
 samples/gpu/morphology.cpp                    | 226 ++++---
 samples/gpu/performance/tests.cpp             |   6 +-
 7 files changed, 503 insertions(+), 409 deletions(-)

diff --git a/modules/gpubgsegm/src/fgd.cpp b/modules/gpubgsegm/src/fgd.cpp
index 1b4038304a..fb14ff172a 100644
--- a/modules/gpubgsegm/src/fgd.cpp
+++ b/modules/gpubgsegm/src/fgd.cpp
@@ -228,11 +228,10 @@ private:
     cv::gpu::GpuMat countBuf_;
 
     cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
     cv::gpu::GpuMat filterBrd_;
 
-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+    cv::Ptr<cv::gpu::Filter> dilateFilter_;
+    cv::Ptr<cv::gpu::Filter> erodeFilter_;
 
     CvMemStorage* storage_;
 };
@@ -305,8 +304,8 @@ void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, cons
         cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
         cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
 
-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+        dilateFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, CV_8UC1, kernel, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, kernel, anchor);
     }
 }
 
@@ -326,7 +325,6 @@ void cv::gpu::FGDStatModel::Impl::release()
     countBuf_.release();
 
     buf_.release();
-    filterBuf_.release();
     filterBrd_.release();
 }
 
@@ -488,14 +486,14 @@ namespace
 
 namespace
 {
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::Filter>& filter, cv::Scalar brdVal)
     {
         cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
+        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst);
     }
 
     void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          cv::Ptr<cv::gpu::Filter>& erodeFilter, cv::Ptr<cv::gpu::Filter>& dilateFilter,
                           const cv::gpu::FGDStatModel::Params& params)
     {
         const int brd = params.perform_morphing;
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index bf7a8e928b..7595bd5439 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -48,6 +48,7 @@
 #endif
 
 #include "opencv2/core/gpu.hpp"
+#include "opencv2/imgproc.hpp"
 
 #if defined __GNUC__
     #define __OPENCV_GPUFILTERS_DEPR_BEFORE__
@@ -203,8 +204,42 @@ inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sig
     f->apply(src, dst, stream);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
 
+//! returns 2D morphological filter
+//! supports CV_8UC1 and CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
 
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void erode(InputArray src, OutputArray dst, InputArray kernel,
+                                             Point anchor = Point(-1, -1), int iterations = 1,
+                                             Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void erode(InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(MORPH_ERODE, src.type(), kernel, anchor, iterations);
+    f->apply(src, dst, stream);
+}
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void dilate(InputArray src, OutputArray dst, InputArray kernel,
+                                              Point anchor = Point(-1, -1), int iterations = 1,
+                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void dilate(InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(MORPH_DILATE, src.type(), kernel, anchor, iterations);
+    f->apply(src, dst, stream);
+}
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void morphologyEx(InputArray src, OutputArray dst, int op,
+                                                    InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1,
+                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void morphologyEx(InputArray src, OutputArray dst, int op, InputArray kernel, Point anchor, int iterations, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(op, src.type(), kernel, anchor, iterations);
+    f->apply(src, dst, stream);
+}
 
 
 
@@ -285,18 +320,7 @@ CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dst
 
 
 
-//! returns 2D morphological filter
-//! only MORPH_ERODE and MORPH_DILATE are supported
-//! supports CV_8UC1 and CV_8UC4 types
-//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-    Point anchor=Point(-1,-1));
 
-//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
 
 
 
@@ -310,22 +334,7 @@ CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const
 
 
 
-//! erodes the image (applies the local minimum operator)
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                      Point anchor = Point(-1, -1), int iterations = 1,
-                      Stream& stream = Stream::Null());
 
-//! dilates the image (applies the local maximum operator)
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                       Point anchor = Point(-1, -1), int iterations = 1,
-                       Stream& stream = Stream::Null());
-
-//! applies an advanced morphological operation to the image
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
-                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
 
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 9c7dcb8b43..6ad0998a5b 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -263,13 +263,6 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
-
-
-
-
-
-
-
 //////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -289,9 +282,10 @@ PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), ker);
+
+        TEST_CYCLE() erode->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -324,9 +318,10 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), ker);
+
+        TEST_CYCLE() dilate->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -364,10 +359,10 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf1;
-        cv::gpu::GpuMat d_buf2;
 
-        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
+        cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), ker);
+
+        TEST_CYCLE() morph->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index c4de574408..145e049842 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -61,6 +61,8 @@ Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) {
 
 Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
 
+Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
 
 
 
@@ -68,18 +70,9 @@ Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, i
 
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 
 
 #else
@@ -506,6 +499,297 @@ Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize,
     return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+namespace
+{
+    class MorphologyFilter : public Filter
+    {
+    public:
+        MorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppMorfFilter_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+
+        int type_;
+        GpuMat kernel_;
+        Point anchor_;
+        int iters_;
+        nppMorfFilter_t func_;
+
+        GpuMat srcBorder_;
+        GpuMat buf_;
+    };
+
+    MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) :
+        type_(srcType), anchor_(anchor), iters_(iterations)
+    {
+        static const nppMorfFilter_t funcs[2][5] =
+        {
+            {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
+            {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
+        };
+
+        CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE );
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+
+        Mat kernel = _kernel.getMat();
+        Size ksize = !kernel.empty() ? _kernel.size() : Size(3, 3);
+
+        normalizeAnchor(anchor_, ksize);
+
+        if (kernel.empty())
+        {
+            kernel = getStructuringElement(MORPH_RECT, Size(1 + iters_ * 2, 1 + iters_ * 2));
+            anchor_ = Point(iters_, iters_);
+            iters_ = 1;
+        }
+        else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total())
+        {
+            anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
+            kernel = getStructuringElement(MORPH_RECT,
+                                           Size(ksize.width + (iters_ - 1) * (ksize.width - 1),
+                                                ksize.height + (iters_ - 1) * (ksize.height - 1)),
+                                           anchor_);
+            iters_ = 1;
+        }
+
+        CV_Assert( kernel.channels() == 1 );
+
+        Mat kernel8U;
+        kernel.convertTo(kernel8U, CV_8U);
+
+        kernel_ = gpu::createContinuous(kernel.size(), CV_8UC1);
+        kernel_.upload(kernel8U);
+
+        func_ = funcs[op][CV_MAT_CN(srcType)];
+    }
+
+    void MorphologyFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        Size ksize = kernel_.size();
+        gpu::copyMakeBorder(src, srcBorder_, ksize.height, ksize.height, ksize.width, ksize.width, BORDER_DEFAULT, Scalar(), _stream);
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+
+        GpuMat bufRoi;
+        if (iters_ > 1)
+        {
+            ensureSizeIsEnough(srcBorder_.size(), type_, buf_);
+            buf_.setTo(Scalar::all(0), _stream);
+            bufRoi = buf_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+        }
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize.height;
+        oMaskSize.width = ksize.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+
+        for(int i = 1; i < iters_; ++i)
+        {
+            dst.copyTo(bufRoi, _stream);
+
+            nppSafeCall( func_(bufRoi.ptr<Npp8u>(), static_cast<int>(bufRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                               oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+namespace
+{
+    class MorphologyExFilter : public Filter
+    {
+    public:
+        MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+    protected:
+        Ptr<gpu::Filter> erodeFilter_, dilateFilter_;
+        GpuMat buf_;
+    };
+
+    MorphologyExFilter::MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations)
+    {
+        erodeFilter_ = gpu::createMorphologyFilter(MORPH_ERODE, srcType, kernel, anchor, iterations);
+        dilateFilter_ = gpu::createMorphologyFilter(MORPH_DILATE, srcType, kernel, anchor, iterations);
+    }
+
+    // MORPH_OPEN
+
+    class MorphologyOpenFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyOpenFilter::MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyOpenFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(buf_, dst, stream);
+    }
+
+    // MORPH_CLOSE
+
+    class MorphologyCloseFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyCloseFilter::MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyCloseFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, buf_, stream);
+        erodeFilter_->apply(buf_, dst, stream);
+    }
+
+    // MORPH_GRADIENT
+
+    class MorphologyGradientFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyGradientFilter::MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyGradientFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(src, dst, stream);
+        gpu::subtract(dst, buf_, dst, noArray(), -1, stream);
+    }
+
+    // MORPH_TOPHAT
+
+    class MorphologyTophatFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyTophatFilter::MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyTophatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, dst, stream);
+        dilateFilter_->apply(dst, buf_, stream);
+        gpu::subtract(src, buf_, dst, noArray(), -1, stream);
+    }
+
+    // MORPH_BLACKHAT
+
+    class MorphologyBlackhatFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyBlackhatFilter::MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyBlackhatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, dst, stream);
+        erodeFilter_->apply(dst, buf_, stream);
+        gpu::subtract(buf_, src, dst, noArray(), -1, stream);
+    }
+}
+
+Ptr<Filter> cv::gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations)
+{
+    switch( op )
+    {
+    case MORPH_ERODE:
+    case MORPH_DILATE:
+        return new MorphologyFilter(op, srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_OPEN:
+        return new MorphologyOpenFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_CLOSE:
+        return new MorphologyCloseFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_GRADIENT:
+        return new MorphologyGradientFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_TOPHAT:
+        return new MorphologyTophatFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_BLACKHAT:
+        return new MorphologyBlackhatFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    default:
+        CV_Error(Error::StsBadArg, "Unknown morphological operation");
+        return Ptr<Filter>();
+    }
+}
+
+
+
+
+
+
+
+
 
 
 
@@ -638,264 +922,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy
     return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Morphology Filter
 
-namespace
-{
-    typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);
-
-    struct NPPMorphFilter : public BaseFilter_GPU
-    {
-        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        nppMorfFilter_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
-{
-    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
-    {
-        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
-        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
-    };
-
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
-    CV_Assert(type == CV_8UC1 || type == CV_8UC4);
-
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl);
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
-}
-
-namespace
-{
-    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
-    {
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf;
-        }
-
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf_;
-        }
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == type);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, type);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            if (iters > 1)
-                pbuf->create(src_size, type);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-
-            for(int i = 1; i < iters; ++i)
-            {
-                dst.swap((*pbuf));
-
-                dstROI = dst(roi);
-                GpuMat bufROI = (*pbuf)(roi);
-
-                (*filter2D)(bufROI, dstROI, stream);
-            }
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-
-        int type;
-        int iters;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
-}
-
-namespace
-{
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
-    {
-        Mat kernel;
-        Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
-
-        normalizeAnchor(anchor, ksize);
-
-        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
-        {
-            src.copyTo(dst, stream);
-            return;
-        }
-
-        dst.create(src.size(), src.type());
-
-        if (!_kernel.data)
-        {
-            kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
-            anchor = Point(iterations, iterations);
-            iterations = 1;
-        }
-        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
-        {
-            anchor = Point(anchor.x * iterations, anchor.y * iterations);
-            kernel = getStructuringElement(MORPH_RECT,
-                                           Size(ksize.width + (iterations - 1) * (ksize.width - 1),
-                                                ksize.height + (iterations - 1) * (ksize.height - 1)),
-                                           anchor);
-            iterations = 1;
-        }
-        else
-            kernel = _kernel;
-
-        Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);
-
-        f->apply(src, dst, Rect(0,0,-1,-1), stream);
-    }
-
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
-    {
-        GpuMat buf;
-        morphOp(op, src, dst, _kernel, buf, anchor, iterations);
-    }
-}
-
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
-}
-
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
-}
-
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
-}
-
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
-{
-    GpuMat buf1;
-    GpuMat buf2;
-    morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
-{
-    switch( op )
-    {
-    case MORPH_ERODE:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_DILATE:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_OPEN:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_CLOSE:
-        dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
-        erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_GRADIENT:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(dst, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_TOPHAT:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(src, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_BLACKHAT:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(buf2, src, dst, GpuMat(), -1, stream);
-        break;
-
-    default:
-        CV_Error(cv::Error::StsBadArg, "unknown morphological operation");
-    }
-}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index e052ede6cb..03bea05e6d 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -489,15 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
-
-
-
-
-
-
-
-
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -528,8 +519,10 @@ GPU_TEST_P(Erode, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::erode(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    erode->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::erode(src, dst_gold, kernel, anchor, iterations);
@@ -577,8 +570,10 @@ GPU_TEST_P(Dilate, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::dilate(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    dilate->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::dilate(src, dst_gold, kernel, anchor, iterations);
@@ -630,8 +625,10 @@ GPU_TEST_P(MorphEx, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::morphologyEx(loadMat(src, useRoi), dst, morphOp, kernel, anchor, iterations);
+    morph->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::morphologyEx(src, dst_gold, morphOp, kernel, anchor, iterations);
diff --git a/samples/gpu/morphology.cpp b/samples/gpu/morphology.cpp
index 1ed8f96dc9..abc6c980b0 100644
--- a/samples/gpu/morphology.cpp
+++ b/samples/gpu/morphology.cpp
@@ -1,120 +1,186 @@
+#include <iostream>
 
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include <stdlib.h>
-#include <stdio.h>
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuimgproc.hpp"
 
 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
 
-static void help()
+class App
 {
+public:
+    App(int argc, const char* argv[]);
 
-printf("\nShow off image morphology: erosion, dialation, open and close\n"
-    "Call:\n   morphology2 [image]\n"
-    "This program also shows use of rect, elipse and cross kernels\n\n");
-printf( "Hot keys: \n"
-    "\tESC - quit the program\n"
-    "\tr - use rectangle structuring element\n"
-    "\te - use elliptic structuring element\n"
-    "\tc - use cross-shaped structuring element\n"
-    "\tSPACE - loop through all the options\n" );
-}
+    int run();
 
-GpuMat src, dst;
+private:
+    void help();
 
-int element_shape = MORPH_RECT;
+    void OpenClose();
+    void ErodeDilate();
 
-//the address of variable which receives trackbar position update
-int max_iters = 10;
-int open_close_pos = 0;
-int erode_dilate_pos = 0;
+    static void OpenCloseCallback(int, void*);
+    static void ErodeDilateCallback(int, void*);
 
-// callback function for open/close trackbar
-static void OpenClose(int, void*)
+    gpu::GpuMat src, dst;
+
+    int element_shape;
+
+    int max_iters;
+    int open_close_pos;
+    int erode_dilate_pos;
+};
+
+App::App(int argc, const char* argv[])
 {
-    int n = open_close_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::morphologyEx(src, dst, MORPH_OPEN, element);
-    else
-        cv::gpu::morphologyEx(src, dst, MORPH_CLOSE, element);
-    imshow("Open/Close",(Mat)dst);
-}
+    element_shape = MORPH_RECT;
+    open_close_pos = erode_dilate_pos = max_iters = 10;
 
-// callback function for erode/dilate trackbar
-static void ErodeDilate(int, void*)
-{
-    int n = erode_dilate_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::erode(src, dst, element);
-    else
-        cv::gpu::dilate(src, dst, element);
-    imshow("Erode/Dilate",(Mat)dst);
-}
-
-
-int main( int argc, char** argv )
-{
-    char* filename = argc == 2 ? argv[1] : (char*)"baboon.jpg";
-    if (string(argv[1]) == "--help")
+    if (argc == 2 && String(argv[1]) == "--help")
     {
         help();
-        return -1;
+        exit(0);
     }
 
-    src.upload(imread(filename, 1));
-    if (src.empty())
+    String filename = argc == 2 ? argv[1] : "baboon.jpg";
+
+    Mat img = imread(filename);
+    if (img.empty())
     {
-        help();
-        return -1;
+        cerr << "Can't open image " << filename.c_str() << endl;
+        exit(-1);
     }
 
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    help();
-
-
+    src.upload(img);
     if (src.channels() == 3)
     {
         // gpu support only 4th channel images
-        GpuMat src4ch;
-        cv::gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
+        gpu::GpuMat src4ch;
+        gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
         src = src4ch;
     }
 
-    //create windows for output images
-    namedWindow("Open/Close",1);
-    namedWindow("Erode/Dilate",1);
+    help();
 
-    open_close_pos = erode_dilate_pos = max_iters;
-    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
-    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
+    gpu::printShortCudaDeviceInfo(gpu::getDevice());
+}
+
+int App::run()
+{
+    // create windows for output images
+    namedWindow("Open/Close");
+    namedWindow("Erode/Dilate");
+
+    createTrackbar("iterations", "Open/Close", &open_close_pos, max_iters * 2 + 1, OpenCloseCallback, this);
+    createTrackbar("iterations", "Erode/Dilate", &erode_dilate_pos, max_iters * 2 + 1, ErodeDilateCallback, this);
 
     for(;;)
     {
-        int c;
+        OpenClose();
+        ErodeDilate();
 
-        OpenClose(open_close_pos, 0);
-        ErodeDilate(erode_dilate_pos, 0);
-        c = waitKey();
+        char c = (char) waitKey();
 
-        if( (char)c == 27 )
+        switch (c)
+        {
+        case 27:
+            return 0;
             break;
-        if( (char)c == 'e' )
+
+        case 'e':
             element_shape = MORPH_ELLIPSE;
-        else if( (char)c == 'r' )
+            break;
+
+        case 'r':
             element_shape = MORPH_RECT;
-        else if( (char)c == 'c' )
+            break;
+
+        case 'c':
             element_shape = MORPH_CROSS;
-        else if( (char)c == ' ' )
+            break;
+
+        case ' ':
             element_shape = (element_shape + 1) % 3;
+            break;
+        }
+    }
+}
+
+void App::help()
+{
+    cout << "Show off image morphology: erosion, dialation, open and close \n";
+    cout << "Call: \n";
+    cout << "   gpu-example-morphology [image] \n";
+    cout << "This program also shows use of rect, elipse and cross kernels \n" << endl;
+
+    cout << "Hot keys: \n";
+    cout << "\tESC - quit the program \n";
+    cout << "\tr - use rectangle structuring element \n";
+    cout << "\te - use elliptic structuring element \n";
+    cout << "\tc - use cross-shaped structuring element \n";
+    cout << "\tSPACE - loop through all the options \n" << endl;
+}
+
+void App::OpenClose()
+{
+    int n = open_close_pos - max_iters;
+    int an = n > 0 ? n : -n;
+
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> openFilter = gpu::createMorphologyFilter(MORPH_OPEN, src.type(), element);
+        openFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> closeFilter = gpu::createMorphologyFilter(MORPH_CLOSE, src.type(), element);
+        closeFilter->apply(src, dst);
     }
 
-    return 0;
+    Mat h_dst(dst);
+    imshow("Open/Close", h_dst);
+}
+
+void App::ErodeDilate()
+{
+    int n = erode_dilate_pos - max_iters;
+    int an = n > 0 ? n : -n;
+
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> erodeFilter = gpu::createMorphologyFilter(MORPH_ERODE, src.type(), element);
+        erodeFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> dilateFilter = gpu::createMorphologyFilter(MORPH_DILATE, src.type(), element);
+        dilateFilter->apply(src, dst);
+    }
+
+    Mat h_dst(dst);
+    imshow("Erode/Dilate", h_dst);
+}
+
+void App::OpenCloseCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->OpenClose();
+}
+
+void App::ErodeDilateCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->ErodeDilate();
+}
+
+int main(int argc, const char* argv[])
+{
+    App app(argc, argv);
+    return app.run();
 }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index cbe50a40f6..4333b76257 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -746,10 +746,12 @@ TEST(erode)
 
         d_src.upload(src);
 
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        Ptr<gpu::Filter> erode = gpu::createMorphologyFilter(MORPH_ERODE, d_src.type(), ker);
+
+        erode->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        erode->apply(d_src, d_dst);
         GPU_OFF;
     }
 }

From 4bb297afc2ff5c1555c5f1f3dd1a0f760823a446 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 16:28:15 +0400
Subject: [PATCH 6/9] refactored Image Rank Filters

---
 .../gpufilters/include/opencv2/gpufilters.hpp |  17 +-
 modules/gpufilters/src/filtering.cpp          | 157 +++++++++++-------
 2 files changed, 106 insertions(+), 68 deletions(-)

diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 7595bd5439..c5a61a06e9 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -241,7 +241,18 @@ inline void morphologyEx(InputArray src, OutputArray dst, int op, InputArray ker
     f->apply(src, dst, stream);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Image Rank Filter
 
+//! Result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! Result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 
 
@@ -326,12 +337,6 @@ CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dst
 
 
 
-//! returns maximum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! returns minimum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
 
 
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 145e049842..a8ec83a74c 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -63,6 +63,8 @@ Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, i
 
 Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) { throw_no_cuda(); return Ptr<Filter>(); }
 
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createBoxMinFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
 
 
@@ -70,8 +72,6 @@ Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) {
 
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 
 
 
@@ -783,6 +783,99 @@ Ptr<Filter> cv::gpu::createMorphologyFilter(int op, int srcType, InputArray kern
     }
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Image Rank Filter
+
+namespace
+{
+    enum
+    {
+        RANK_MAX,
+        RANK_MIN
+    };
+
+    class NPPRankFilter : public Filter
+    {
+    public:
+        NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             NppiSize oMaskSize, NppiPoint oAnchor);
+
+        int type_;
+        Size ksize_;
+        Point anchor_;
+        int borderMode_;
+        Scalar borderVal_;
+        nppFilterRank_t func_;
+
+        GpuMat srcBorder_;
+    };
+
+    NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
+        type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
+        static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+
+        normalizeAnchor(anchor_, ksize_);
+
+        if (op == RANK_MAX)
+            func_ = maxFuncs[CV_MAT_CN(srcType)];
+        else
+            func_ = minFuncs[CV_MAT_CN(srcType)];
+    }
+
+    void NPPRankFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
+{
+    return new NPPRankFilter(RANK_MAX, srcType, ksize, anchor, borderMode, borderVal);
+}
+
+Ptr<Filter> cv::gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
+{
+    return new NPPRankFilter(RANK_MIN, srcType, ksize, anchor, borderMode, borderVal);
+}
+
 
 
 
@@ -924,64 +1017,4 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy
 
 
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Image Rank Filter
-
-namespace
-{
-    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPRankFilter : public BaseFilter_GPU
-    {
-        NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterRank_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
-}
-
 #endif

From 4317cd1ffa4295b49759070c6089d58e1f099e05 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 29 Apr 2013 17:09:17 +0400
Subject: [PATCH 7/9] refactored 1D Sum Filters

---
 .../gpufilters/include/opencv2/gpufilters.hpp |  91 +------
 modules/gpufilters/src/filtering.cpp          | 222 ++++++++----------
 2 files changed, 106 insertions(+), 207 deletions(-)

diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index c5a61a06e9..76b5b731d2 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -254,95 +254,16 @@ CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
 
-
-
-
-
-
-/*!
-The Base Class for 1D or Row-wise Filters
-
-This is the base class for linear or non-linear filters that process 1D data.
-In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseRowFilter_GPU
-{
-public:
-    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseRowFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-
-/*!
-The Base Class for Column-wise Filters
-
-This is the base class for linear or non-linear filters that process columns of 2D arrays.
-Such filters are used for the "vertical" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseColumnFilter_GPU
-{
-public:
-    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseColumnFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-
-/*!
-The Base Class for Non-Separable 2D Filters.
-
-This is the base class for linear or non-linear 2D filters.
-*/
-class CV_EXPORTS BaseFilter_GPU
-{
-public:
-    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    Size ksize;
-    Point anchor;
-};
-
-/*!
-The Base Class for Filter Engine.
-
-The class can be used to apply an arbitrary filtering operation to an image.
-It contains all the necessary intermediate buffers.
-*/
-class CV_EXPORTS FilterEngine_GPU
-{
-public:
-    virtual ~FilterEngine_GPU() {}
-
-    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-};
-
-
-
-//! returns horizontal 1D box filter
+//! creates a horizontal 1D box filter
 //! supports only CV_8UC1 source type and CV_32FC1 sum type
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
+CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! returns vertical 1D box filter
+//! creates a vertical 1D box filter
 //! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index a8ec83a74c..7f02bdac59 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -66,14 +66,8 @@ Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) {
 Ptr<Filter> cv::gpu::createBoxMaxFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 Ptr<Filter> cv::gpu::createBoxMinFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
-
-
-
-
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-
-
+Ptr<Filter> cv::gpu::createRowSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createColumnSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
 #else
 
@@ -876,145 +870,129 @@ Ptr<Filter> cv::gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor, i
     return new NPPRankFilter(RANK_MIN, srcType, ksize, anchor, borderMode, borderVal);
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-namespace
-{
-    inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
-    {
-        if (roi == Rect(0,0,-1,-1))
-            roi = Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height);
-
-        CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
-    }
-
-    inline void normalizeKernel(const Mat& kernel, GpuMat& gpu_krnl, int type = CV_8U, int* nDivisor = 0, bool reverse = false)
-    {
-        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
-        if (nDivisor) *nDivisor = scale;
-
-        Mat temp(kernel.size(), type);
-        kernel.convertTo(temp, type, scale);
-        Mat cont_krnl = temp.reshape(1, 1);
-
-        if (reverse)
-        {
-            int count = cont_krnl.cols >> 1;
-            for (int i = 0; i < count; ++i)
-            {
-                std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-            }
-        }
-
-        gpu_krnl.upload(cont_krnl);
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // 1D Sum Filter
 
 namespace
 {
-    struct NppRowSumFilter : public BaseRowFilter_GPU
+    class NppRowSumFilter : public Filter
     {
-        NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
+    public:
+        NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    private:
+        int srcType_, dstType_;
+        int ksize_;
+        int anchor_;
+        int borderMode_;
+        Scalar borderVal_;
 
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
+        GpuMat srcBorder_;
     };
+
+    NppRowSumFilter::NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppRowSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, 0, 0, ksize_, ksize_, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_, 0, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor)
+Ptr<Filter> cv::gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
 {
-    CV_Assert(srcType == CV_8UC1 && sumType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new NppRowSumFilter(ksize, anchor));
+    return new NppRowSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
 namespace
 {
-    struct NppColumnSumFilter : public BaseColumnFilter_GPU
+    class NppColumnSumFilter : public Filter
     {
-        NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
+    public:
+        NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    private:
+        int srcType_, dstType_;
+        int ksize_;
+        int anchor_;
+        int borderMode_;
+        Scalar borderVal_;
 
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
+        GpuMat srcBorder_;
     };
+
+    NppColumnSumFilter::NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppColumnSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_, ksize_, 0, 0, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(0, ksize_, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                   dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                   oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor)
+Ptr<Filter> cv::gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
 {
-    CV_Assert(sumType == CV_8UC1 && dstType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
+    return new NppColumnSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
-
-
 #endif

From 24108f81106d4fcbef0a18384d3c8cc5325898a8 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 17 Jun 2013 09:53:39 +0400
Subject: [PATCH 8/9] updated documentation

---
 modules/gpuarithm/doc/arithm.rst              |   2 -
 modules/gpufilters/doc/filtering.rst          | 887 +++++-------------
 .../gpufilters/include/opencv2/gpufilters.hpp |  41 +-
 3 files changed, 279 insertions(+), 651 deletions(-)

diff --git a/modules/gpuarithm/doc/arithm.rst b/modules/gpuarithm/doc/arithm.rst
index 2f1d74df5e..09b7220c09 100644
--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@@ -157,8 +157,6 @@ Computes a convolution (or cross-correlation) of two images.
 
     :param stream: Stream for the asynchronous version.
 
-.. seealso:: :ocv:func:`gpu::filter2D`
-
 
 
 gpu::createConvolution
diff --git a/modules/gpufilters/doc/filtering.rst b/modules/gpufilters/doc/filtering.rst
index 79c2ea51cf..925b05f2cf 100644
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@@ -7,346 +7,236 @@ Functions and classes described in this section are used to perform various line
 
 
 
-gpu::BaseRowFilter_GPU
-----------------------
-.. ocv:class:: gpu::BaseRowFilter_GPU
+gpu::Filter
+-----------
+.. ocv:class:: gpu::Filter
 
-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+Common interface for all GPU filters ::
 
-    class BaseRowFilter_GPU
+    class CV_EXPORTS Filter : public Algorithm
     {
     public:
-        BaseRowFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
+        virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
     };
 
 
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
 
-
-
-gpu::BaseColumnFilter_GPU
--------------------------
-.. ocv:class:: gpu::BaseColumnFilter_GPU
-
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-
-    class BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::BaseFilter_GPU
--------------------
-.. ocv:class:: gpu::BaseFilter_GPU
-
-Base class for non-separable 2D filters. ::
-
-    class CV_EXPORTS BaseFilter_GPU
-    {
-    public:
-        BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        Size ksize;
-        Point anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::FilterEngine_GPU
----------------------
-.. ocv:class:: gpu::FilterEngine_GPU
-
-Base class for the Filter Engine. ::
-
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst,
-                           Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-    };
-
-
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
-
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        // Allocate and release buffers at each iterations
-        gpu::GaussianBlur(src, dst, ksize, sigma1);
-    }
-
-    // Allocate buffers only once
-    cv::Ptr<gpu::FilterEngine_GPU> filter =
-        gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-
-
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-
-.. note:: The GPU filters do not support the in-place mode.
-
-.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
-
-
-
-gpu::createFilter2D_GPU
----------------------------
-Creates a non-separable filter engine with the specified filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-
-    :param filter2D: Non-separable 2D filter.
-
-    :param srcType: Input image type. It must be supported by  ``filter2D`` .
-
-    :param dstType: Output image type. It must be supported by  ``filter2D`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
-
-
-
-gpu::createSeparableFilter_GPU
-----------------------------------
-Creates a separable filter engine with the specified filters.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-
-    :param rowFilter: "Horizontal" 1D filter.
-
-    :param columnFilter: "Vertical" 1D filter.
-
-    :param srcType: Input image type. It must be supported by  ``rowFilter`` .
-
-    :param bufType: Buffer image type. It must be supported by  ``rowFilter``  and  ``columnFilter`` .
-
-    :param dstType: Output image type. It must be supported by  ``columnFilter`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
-
-
-
-gpu::getRowSumFilter_GPU
-----------------------------
-Creates a horizontal 1D box filter.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
-
-    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::getColumnSumFilter_GPU
--------------------------------
-Creates a vertical 1D box filter.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
-
-    :param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::createBoxFilter_GPU
-----------------------------
-Creates a normalized 2D box filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
-
-    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
-
-    :param dstType: Output image type.  It supports only the same values as the source type.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`boxFilter`
-
-
-
-gpu::boxFilter
+gpu::Filter::apply
 ------------------
-Smooths the image using the normalized box filter.
+Applies the specified filter to the image.
 
-.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::Filter::apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0
 
-    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+    :param src: Input image.
 
-    :param dst: Output image type. The size and type is the same as ``src`` .
-
-    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+    :param dst: Output image.
 
     :param stream: Stream for the asynchronous version.
 
-.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+
+gpu::createBoxFilter
+--------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported for now.
+
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 .. seealso:: :ocv:func:`boxFilter`
 
 
 
-gpu::blur
--------------
-Acts as a synonym for the normalized box filter.
+gpu::createLinearFilter
+-----------------------
+Creates a non-separable linear 2D filter.
 
-.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: Ptr<Filter> gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
 
-    :param dst: Output image type with the same size and type as  ``src`` .
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
 
-    :param ksize: Kernel size.
+    :param kernel: 2D array of filter coefficients.
 
     :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
 
-    :param stream: Stream for the asynchronous version.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderVal: Default border value.
 
-.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
+.. seealso:: :ocv:func:`filter2D`
 
 
 
-gpu::createMorphologyFilter_GPU
------------------------------------
+gpu::createLaplacianFilter
+--------------------------
+Creates a Laplacian operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
+
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
+
+    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
+
+    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+.. seealso:: :ocv:func:`Laplacian`
+
+
+
+gpu::createSeparableLinearFilter
+--------------------------------
+Creates a separable linear filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source array type.
+
+    :param dstType: Destination array type.
+
+    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 32`` .
+
+    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 32`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`sepFilter2D`
+
+
+
+gpu::createDerivFilter
+----------------------
+Creates a generalized Deriv operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize = false, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order in respect of x.
+
+    :param dy: Derivative order in respect of y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param normalize: Flag indicating whether to normalize (scale down) the filter coefficients or not. See  :ocv:func:`getDerivKernels` for details.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+
+
+gpu::createSobelFilter
+----------------------
+Creates a Sobel operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order in respect of x.
+
+    :param dy: Derivative order in respect of y.
+
+    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Sobel`
+
+
+
+gpu::createScharrFilter
+-----------------------
+Creates a vertical or horizontal Scharr operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Order of the derivative x.
+
+    :param dy: Order of the derivative y.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Scharr`
+
+
+
+gpu::createGaussianFilter
+-------------------------
+Creates a Gaussian filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2 = 0, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`GaussianBlur`
+
+
+
+gpu::createMorphologyFilter
+---------------------------
 Creates a 2D morphological filter.
 
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
-
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`createMorphologyFilter`
-
-
-
-gpu::erode
---------------
-Erodes an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC1``  and  ``CV_8UC4``  types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param kernel: Structuring element used for erosion. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times erosion to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`erode`
-
-
-
-gpu::dilate
----------------
-Dilates an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-
-    :param dst: Destination image with the same size and type as ``src``.
-
-    :param kernel: Structuring element used for dilation. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times dilation to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`dilate`
-
-
-
-gpu::morphologyEx
----------------------
-Applies an advanced morphological operation to an image.
-
-.. ocv:function::  void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
+.. ocv:function:: Ptr<Filter> gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1)
 
     :param op: Type of morphological operation. The following types are possible:
 
+        * **MORPH_ERODE** erode
+
+        * **MORPH_DILATE** dilate
+
         * **MORPH_OPEN** opening
 
         * **MORPH_CLOSE** closing
@@ -357,363 +247,88 @@ Applies an advanced morphological operation to an image.
 
         * **MORPH_BLACKHAT** "black hat"
 
-    :param kernel: Structuring element.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
 
-    :param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
 
     :param iterations: Number of times erosion and dilation to be applied.
 
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
 .. seealso:: :ocv:func:`morphologyEx`
 
 
 
-gpu::createLinearFilter_GPU
--------------------------------
-Creates a non-separable linear filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dstType: Output image type. The same type as ``src`` is supported.
-
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:class:`gpu::Convolution`.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-.. seealso:: :ocv:func:`createLinearFilter`
-
-
-
-gpu::filter2D
------------------
-Applies the non-separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`filter2D`, :ocv:class:`gpu::Convolution`
-
-
-
-gpu::Laplacian
-------------------
-Applies the Laplacian operator to an image.
-
-.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image. The size and number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
-
-    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
-
-    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
-
-
-
-gpu::getLinearRowFilter_GPU
--------------------------------
-Creates a primitive row filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
-
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-
-
-
-gpu::getLinearColumnFilter_GPU
-----------------------------------
-Creates a primitive column filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
-
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` . For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside image.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::createSeparableLinearFilter_GPU
-----------------------------------------
-Creates a separable linear filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU`, cpp:ocv:func:`gpu::getLinearColumnFilter_GPU`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::sepFilter2D
---------------------
-Applies a separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param kernelX: Horizontal filter coefficients.
-
-    :param kernelY: Vertical filter coefficients.
-
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
-
-
-
-gpu::createDerivFilter_GPU
-------------------------------
-Creates a filter engine for the generalized Sobel operator.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
-
-
-
-gpu::Sobel
---------------
-Applies the generalized Sobel operator to an image.
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as source image.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
-
-
-
-gpu::Scharr
----------------
-Calculates the first x- or y- image derivative using the Scharr operator.
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` has.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Order of the derivative x.
-
-    :param dy: Order of the derivative y.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
-
-
-
-gpu::createGaussianFilter_GPU
----------------------------------
-Creates a Gaussian filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
-
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
-
-
-
-gpu::GaussianBlur
----------------------
-Smooths an image using the Gaussian filter.
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param ksize: Gaussian kernel size.  ``ksize.width``  and  ``ksize.height``  can differ but they both must be positive and odd. If they are zeros, they are computed from  ``sigma1``  and  ``sigma2`` .
-
-    :param sigma1: Gaussian kernel standard deviation in X direction.
-
-    :param sigma2: Gaussian kernel standard deviation in Y direction. If  ``sigma2``  is zero, it is set to be equal to  ``sigma1`` . If they are both zeros, they are computed from  ``ksize.width``  and  ``ksize.height``, respectively. See  :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of  ``ksize`` , ``sigma1`` , and  ``sigma2`` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
-
-
-
-gpu::getMaxFilter_GPU
--------------------------
+gpu::createBoxMaxFilter
+-----------------------
 Creates the maximum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 
 
-gpu::getMinFilter_GPU
--------------------------
+gpu::createBoxMinFilter
+-----------------------
 Creates the minimum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createRowSumFilter
+-----------------------
+Creates a horizontal 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createColumnSumFilter
+--------------------------
+Creates a vertical 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 76b5b731d2..a2cc8db6b6 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -72,11 +72,13 @@ public:
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Box Filter
 
-//! smooths the image using the normalized box filter
+//! creates a normalized 2D box filter
 //! supports CV_8UC1, CV_8UC4 types
 CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
                                        int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
+// obsolete
+
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void boxFilter(InputArray src, OutputArray dst, int dstType,
                                                  Size ksize, Point anchor = Point(-1,-1),
                                                  Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
@@ -100,9 +102,12 @@ inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stre
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear Filter
 
-//! non-separable linear 2D filter
+//! Creates a non-separable linear 2D filter
+//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
 CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
-                                            int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+// obsolete
 
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel,
                                                 Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT,
@@ -117,10 +122,12 @@ inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray ker
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian Filter
 
-//! applies Laplacian operator to the image
+//! creates a Laplacian operator
 //! supports only ksize = 1 and ksize = 3
 CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
-                                            int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+                                             int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+// obsolete
 
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void Laplacian(InputArray src, OutputArray dst, int ddepth,
                                                  int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT,
@@ -135,10 +142,12 @@ inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, do
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter
 
-//! separable linear 2D filter
+//! creates a separable linear filter
 CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
                                                    Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
+// obsolete
+
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY,
                             Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
@@ -152,19 +161,21 @@ inline void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
-//! the generalized Deriv operator
+//! creates a generalized Deriv operator
 CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
                                          int ksize, bool normalize = false, double scale = 1,
                                          int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-//! the Sobel operator
+//! creates a Sobel operator
 CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-//! the vertical or horizontal Scharr operator
+//! creates a vertical or horizontal Scharr operator
 CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
                                           double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
+// obsolete
+
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
                                              int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
@@ -188,11 +199,13 @@ inline void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy,
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
-//! smooths the image using Gaussian filter
+//! creates a Gaussian filter
 CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
                                             double sigma1, double sigma2 = 0,
                                             int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
+// obsolete
+
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void GaussianBlur(InputArray src, OutputArray dst, Size ksize,
                                                     double sigma1, double sigma2 = 0,
                                                     int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
@@ -207,10 +220,12 @@ inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sig
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Morphology Filter
 
-//! returns 2D morphological filter
+//! creates a 2D morphological filter
 //! supports CV_8UC1 and CV_8UC4 types
 CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
 
+// obsolete
+
 __OPENCV_GPUFILTERS_DEPR_BEFORE__ void erode(InputArray src, OutputArray dst, InputArray kernel,
                                              Point anchor = Point(-1, -1), int iterations = 1,
                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
@@ -244,12 +259,12 @@ inline void morphologyEx(InputArray src, OutputArray dst, int op, InputArray ker
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
 
-//! Result pixel value is the maximum of pixel values under the rectangular mask region
+//! result pixel value is the maximum of pixel values under the rectangular mask region
 CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! Result pixel value is the maximum of pixel values under the rectangular mask region
+//! result pixel value is the maximum of pixel values under the rectangular mask region
 CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));

From f08d75a84bcb8c96af4d540af03b15c59c8f4b50 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Thu, 20 Jun 2013 11:24:29 +0400
Subject: [PATCH 9/9] removed obsolete API

---
 .../gpufilters/include/opencv2/gpufilters.hpp | 139 ------------------
 1 file changed, 139 deletions(-)

diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index a2cc8db6b6..b0ebfd73c5 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -50,17 +50,6 @@
 #include "opencv2/core/gpu.hpp"
 #include "opencv2/imgproc.hpp"
 
-#if defined __GNUC__
-    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__
-    #define __OPENCV_GPUFILTERS_DEPR_AFTER__ __attribute__ ((deprecated))
-#elif (defined WIN32 || defined _WIN32)
-    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__ __declspec(deprecated)
-    #define __OPENCV_GPUFILTERS_DEPR_AFTER__
-#else
-    #define __OPENCV_GPUFILTERS_DEPR_BEFORE__
-    #define __OPENCV_GPUFILTERS_DEPR_AFTER__
-#endif
-
 namespace cv { namespace gpu {
 
 class CV_EXPORTS Filter : public Algorithm
@@ -77,28 +66,6 @@ public:
 CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
                                        int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void boxFilter(InputArray src, OutputArray dst, int dstType,
-                                                 Size ksize, Point anchor = Point(-1,-1),
-                                                 Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void boxFilter(InputArray src, OutputArray dst, int dstType, Size ksize, Point anchor, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createBoxFilter(src.type(), dstType, ksize, anchor);
-    f->apply(src, dst, stream);
-}
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void blur(InputArray src, OutputArray dst, Size ksize,
-                                            Point anchor = Point(-1,-1),
-                                            Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createBoxFilter(src.type(), -1, ksize, anchor);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear Filter
 
@@ -107,18 +74,6 @@ inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stre
 CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel,
-                                                Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT,
-                                                Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel, Point anchor, int borderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, anchor, borderType);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian Filter
 
@@ -127,18 +82,6 @@ inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray ker
 CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
                                              int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Laplacian(InputArray src, OutputArray dst, int ddepth,
-                                                 int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT,
-                                                 Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createLaplacianFilter(src.type(), ddepth, ksize, scale, borderType);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter
 
@@ -146,18 +89,6 @@ inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, do
 CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
                                                    Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                                                   Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY, Point anchor, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createSeparableLinearFilter(src.type(), ddepth, kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
@@ -174,28 +105,6 @@ CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int d
 CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
                                           double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                                             Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createSobelFilter(src.type(), ddepth, dx, dy, ksize, scale, rowBorderType, columnBorderType);
-    f->apply(src, dst, stream);
-}
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale = 1,
-                                              int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createScharrFilter(src.type(), ddepth, dx, dy, scale, rowBorderType, columnBorderType);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
@@ -204,19 +113,6 @@ CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize
                                             double sigma1, double sigma2 = 0,
                                             int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void GaussianBlur(InputArray src, OutputArray dst, Size ksize,
-                                                    double sigma1, double sigma2 = 0,
-                                                    int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Morphology Filter
 
@@ -224,38 +120,6 @@ inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sig
 //! supports CV_8UC1 and CV_8UC4 types
 CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
 
-// obsolete
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void erode(InputArray src, OutputArray dst, InputArray kernel,
-                                             Point anchor = Point(-1, -1), int iterations = 1,
-                                             Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void erode(InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(MORPH_ERODE, src.type(), kernel, anchor, iterations);
-    f->apply(src, dst, stream);
-}
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void dilate(InputArray src, OutputArray dst, InputArray kernel,
-                                              Point anchor = Point(-1, -1), int iterations = 1,
-                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void dilate(InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(MORPH_DILATE, src.type(), kernel, anchor, iterations);
-    f->apply(src, dst, stream);
-}
-
-__OPENCV_GPUFILTERS_DEPR_BEFORE__ void morphologyEx(InputArray src, OutputArray dst, int op,
-                                                    InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1,
-                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
-
-inline void morphologyEx(InputArray src, OutputArray dst, int op, InputArray kernel, Point anchor, int iterations, Stream& stream)
-{
-    Ptr<gpu::Filter> f = gpu::createMorphologyFilter(op, src.type(), kernel, anchor, iterations);
-    f->apply(src, dst, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
 
@@ -282,7 +146,4 @@ CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize
 
 }} // namespace cv { namespace gpu {
 
-#undef __OPENCV_GPUFILTERS_DEPR_BEFORE__
-#undef __OPENCV_GPUFILTERS_DEPR_AFTER__
-
 #endif /* __OPENCV_GPUFILTERS_HPP__ */