diff --git a/modules/gpuarithm/doc/arithm.rst b/modules/gpuarithm/doc/arithm.rst
index 2f1d74df5e..09b7220c09 100644
--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@@ -157,8 +157,6 @@ Computes a convolution (or cross-correlation) of two images.
 
     :param stream: Stream for the asynchronous version.
 
-.. seealso:: :ocv:func:`gpu::filter2D`
-
 
 
 gpu::createConvolution
diff --git a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
index e7a29b5763..3fe62ec94b 100644
--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@@ -321,7 +321,7 @@ private:
     GpuMat colors_;
     GpuMat weights_;
 
-    Ptr<FilterEngine_GPU> boxFilter_;
+    Ptr<gpu::Filter> boxFilter_;
     GpuMat buf_;
 };
 
diff --git a/modules/gpubgsegm/src/fgd.cpp b/modules/gpubgsegm/src/fgd.cpp
index 1b4038304a..fb14ff172a 100644
--- a/modules/gpubgsegm/src/fgd.cpp
+++ b/modules/gpubgsegm/src/fgd.cpp
@@ -228,11 +228,10 @@ private:
     cv::gpu::GpuMat countBuf_;
 
     cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
     cv::gpu::GpuMat filterBrd_;
 
-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+    cv::Ptr<cv::gpu::Filter> dilateFilter_;
+    cv::Ptr<cv::gpu::Filter> erodeFilter_;
 
     CvMemStorage* storage_;
 };
@@ -305,8 +304,8 @@ void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, cons
         cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
         cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
 
-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+        dilateFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, CV_8UC1, kernel, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, kernel, anchor);
     }
 }
 
@@ -326,7 +325,6 @@ void cv::gpu::FGDStatModel::Impl::release()
     countBuf_.release();
 
     buf_.release();
-    filterBuf_.release();
     filterBrd_.release();
 }
 
@@ -488,14 +486,14 @@ namespace
 
 namespace
 {
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::Filter>& filter, cv::Scalar brdVal)
     {
         cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
+        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst);
     }
 
     void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          cv::Ptr<cv::gpu::Filter>& erodeFilter, cv::Ptr<cv::gpu::Filter>& dilateFilter,
                           const cv::gpu::FGDStatModel::Params& params)
     {
         const int brd = params.perform_morphing;
diff --git a/modules/gpubgsegm/src/gmg.cpp b/modules/gpubgsegm/src/gmg.cpp
index a38cbffaca..b97f0836f4 100644
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -100,7 +100,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
     nfeatures_.setTo(cv::Scalar::all(0));
 
     if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
+        boxFilter_ = cv::gpu::createBoxFilter(CV_8UC1, -1, cv::Size(smoothingRadius, smoothingRadius));
 
     loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
 }
@@ -141,7 +141,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
     // medianBlur
     if (smoothingRadius > 0)
     {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
+        boxFilter_->apply(fgmask, buf_, stream);
         int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
         double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
         cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
diff --git a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
index 0c821745f9..cc73da9d9e 100644
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@@ -351,7 +351,7 @@ private:
 
     FAST_GPU fastDetector_;
 
-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;
 
     GpuMat d_keypoints_;
 };
diff --git a/modules/gpufeatures2d/src/orb.cpp b/modules/gpufeatures2d/src/orb.cpp
index 495ca3f6ef..7cb1cbecc1 100644
--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg
 
     pattern_.upload(h_pattern);
 
-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
 
     blurForDescriptor = false;
 }
@@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
         {
             // preprocess the resized image
             ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
         }
 
         computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
diff --git a/modules/gpufilters/CMakeLists.txt b/modules/gpufilters/CMakeLists.txt
index 18f6d7f7b6..640de8c115 100644
--- a/modules/gpufilters/CMakeLists.txt
+++ b/modules/gpufilters/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Filtering")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
 
-ocv_define_module(gpufilters opencv_imgproc OPTIONAL opencv_gpuarithm)
+ocv_define_module(gpufilters opencv_imgproc opencv_gpuarithm)
diff --git a/modules/gpufilters/doc/filtering.rst b/modules/gpufilters/doc/filtering.rst
index 79c2ea51cf..925b05f2cf 100644
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@@ -7,346 +7,236 @@ Functions and classes described in this section are used to perform various line
 
 
 
-gpu::BaseRowFilter_GPU
-----------------------
-.. ocv:class:: gpu::BaseRowFilter_GPU
+gpu::Filter
+-----------
+.. ocv:class:: gpu::Filter
 
-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+Common interface for all GPU filters ::
 
-    class BaseRowFilter_GPU
+    class CV_EXPORTS Filter : public Algorithm
     {
     public:
-        BaseRowFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
+        virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
     };
 
 
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
 
-
-
-gpu::BaseColumnFilter_GPU
--------------------------
-.. ocv:class:: gpu::BaseColumnFilter_GPU
-
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-
-    class BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::BaseFilter_GPU
--------------------
-.. ocv:class:: gpu::BaseFilter_GPU
-
-Base class for non-separable 2D filters. ::
-
-    class CV_EXPORTS BaseFilter_GPU
-    {
-    public:
-        BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        Size ksize;
-        Point anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::FilterEngine_GPU
----------------------
-.. ocv:class:: gpu::FilterEngine_GPU
-
-Base class for the Filter Engine. ::
-
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst,
-                           Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-    };
-
-
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
-
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        // Allocate and release buffers at each iterations
-        gpu::GaussianBlur(src, dst, ksize, sigma1);
-    }
-
-    // Allocate buffers only once
-    cv::Ptr<gpu::FilterEngine_GPU> filter =
-        gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-
-
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-
-.. note:: The GPU filters do not support the in-place mode.
-
-.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
-
-
-
-gpu::createFilter2D_GPU
----------------------------
-Creates a non-separable filter engine with the specified filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-
-    :param filter2D: Non-separable 2D filter.
-
-    :param srcType: Input image type. It must be supported by  ``filter2D`` .
-
-    :param dstType: Output image type. It must be supported by  ``filter2D`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
-
-
-
-gpu::createSeparableFilter_GPU
-----------------------------------
-Creates a separable filter engine with the specified filters.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-
-    :param rowFilter: "Horizontal" 1D filter.
-
-    :param columnFilter: "Vertical" 1D filter.
-
-    :param srcType: Input image type. It must be supported by  ``rowFilter`` .
-
-    :param bufType: Buffer image type. It must be supported by  ``rowFilter``  and  ``columnFilter`` .
-
-    :param dstType: Output image type. It must be supported by  ``columnFilter`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
-
-
-
-gpu::getRowSumFilter_GPU
-----------------------------
-Creates a horizontal 1D box filter.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
-
-    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::getColumnSumFilter_GPU
--------------------------------
-Creates a vertical 1D box filter.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
-
-    :param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::createBoxFilter_GPU
-----------------------------
-Creates a normalized 2D box filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
-
-    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
-
-    :param dstType: Output image type.  It supports only the same values as the source type.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`boxFilter`
-
-
-
-gpu::boxFilter
+gpu::Filter::apply
 ------------------
-Smooths the image using the normalized box filter.
+Applies the specified filter to the image.
 
-.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::Filter::apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0
 
-    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+    :param src: Input image.
 
-    :param dst: Output image type. The size and type is the same as ``src`` .
-
-    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+    :param dst: Output image.
 
     :param stream: Stream for the asynchronous version.
 
-.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+
+gpu::createBoxFilter
+--------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported for now.
+
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 .. seealso:: :ocv:func:`boxFilter`
 
 
 
-gpu::blur
--------------
-Acts as a synonym for the normalized box filter.
+gpu::createLinearFilter
+-----------------------
+Creates a non-separable linear 2D filter.
 
-.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: Ptr<Filter> gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
 
-    :param dst: Output image type with the same size and type as  ``src`` .
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
 
-    :param ksize: Kernel size.
+    :param kernel: 2D array of filter coefficients.
 
     :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
 
-    :param stream: Stream for the asynchronous version.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderVal: Default border value.
 
-.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
+.. seealso:: :ocv:func:`filter2D`
 
 
 
-gpu::createMorphologyFilter_GPU
------------------------------------
+gpu::createLaplacianFilter
+--------------------------
+Creates a Laplacian operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
+
+    :param dstType: Output image type. Only the same type as ``src`` is supported for now.
+
+    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
+
+    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+.. seealso:: :ocv:func:`Laplacian`
+
+
+
+gpu::createSeparableLinearFilter
+--------------------------------
+Creates a separable linear filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source array type.
+
+    :param dstType: Destination array type.
+
+    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 32`` .
+
+    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 32`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`sepFilter2D`
+
+
+
+gpu::createDerivFilter
+----------------------
+Creates a generalized Deriv operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize = false, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order in respect of x.
+
+    :param dy: Derivative order in respect of y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param normalize: Flag indicating whether to normalize (scale down) the filter coefficients or not. See  :ocv:func:`getDerivKernels` for details.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+
+
+gpu::createSobelFilter
+----------------------
+Creates a Sobel operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order in respect of x.
+
+    :param dy: Derivative order in respect of y.
+
+    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Sobel`
+
+
+
+gpu::createScharrFilter
+-----------------------
+Creates a vertical or horizontal Scharr operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Order of the derivative x.
+
+    :param dy: Order of the derivative y.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Scharr`
+
+
+
+gpu::createGaussianFilter
+-------------------------
+Creates a Gaussian filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2 = 0, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`GaussianBlur`
+
+
+
+gpu::createMorphologyFilter
+---------------------------
 Creates a 2D morphological filter.
 
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
-
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`createMorphologyFilter`
-
-
-
-gpu::erode
---------------
-Erodes an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC1``  and  ``CV_8UC4``  types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param kernel: Structuring element used for erosion. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times erosion to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`erode`
-
-
-
-gpu::dilate
----------------
-Dilates an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-
-    :param dst: Destination image with the same size and type as ``src``.
-
-    :param kernel: Structuring element used for dilation. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times dilation to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`dilate`
-
-
-
-gpu::morphologyEx
----------------------
-Applies an advanced morphological operation to an image.
-
-.. ocv:function::  void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
+.. ocv:function:: Ptr<Filter> gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1)
 
     :param op: Type of morphological operation. The following types are possible:
 
+        * **MORPH_ERODE** erode
+
+        * **MORPH_DILATE** dilate
+
         * **MORPH_OPEN** opening
 
         * **MORPH_CLOSE** closing
@@ -357,363 +247,88 @@ Applies an advanced morphological operation to an image.
 
         * **MORPH_BLACKHAT** "black hat"
 
-    :param kernel: Structuring element.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
 
-    :param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
 
     :param iterations: Number of times erosion and dilation to be applied.
 
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
 .. seealso:: :ocv:func:`morphologyEx`
 
 
 
-gpu::createLinearFilter_GPU
--------------------------------
-Creates a non-separable linear filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dstType: Output image type. The same type as ``src`` is supported.
-
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:class:`gpu::Convolution`.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-.. seealso:: :ocv:func:`createLinearFilter`
-
-
-
-gpu::filter2D
------------------
-Applies the non-separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`filter2D`, :ocv:class:`gpu::Convolution`
-
-
-
-gpu::Laplacian
-------------------
-Applies the Laplacian operator to an image.
-
-.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image. The size and number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
-
-    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
-
-    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
-
-
-
-gpu::getLinearRowFilter_GPU
--------------------------------
-Creates a primitive row filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
-
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-
-
-
-gpu::getLinearColumnFilter_GPU
-----------------------------------
-Creates a primitive column filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
-
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` . For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside image.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::createSeparableLinearFilter_GPU
-----------------------------------------
-Creates a separable linear filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU`, cpp:ocv:func:`gpu::getLinearColumnFilter_GPU`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::sepFilter2D
---------------------
-Applies a separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param kernelX: Horizontal filter coefficients.
-
-    :param kernelY: Vertical filter coefficients.
-
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
-
-
-
-gpu::createDerivFilter_GPU
-------------------------------
-Creates a filter engine for the generalized Sobel operator.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
-
-
-
-gpu::Sobel
---------------
-Applies the generalized Sobel operator to an image.
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as source image.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
-
-
-
-gpu::Scharr
----------------
-Calculates the first x- or y- image derivative using the Scharr operator.
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` has.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Order of the derivative x.
-
-    :param dy: Order of the derivative y.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
-
-
-
-gpu::createGaussianFilter_GPU
----------------------------------
-Creates a Gaussian filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
-
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
-
-
-
-gpu::GaussianBlur
----------------------
-Smooths an image using the Gaussian filter.
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param ksize: Gaussian kernel size.  ``ksize.width``  and  ``ksize.height``  can differ but they both must be positive and odd. If they are zeros, they are computed from  ``sigma1``  and  ``sigma2`` .
-
-    :param sigma1: Gaussian kernel standard deviation in X direction.
-
-    :param sigma2: Gaussian kernel standard deviation in Y direction. If  ``sigma2``  is zero, it is set to be equal to  ``sigma1`` . If they are both zeros, they are computed from  ``ksize.width``  and  ``ksize.height``, respectively. See  :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of  ``ksize`` , ``sigma1`` , and  ``sigma2`` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
-
-
-
-gpu::getMaxFilter_GPU
--------------------------
+gpu::createBoxMaxFilter
+-----------------------
 Creates the maximum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 
 
-gpu::getMinFilter_GPU
--------------------------
+gpu::createBoxMinFilter
+-----------------------
 Creates the minimum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createRowSumFilter
+-----------------------
+Creates a horizontal 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createColumnSumFilter
+--------------------------
+Creates a vertical 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 582c55d999..b0ebfd73c5 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -48,221 +48,101 @@
 #endif
 
 #include "opencv2/core/gpu.hpp"
-#include "opencv2/core/base.hpp"
+#include "opencv2/imgproc.hpp"
 
 namespace cv { namespace gpu {
 
-/*!
-The Base Class for 1D or Row-wise Filters
-
-This is the base class for linear or non-linear filters that process 1D data.
-In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseRowFilter_GPU
+class CV_EXPORTS Filter : public Algorithm
 {
 public:
-    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseRowFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };
 
-/*!
-The Base Class for Column-wise Filters
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box Filter
 
-This is the base class for linear or non-linear filters that process columns of 2D arrays.
-Such filters are used for the "vertical" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseColumnFilter_GPU
-{
-public:
-    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseColumnFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-
-/*!
-The Base Class for Non-Separable 2D Filters.
-
-This is the base class for linear or non-linear 2D filters.
-*/
-class CV_EXPORTS BaseFilter_GPU
-{
-public:
-    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    Size ksize;
-    Point anchor;
-};
-
-/*!
-The Base Class for Filter Engine.
-
-The class can be used to apply an arbitrary filtering operation to an image.
-It contains all the necessary intermediate buffers.
-*/
-class CV_EXPORTS FilterEngine_GPU
-{
-public:
-    virtual ~FilterEngine_GPU() {}
-
-    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-};
-
-//! returns the non-separable filter engine with the specified filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
-
-//! returns the separable filter engine with the specified filters
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
-
-//! returns horizontal 1D box filter
-//! supports only CV_8UC1 source type and CV_32FC1 sum type
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
-
-//! returns vertical 1D box filter
-//! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-
-//! returns 2D box filter
-//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
-
-//! returns box filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-    const Point& anchor = Point(-1,-1));
-
-//! returns 2D morphological filter
-//! only MORPH_ERODE and MORPH_DILATE are supported
-//! supports CV_8UC1 and CV_8UC4 types
-//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-    Point anchor=Point(-1,-1));
-
-//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-
-//! returns 2D filter with the specified kernel
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-//! returns the non-separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
-
-//! returns the primitive row filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the primitive column filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-
-//! returns filter engine for the generalized Sobel operator
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns the Gaussian filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns maximum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! returns minimum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! smooths the image using the normalized box filter
+//! creates a normalized 2D box filter
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
+                                       int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-{
-    boxFilter(src, dst, -1, ksize, anchor, stream);
-}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
 
-//! erodes the image (applies the local minimum operator)
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                      Point anchor = Point(-1, -1), int iterations = 1,
-                      Stream& stream = Stream::Null());
+//! Creates a non-separable linear 2D filter
+//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! dilates the image (applies the local maximum operator)
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                       Point anchor = Point(-1, -1), int iterations = 1,
-                       Stream& stream = Stream::Null());
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
 
-//! applies an advanced morphological operation to the image
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
-                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-
-//! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
-//! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
-
-//! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies Laplacian operator to the image
+//! creates a Laplacian operator
 //! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
+                                             int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
+
+//! creates a separable linear filter
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+//! creates a generalized Deriv operator
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a Sobel operator
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a vertical or horizontal Scharr operator
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+//! creates a Gaussian filter
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
+                                            double sigma1, double sigma2 = 0,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+//! creates a 2D morphological filter
+//! supports CV_8UC1 and CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Image Rank Filter
+
+//! result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
+
+//! creates a horizontal 1D box filter
+//! supports only CV_8UC1 source type and CV_32FC1 sum type
+CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! creates a vertical 1D box filter
+//! supports only CV_8UC1 sum type and CV_32FC1 dst type
+CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 64cf4cc5db..6ad0998a5b 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -70,7 +70,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
+        cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(d_src.type(), -1, cv::Size(ksize, ksize));
+
+        TEST_CYCLE() blurFilter->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst, 1);
     }
@@ -84,6 +86,79 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+
+        TEST_CYCLE() filter2D->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(d_src.type(), -1, ksize);
+
+        TEST_CYCLE() laplacian->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -102,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+
+        TEST_CYCLE() sobel->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -135,9 +211,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+
+        TEST_CYCLE() scharr->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -169,9 +246,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+
+        TEST_CYCLE() gauss->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -185,39 +263,6 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -237,9 +282,10 @@ PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), ker);
+
+        TEST_CYCLE() erode->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -272,9 +318,10 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), ker);
+
+        TEST_CYCLE() dilate->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -312,10 +359,10 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf1;
-        cv::gpu::GpuMat d_buf2;
 
-        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
+        cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), ker);
+
+        TEST_CYCLE() morph->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -328,39 +375,3 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
         CPU_SANITY_CHECK(dst);
     }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    declare.in(kernel, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
diff --git a/modules/gpufilters/src/cuda/filter2d.cu b/modules/gpufilters/src/cuda/filter2d.cu
index 80c93c54ed..4e913124df 100644
--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@@ -48,111 +48,104 @@
 
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst,
+                             const float* __restrict__ kernel,
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY)
     {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
 
-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        sum_t res = VecTraits<sum_t>::all(0);
+        int kInd = 0;
+
+        for (int i = 0; i < kHeight; ++i)
         {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
-
-            for (int i = 0; i < kHeight; ++i)
-            {
-                for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
-            }
-
-            dst(y, x) = saturate_cast<D>(res);
+            for (int j = 0; j < kWidth; ++j)
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++];
         }
 
-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(16, 16); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    if (stream == 0) \
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
-
-        #undef IMPLEMENT_FILTER2D_TEX_READER
-
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                Filter2DCaller<T, D, BrdConstant>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
-                Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
-            };
-
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        dst(y, x) = saturate_cast<D>(res);
     }
+
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+            } \
+        }; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+        { \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+            { \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                dim3 block(16, 16); \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
+                cudaSafeCall( cudaGetLastError() ); \
+                if (stream == 0) \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+            } \
+        };
+
+    IMPLEMENT_FILTER2D_TEX_READER(uchar);
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(float);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+    #undef IMPLEMENT_FILTER2D_TEX_READER
+
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
+        };
+
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+    }
+
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index d40293d4ac..7f02bdac59 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -47,286 +47,45 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
-void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
+Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int, int, InputArray, InputArray, Point, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createDerivFilter(int, int, int, int, int, bool, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createSobelFilter(int, int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createBoxMinFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createRowSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createColumnSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
 #else
 
 namespace
 {
-    inline void normalizeAnchor(int& anchor, int ksize)
+    void normalizeAnchor(int& anchor, int ksize)
     {
         if (anchor < 0)
             anchor = ksize >> 1;
 
-        CV_Assert(0 <= anchor && anchor < ksize);
+        CV_Assert( 0 <= anchor && anchor < ksize );
     }
 
-    inline void normalizeAnchor(Point& anchor, const Size& ksize)
+    void normalizeAnchor(Point& anchor, Size ksize)
     {
         normalizeAnchor(anchor.x, ksize.width);
         normalizeAnchor(anchor.y, ksize.height);
     }
-
-    inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
-    {
-        if (roi == Rect(0,0,-1,-1))
-            roi = Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height);
-
-        CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
-    }
-
-    inline void normalizeKernel(const Mat& kernel, GpuMat& gpu_krnl, int type = CV_8U, int* nDivisor = 0, bool reverse = false)
-    {
-        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
-        if (nDivisor) *nDivisor = scale;
-
-        Mat temp(kernel.size(), type);
-        kernel.convertTo(temp, type, scale);
-        Mat cont_krnl = temp.reshape(1, 1);
-
-        if (reverse)
-        {
-            int count = cont_krnl.cols >> 1;
-            for (int i = 0; i < count; ++i)
-            {
-                std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-            }
-        }
-
-        gpu_krnl.upload(cont_krnl);
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-namespace
-{
-    struct Filter2DEngine_GPU : public FilterEngine_GPU
-    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
-            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
-        {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-        int srcType, dstType;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-
-namespace
-{
-    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
-    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf;
-        }
-
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_,
-                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf_;
-        }
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            ensureSizeIsEnough(src_size, bufType, *pbuf);
-
-            normalizeROI(roi, ksize, anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-            GpuMat bufROI = (*pbuf)(roi);
-
-            (*rowFilter)(srcROI, bufROI, stream);
-            (*columnFilter)(bufROI, dstROI, stream);
-        }
-
-        Ptr<BaseRowFilter_GPU> rowFilter;
-        Ptr<BaseColumnFilter_GPU> columnFilter;
-
-        int srcType, bufType, dstType;
-
-        Size ksize;
-        Point anchor;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// 1D Sum Filter
-
-namespace
-{
-    struct NppRowSumFilter : public BaseRowFilter_GPU
-    {
-        NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor)
-{
-    CV_Assert(srcType == CV_8UC1 && sumType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new NppRowSumFilter(ksize, anchor));
-}
-
-namespace
-{
-    struct NppColumnSumFilter : public BaseColumnFilter_GPU
-    {
-        NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor)
-{
-    CV_Assert(sumType == CV_8UC1 && dstType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -334,328 +93,83 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy
 
 namespace
 {
-    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPBoxFilter : public BaseFilter_GPU
+    class NPPBoxFilter : public Filter
     {
-        NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
+    public:
+        NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    private:
+        typedef NppStatus (*nppFilterBox_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
+                                            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
 
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterBox_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPBoxFilter(ksize, anchor, nppFilterBox_callers[CV_MAT_CN(srcType)]));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
-{
-    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
-    return createFilter2D_GPU(boxFilter, srcType, dstType);
-}
-
-void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
-{
-    int sdepth = src.depth(), cn = src.channels();
-    if( ddepth < 0 )
-        ddepth = sdepth;
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
-
-    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
-    f->apply(src, dst, Rect(0,0,-1,-1), stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Morphology Filter
-
-namespace
-{
-    typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);
-
-    struct NPPMorphFilter : public BaseFilter_GPU
-    {
-        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        nppMorfFilter_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
-{
-    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
-    {
-        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
-        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
+        Size ksize_;
+        Point anchor_;
+        int type_;
+        nppFilterBox_t func_;
+        int borderMode_;
+        Scalar borderVal_;
+        GpuMat srcBorder_;
     };
 
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
-    CV_Assert(type == CV_8UC1 || type == CV_8UC4);
-
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl);
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
-}
-
-namespace
-{
-    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
+    NPPBoxFilter::NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
+        ksize_(ksize), anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
     {
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf;
-        }
+        static const nppFilterBox_t funcs[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
 
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf_;
-        }
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+        CV_Assert( dstType == srcType );
 
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == type);
+        normalizeAnchor(anchor_, ksize);
 
-            Size src_size = src.size();
-
-            dst.create(src_size, type);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            if (iters > 1)
-                pbuf->create(src_size, type);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-
-            for(int i = 1; i < iters; ++i)
-            {
-                dst.swap((*pbuf));
-
-                dstROI = dst(roi);
-                GpuMat bufROI = (*pbuf)(roi);
-
-                (*filter2D)(bufROI, dstROI, stream);
-            }
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-
-        int type;
-        int iters;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
-}
-
-namespace
-{
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
-    {
-        Mat kernel;
-        Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
-
-        normalizeAnchor(anchor, ksize);
-
-        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
-        {
-            src.copyTo(dst, stream);
-            return;
-        }
-
-        dst.create(src.size(), src.type());
-
-        if (!_kernel.data)
-        {
-            kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
-            anchor = Point(iterations, iterations);
-            iterations = 1;
-        }
-        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
-        {
-            anchor = Point(anchor.x * iterations, anchor.y * iterations);
-            kernel = getStructuringElement(MORPH_RECT,
-                                           Size(ksize.width + (iterations - 1) * (ksize.width - 1),
-                                                ksize.height + (iterations - 1) * (ksize.height - 1)),
-                                           anchor);
-            iterations = 1;
-        }
-        else
-            kernel = _kernel;
-
-        Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);
-
-        f->apply(src, dst, Rect(0,0,-1,-1), stream);
+        func_ = funcs[CV_MAT_CN(srcType)];
     }
 
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
+    void NPPBoxFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
     {
-        GpuMat buf;
-        morphOp(op, src, dst, _kernel, buf, anchor, iterations);
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                           dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
-}
+    if (dstType < 0)
+        dstType = srcType;
 
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
-}
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
 
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
-}
-
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
-{
-    GpuMat buf1;
-    GpuMat buf2;
-    morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
-{
-    switch( op )
-    {
-    case MORPH_ERODE:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_DILATE:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_OPEN:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_CLOSE:
-        dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
-        erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-#ifdef HAVE_OPENCV_GPUARITHM
-    case MORPH_GRADIENT:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(dst, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_TOPHAT:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(src, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_BLACKHAT:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(buf2, src, dst, GpuMat(), -1, stream);
-        break;
-#endif
-
-    default:
-        CV_Error(cv::Error::StsBadArg, "unknown morphological operation");
-    }
+    return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -663,165 +177,127 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
 
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream);
-    }
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream);
 }}}
 
 namespace
 {
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
-
-    struct NPPLinearFilter : public BaseFilter_GPU
+    class LinearFilter : public Filter
     {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+    public:
+        LinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter2D_t func;
-    };
-
-    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+    private:
+        typedef void (*filter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                                   int kWidth, int kHeight, int anchorX, int anchorY,
                                    int borderMode, const float* borderValue, cudaStream_t stream);
 
-    struct GpuFilter2D : public BaseFilter_GPU
-    {
-        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
-            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
-        {
-        }
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-        {
-            using namespace cv::gpu::cudev::imgproc;
-
-            Point ofs;
-            Size wholeSize;
-            src.locateROI(wholeSize, ofs);
-            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-
-            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
-            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
-        }
-
-        gpuFilter2D_t func;
-        GpuMat kernel;
-        int brd_type;
+        GpuMat kernel_;
+        Point anchor_;
+        int type_;
+        filter2D_t func_;
+        int borderMode_;
+        Scalar_<float> borderVal_;
     };
-}
 
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    int sdepth = CV_MAT_DEPTH(srcType);
-    int scn = CV_MAT_CN(srcType);
-
-    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
-    CV_Assert(scn == 1 || scn == 4);
-    CV_Assert(dstType == srcType);
-    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
-
-    Size ksize = kernel.size();
-
-#if 0
-    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
+    LinearFilter::LinearFilter(int srcType, int dstType, InputArray _kernel, Point anchor, int borderMode, Scalar borderVal) :
+        anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
     {
-        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int scn = CV_MAT_CN(srcType);
 
-        GpuMat gpu_krnl;
-        int nDivisor;
-        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
+        Mat kernel = _kernel.getMat();
 
-        normalizeAnchor(anchor, ksize);
+        CV_Assert( sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F );
+        CV_Assert( scn == 1 || scn == 4 );
+        CV_Assert( dstType == srcType );
+        CV_Assert( kernel.channels() == 1 );
+        CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
 
-        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
-    }
-#endif
+        Mat kernel32F;
+        kernel.convertTo(kernel32F, CV_32F);
 
-    CV_Assert(ksize.width * ksize.height <= 16 * 16);
+        kernel_ = gpu::createContinuous(kernel.size(), CV_32FC1);
+        kernel_.upload(kernel32F);
 
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl, CV_32F);
+        normalizeAnchor(anchor_, kernel.size());
 
-    normalizeAnchor(anchor, ksize);
-
-    gpuFilter2D_t func = 0;
-
-    switch (srcType)
-    {
-    case CV_8UC1:
-        func = filter2D_gpu<uchar, uchar>;
-        break;
-    case CV_8UC4:
-        func = filter2D_gpu<uchar4, uchar4>;
-        break;
-    case CV_16UC1:
-        func = filter2D_gpu<ushort, ushort>;
-        break;
-    case CV_16UC4:
-        func = filter2D_gpu<ushort4, ushort4>;
-        break;
-    case CV_32FC1:
-        func = filter2D_gpu<float, float>;
-        break;
-    case CV_32FC4:
-        func = filter2D_gpu<float4, float4>;
-        break;
+        switch (srcType)
+        {
+        case CV_8UC1:
+            func_ = cudev::filter2D<uchar, uchar>;
+            break;
+        case CV_8UC4:
+            func_ = cudev::filter2D<uchar4, uchar4>;
+            break;
+        case CV_16UC1:
+            func_ = cudev::filter2D<ushort, ushort>;
+            break;
+        case CV_16UC4:
+            func_ = cudev::filter2D<ushort4, ushort4>;
+            break;
+        case CV_32FC1:
+            func_ = cudev::filter2D<float, float>;
+            break;
+        case CV_32FC4:
+            func_ = cudev::filter2D<float4, float4>;
+            break;
+        }
     }
 
-    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, brd_type));
+    void LinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        Point ofs;
+        Size wholeSize;
+        src.locateROI(wholeSize, ofs);
+
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart);
+
+        func_(srcWhole, ofs.x, ofs.y, dst, kernel_.ptr<float>(),
+              kernel_.cols, kernel_.rows, anchor_.x, anchor_.y,
+              borderMode_, borderVal_.val, StreamAccessor::getStream(_stream));
+    }
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
+Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal)
 {
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
+    if (dstType < 0)
+        dstType = srcType;
 
-    return createFilter2D_GPU(linearFilter, srcType, dstType);
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
+    return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
 }
 
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
+
+Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize, double scale, int borderMode, Scalar borderVal)
 {
-    if (ddepth < 0)
-        ddepth = src.depth();
+    CV_Assert( ksize == 1 || ksize == 3 );
 
-    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
+    static const float K[2][9] =
+    {
+        {0.0f, 1.0f, 0.0f, 1.0f, -4.0f, 1.0f, 0.0f, 1.0f, 0.0f},
+        {2.0f, 0.0f, 2.0f, 0.0f, -8.0f, 0.0f, 2.0f, 0.0f, 2.0f}
+    };
 
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
+    Mat kernel(3, 3, CV_32FC1, (void*)K[ksize == 3]);
+    if (scale != 1)
+        kernel *= scale;
 
-    dst.create(src.size(), dst_type);
-
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -838,294 +314,130 @@ namespace filter
 
 namespace
 {
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
-        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-
-    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    struct NppLinearRowFilter : public BaseRowFilter_GPU
+    class SeparableLinearFilter : public Filter
     {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+    public:
+        SeparableLinearFilter(int srcType, int dstType,
+                              InputArray rowKernel, InputArray columnKernel,
+                              Point anchor, int rowBorderMode, int columnBorderMode);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+
+        int srcType_, bufType_, dstType_;
+        GpuMat rowKernel_, columnKernel_;
+        func_t rowFilter_, columnFilter_;
+        Point anchor_;
+        int rowBorderMode_, columnBorderMode_;
+
+        GpuMat buf_;
+    };
+
+    SeparableLinearFilter::SeparableLinearFilter(int srcType, int dstType,
+                                                 InputArray _rowKernel, InputArray _columnKernel,
+                                                 Point anchor, int rowBorderMode, int columnBorderMode) :
+        srcType_(srcType), dstType_(dstType), anchor_(anchor), rowBorderMode_(rowBorderMode), columnBorderMode_(columnBorderMode)
+    {
+        static const func_t rowFilterFuncs[7][4] =
         {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+            {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
+            {0, 0, 0, 0},
+            {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
+            {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
+            {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
+            {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+            {0, 0, 0, 0}
+        };
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearRowFilter : public BaseRowFilter_GPU
-    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        static const func_t columnFilterFuncs[7][4] =
         {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
+            {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+            {0, 0, 0, 0},
+            {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
+            {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+            {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
+            {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+            {0, 0, 0, 0}
+        };
 
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int cn = CV_MAT_CN(srcType);
+        const int ddepth = CV_MAT_DEPTH(dstType);
 
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        {0, 0, 0, 0},
-        {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-        {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
-        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
-    };
+        Mat rowKernel = _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.getMat();
 
-    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
+        CV_Assert( sdepth <= CV_64F && cn <= 4 );
+        CV_Assert( rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
+        CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
 
-        GpuMat gpu_row_krnl;
-        int nDivisor;
-        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
+        Mat kernel32F;
 
-        const int ksize = gpu_row_krnl.cols;
-        normalizeAnchor(anchor, ksize);
+        rowKernel.convertTo(kernel32F, CV_32F);
+        rowKernel_.upload(kernel32F.reshape(1, 1));
 
-        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, npp_funcs[CV_MAT_CN(srcType)]));
+        columnKernel.convertTo(kernel32F, CV_32F);
+        columnKernel_.upload(kernel32F.reshape(1, 1));
+
+        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
+        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
+
+        normalizeAnchor(anchor_.x, rowKernel_.cols);
+        normalizeAnchor(anchor_.y, columnKernel_.cols);
+
+        bufType_ = CV_MAKE_TYPE(CV_32F, cn);
+
+        rowFilter_ = rowFilterFuncs[sdepth][cn - 1];
+        CV_Assert( rowFilter_ != 0 );
+
+        columnFilter_ = columnFilterFuncs[ddepth][cn - 1];
+        CV_Assert( columnFilter_ != 0 );
     }
 
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int sdepth = CV_MAT_DEPTH(srcType);
-    const int cn = CV_MAT_CN(srcType);
-    CV_Assert( sdepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    const gpuFilter1D_t func = funcs[sdepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_row_krnl;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);
-
-    const int ksize = gpu_row_krnl.cols;
-    CV_Assert( ksize > 0 && ksize <= 32 );
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, borderType));
-}
-
-namespace
-{
-    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
+    void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
     {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+        ensureSizeIsEnough(src.size(), bufType_, buf_);
 
-            NppStreamHandler h(stream);
+        DeviceInfo devInfo;
+        const int cc = devInfo.major() * 10 + devInfo.minor();
 
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            if (ksize > 16 && cc < 20)
-                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");
-
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
-        {0, 0, 0, 0},
-        {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
-        {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
-        {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
-        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
-    };
-
-    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_col_krnl;
-        int nDivisor;
-        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_col_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, npp_funcs[CV_MAT_CN(bufType)]));
+        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
+        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
     }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int ddepth = CV_MAT_DEPTH(dstType);
-    const int cn = CV_MAT_CN(dstType);
-    CV_Assert( ddepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    gpuFilter1D_t func = funcs[ddepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_col_krnl;
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);
-
-    const int ksize = gpu_col_krnl.cols;
-    CV_Assert(ksize > 0 && ksize <= 32);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, borderType));
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor, int rowBorderMode, int columnBorderMode)
 {
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
+    if (dstType < 0)
+        dstType = srcType;
 
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
 
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
+    if (columnBorderMode < 0)
+        columnBorderMode = rowBorderMode;
 
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                          Point anchor, int rowBorderType, int columnBorderType)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                          Point anchor, int rowBorderType, int columnBorderType,
-                          Stream& stream)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    return new SeparableLinearFilter(srcType, dstType, rowKernel, columnKernel, anchor, rowBorderMode, columnBorderMode);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize, double scale, int rowBorderMode, int columnBorderMode)
 {
     Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
+    getDerivKernels(kx, ky, dx, dy, ksize, normalize, CV_32F);
 
     if (scale != 1)
     {
@@ -1137,55 +449,25 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
+    return gpu::createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1, -1), rowBorderMode, columnBorderMode);
 }
 
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize, double scale, int rowBorderMode, int columnBorderMode)
 {
-    GpuMat buf;
-    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, ksize, false, scale, rowBorderMode, columnBorderMode);
 }
 
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+Ptr<Filter> cv::gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale, int rowBorderMode, int columnBorderMode)
 {
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-
-    if( scale != 1 )
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    CV_Assert(ksize == 1 || ksize == 3);
-
-    static const int K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
-    if (scale != 1)
-        kernel *= scale;
-
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, -1, false, scale, rowBorderMode, columnBorderMode);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2, int rowBorderMode, int columnBorderMode)
 {
-    int depth = CV_MAT_DEPTH(type);
+    const int depth = CV_MAT_DEPTH(srcType);
 
     if (sigma2 <= 0)
         sigma2 = sigma1;
@@ -1201,70 +483,298 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
     sigma1 = std::max(sigma1, 0.0);
     sigma2 = std::max(sigma2, 0.0);
 
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
+    Mat kx = getGaussianKernel(ksize.width, sigma1, CV_32F);
     Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
         ky = kx;
     else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
+        ky = getGaussianKernel(ksize.height, sigma2, CV_32F);
 
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
+    return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+namespace
 {
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    if (ksize.width == 1 && ksize.height == 1)
+    class MorphologyFilter : public Filter
     {
-        src.copyTo(dst);
-        return;
+    public:
+        MorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppMorfFilter_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor);
+
+        int type_;
+        GpuMat kernel_;
+        Point anchor_;
+        int iters_;
+        nppMorfFilter_t func_;
+
+        GpuMat srcBorder_;
+        GpuMat buf_;
+    };
+
+    MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) :
+        type_(srcType), anchor_(anchor), iters_(iterations)
+    {
+        static const nppMorfFilter_t funcs[2][5] =
+        {
+            {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
+            {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
+        };
+
+        CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE );
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+
+        Mat kernel = _kernel.getMat();
+        Size ksize = !kernel.empty() ? _kernel.size() : Size(3, 3);
+
+        normalizeAnchor(anchor_, ksize);
+
+        if (kernel.empty())
+        {
+            kernel = getStructuringElement(MORPH_RECT, Size(1 + iters_ * 2, 1 + iters_ * 2));
+            anchor_ = Point(iters_, iters_);
+            iters_ = 1;
+        }
+        else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total())
+        {
+            anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
+            kernel = getStructuringElement(MORPH_RECT,
+                                           Size(ksize.width + (iters_ - 1) * (ksize.width - 1),
+                                                ksize.height + (iters_ - 1) * (ksize.height - 1)),
+                                           anchor_);
+            iters_ = 1;
+        }
+
+        CV_Assert( kernel.channels() == 1 );
+
+        Mat kernel8U;
+        kernel.convertTo(kernel8U, CV_8U);
+
+        kernel_ = gpu::createContinuous(kernel.size(), CV_8UC1);
+        kernel_.upload(kernel8U);
+
+        func_ = funcs[op][CV_MAT_CN(srcType)];
     }
 
-    dst.create(src.size(), src.type());
+    void MorphologyFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
 
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
+        Size ksize = kernel_.size();
+        gpu::copyMakeBorder(src, srcBorder_, ksize.height, ksize.height, ksize.width, ksize.width, BORDER_DEFAULT, Scalar(), _stream);
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+
+        GpuMat bufRoi;
+        if (iters_ > 1)
+        {
+            ensureSizeIsEnough(srcBorder_.size(), type_, buf_);
+            buf_.setTo(Scalar::all(0), _stream);
+            bufRoi = buf_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+        }
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize.height;
+        oMaskSize.width = ksize.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+
+        for(int i = 1; i < iters_; ++i)
+        {
+            dst.copyTo(bufRoi, _stream);
+
+            nppSafeCall( func_(bufRoi.ptr<Npp8u>(), static_cast<int>(bufRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                               oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+        }
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
+namespace
 {
-    if (ksize.width == 1 && ksize.height == 1)
+    class MorphologyExFilter : public Filter
     {
-        src.copyTo(dst);
-        return;
+    public:
+        MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+    protected:
+        Ptr<gpu::Filter> erodeFilter_, dilateFilter_;
+        GpuMat buf_;
+    };
+
+    MorphologyExFilter::MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations)
+    {
+        erodeFilter_ = gpu::createMorphologyFilter(MORPH_ERODE, srcType, kernel, anchor, iterations);
+        dilateFilter_ = gpu::createMorphologyFilter(MORPH_DILATE, srcType, kernel, anchor, iterations);
     }
 
-    dst.create(src.size(), src.type());
+    // MORPH_OPEN
 
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    class MorphologyOpenFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyOpenFilter::MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyOpenFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(buf_, dst, stream);
+    }
+
+    // MORPH_CLOSE
+
+    class MorphologyCloseFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyCloseFilter::MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyCloseFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, buf_, stream);
+        erodeFilter_->apply(buf_, dst, stream);
+    }
+
+    // MORPH_GRADIENT
+
+    class MorphologyGradientFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyGradientFilter::MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyGradientFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream);
+        dilateFilter_->apply(src, dst, stream);
+        gpu::subtract(dst, buf_, dst, noArray(), -1, stream);
+    }
+
+    // MORPH_TOPHAT
+
+    class MorphologyTophatFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyTophatFilter::MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyTophatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, dst, stream);
+        dilateFilter_->apply(dst, buf_, stream);
+        gpu::subtract(src, buf_, dst, noArray(), -1, stream);
+    }
+
+    // MORPH_BLACKHAT
+
+    class MorphologyBlackhatFilter : public MorphologyExFilter
+    {
+    public:
+        MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyBlackhatFilter::MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations)
+    {
+    }
+
+    void MorphologyBlackhatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, dst, stream);
+        erodeFilter_->apply(dst, buf_, stream);
+        gpu::subtract(buf_, src, dst, noArray(), -1, stream);
+    }
+}
+
+Ptr<Filter> cv::gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations)
+{
+    switch( op )
+    {
+    case MORPH_ERODE:
+    case MORPH_DILATE:
+        return new MorphologyFilter(op, srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_OPEN:
+        return new MorphologyOpenFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_CLOSE:
+        return new MorphologyCloseFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_GRADIENT:
+        return new MorphologyGradientFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_TOPHAT:
+        return new MorphologyTophatFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    case MORPH_BLACKHAT:
+        return new MorphologyBlackhatFilter(srcType, kernel, anchor, iterations);
+        break;
+
+    default:
+        CV_Error(Error::StsBadArg, "Unknown morphological operation");
+        return Ptr<Filter>();
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1272,59 +782,217 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& b
 
 namespace
 {
-    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPRankFilter : public BaseFilter_GPU
+    enum
     {
-        NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterRank_t func;
+        RANK_MAX,
+        RANK_MIN
     };
+
+    class NPPRankFilter : public Filter
+    {
+    public:
+        NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             NppiSize oMaskSize, NppiPoint oAnchor);
+
+        int type_;
+        Size ksize_;
+        Point anchor_;
+        int borderMode_;
+        Scalar borderVal_;
+        nppFilterRank_t func_;
+
+        GpuMat srcBorder_;
+    };
+
+    NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
+        type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
+        static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+
+        normalizeAnchor(anchor_, ksize_);
+
+        if (op == RANK_MAX)
+            func_ = maxFuncs[CV_MAT_CN(srcType)];
+        else
+            func_ = minFuncs[CV_MAT_CN(srcType)];
+    }
+
+    void NPPRankFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
+    return new NPPRankFilter(RANK_MAX, srcType, ksize, anchor, borderMode, borderVal);
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+Ptr<Filter> cv::gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+    return new NPPRankFilter(RANK_MIN, srcType, ksize, anchor, borderMode, borderVal);
+}
 
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
 
-    normalizeAnchor(anchor, ksize);
+namespace
+{
+    class NppRowSumFilter : public Filter
+    {
+    public:
+        NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
 
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int srcType_, dstType_;
+        int ksize_;
+        int anchor_;
+        int borderMode_;
+        Scalar borderVal_;
+
+        GpuMat srcBorder_;
+    };
+
+    NppRowSumFilter::NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppRowSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, 0, 0, ksize_, ksize_, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_, 0, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
+{
+    return new NppRowSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
+}
+
+namespace
+{
+    class NppColumnSumFilter : public Filter
+    {
+    public:
+        NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int srcType_, dstType_;
+        int ksize_;
+        int anchor_;
+        int borderMode_;
+        Scalar borderVal_;
+
+        GpuMat srcBorder_;
+    };
+
+    NppColumnSumFilter::NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) :
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 );
+        CV_Assert( dstType_ == CV_32FC1 );
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppColumnSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_, ksize_, 0, 0, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(0, ksize_, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                   dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                   oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal)
+{
+    return new NppColumnSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
 #endif
diff --git a/modules/gpufilters/src/precomp.hpp b/modules/gpufilters/src/precomp.hpp
index 3add0f2af1..c3d5e020d2 100644
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@@ -46,14 +46,9 @@
 #include <limits>
 
 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 
 #include "opencv2/core/private.gpu.hpp"
 
-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_GPUARITHM
-#  include "opencv2/gpuarithm.hpp"
-#endif
-
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 5adcd87a41..03bea05e6d 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -70,13 +70,14 @@ namespace
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Blur
 
-PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
     int type;
     cv::Size ksize;
     cv::Point anchor;
+    int borderType;
     bool useRoi;
 
     virtual void SetUp()
@@ -86,7 +87,8 @@ PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, Use
         type = GET_PARAM(2);
         ksize = GET_PARAM(3);
         anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -96,13 +98,15 @@ GPU_TEST_P(Blur, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(src.type(), -1, ksize, anchor, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::blur(loadMat(src, useRoi), dst, ksize, anchor);
+    blurFilter->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
-    cv::blur(src, dst_gold, ksize, anchor);
+    cv::blur(src, dst_gold, ksize, anchor, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
@@ -111,6 +115,173 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
     testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Filter2D, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
+
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Laplacian, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(src.type(), -1, ksize.width);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    laplacian->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter
+
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    int type;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+GPU_TEST_P(SeparableLinearFilter, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(Size(ksize.width, 1), CV_32FC1, 0.0, 1.0);
+    cv::Mat columnKernel = randomMat(Size(ksize.height, 1), CV_32FC1, 0.0, 1.0);
+
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)),
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@@ -218,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@@ -277,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
     double sigma1 = randomDouble(0.1, 1.0);
     double sigma2 = randomDouble(0.1, 1.0);
 
-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);
 
-        cv::Mat dst_gold;
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    gauss->apply(loadMat(src, useRoi), dst);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
-    }
+    cv::Mat dst_gold;
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@@ -327,49 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -400,8 +519,10 @@ GPU_TEST_P(Erode, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::erode(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    erode->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::erode(src, dst_gold, kernel, anchor, iterations);
@@ -449,8 +570,10 @@ GPU_TEST_P(Dilate, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::dilate(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    dilate->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::dilate(src, dst_gold, kernel, anchor, iterations);
@@ -502,8 +625,10 @@ GPU_TEST_P(MorphEx, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::morphologyEx(loadMat(src, useRoi), dst, morphOp, kernel, anchor, iterations);
+    morph->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::morphologyEx(src, dst_gold, morphOp, kernel, anchor, iterations);
@@ -522,56 +647,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
     testing::Values(Iterations(1), Iterations(2), Iterations(3)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index cf1b8e6706..3fe9f82f4c 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
     GpuMat mag;
     GpuMat map;
     GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };
 
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
diff --git a/modules/gpuimgproc/src/canny.cpp b/modules/gpuimgproc/src/canny.cpp
index 8d361fe50f..9a33575648 100644
--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 
         if (apperture_size != 3)
         {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
         }
     }
 
@@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
     }
     else
     {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);
 
         calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
diff --git a/modules/gpuimgproc/src/corners.cpp b/modules/gpuimgproc/src/corners.cpp
index 44dc1505d4..824a3308ee 100644
--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@@ -70,6 +70,8 @@ namespace
 {
     void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
     {
+        (void) buf;
+
         double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
 
         if (ksize < 0)
@@ -83,16 +85,21 @@ namespace
         Dx.create(src.size(), CV_32F);
         Dy.create(src.size(), CV_32F);
 
+        Ptr<gpu::Filter> filterDx, filterDy;
+
         if (ksize > 0)
         {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
         }
         else
         {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
         }
+
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
     }
 }
 
diff --git a/modules/superres/src/btv_l1_gpu.cpp b/modules/superres/src/btv_l1_gpu.cpp
index 6813187c45..7b2ad73700 100644
--- a/modules/superres/src/btv_l1_gpu.cpp
+++ b/modules/superres/src/btv_l1_gpu.cpp
@@ -230,7 +230,7 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
-        std::vector<Ptr<FilterEngine_GPU> > filters_;
+        std::vector<Ptr<gpu::Filter> > filters_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
@@ -299,7 +299,7 @@ namespace
         {
             filters_.resize(src.size());
             for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                filters_[i] = gpu::createGaussianFilter(src[0].type(), -1, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
             curBlurKernelSize_ = blurKernelSize_;
             curBlurSigma_ = blurSigma_;
             curSrcType_ = src[0].type();
@@ -346,7 +346,7 @@ namespace
                 // a = M * Ih
                 gpu::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
                 // b = HM * Ih
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // c = DHF * Ih
                 gpu::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST, streams_[k]);
 
@@ -355,7 +355,7 @@ namespace
                 // a = Dt * diff
                 upscale(c_[k], a_[k], scale_, streams_[k]);
                 // b = HtDt * diff
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // diffTerm = MtHtDt * diff
                 gpu::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
             }
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 87b5255990..1815cc6dec 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -308,6 +308,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
     gpu::split(tmp2, vI2);
     Scalar mssim;
 
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
+
     for( int i = 0; i < gI1.channels(); ++i )
     {
         gpu::GpuMat I2_2, I1_2, I1_I2;
@@ -318,8 +320,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         /*************************** END INITS **********************************/
         gpu::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
-        gpu::GaussianBlur(vI1[i], mu1, Size(11, 11), 1.5);
-        gpu::GaussianBlur(vI2[i], mu2, Size(11, 11), 1.5);
+        gauss->apply(vI1[i], mu1);
+        gauss->apply(vI2[i], mu2);
 
         gpu::GpuMat mu1_2, mu2_2, mu1_mu2;
         gpu::multiply(mu1, mu1, mu1_2);
@@ -328,13 +330,13 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         gpu::GpuMat sigma1_2, sigma2_2, sigma12;
 
-        gpu::GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
+        gauss->apply(I1_2, sigma1_2);
         gpu::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
 
-        gpu::GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
+        gauss->apply(I2_2, sigma2_2);
         gpu::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
 
-        gpu::GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
+        gauss->apply(I1_I2, sigma12);
         gpu::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
 
         ///////////////////////////////// FORMULA ////////////////////////////////
@@ -375,7 +377,7 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
     gpu::split(b.t2, b.vI2, stream);
     Scalar mssim;
 
-    gpu::GpuMat buf;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
@@ -383,22 +385,22 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
         gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
         gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
-        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
-        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI1[i], b.mu1, stream);
+        gauss->apply(b.vI2[i], b.mu2, stream);
 
         gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
         gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
         gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
-        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_2, b.sigma1_2, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
 
-        gpu::GaussianBlur(b.I2_2, b.sigma2_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I2_2, b.sigma2_2, stream);
         gpu::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, gpu::GpuMat(), -1, stream);
         //b.sigma2_2 -= b.mu2_2;
 
-        gpu::GaussianBlur(b.I1_I2, b.sigma12, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_I2, b.sigma12, stream);
         gpu::subtract(b.sigma12, b.mu1_mu2, b.sigma12, gpu::GpuMat(), -1, stream);
         //b.sigma12 -= b.mu1_mu2;
 
diff --git a/samples/gpu/morphology.cpp b/samples/gpu/morphology.cpp
index 1ed8f96dc9..abc6c980b0 100644
--- a/samples/gpu/morphology.cpp
+++ b/samples/gpu/morphology.cpp
@@ -1,120 +1,186 @@
+#include <iostream>
 
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include <stdlib.h>
-#include <stdio.h>
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuimgproc.hpp"
 
 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
 
-static void help()
+class App
 {
+public:
+    App(int argc, const char* argv[]);
 
-printf("\nShow off image morphology: erosion, dialation, open and close\n"
-    "Call:\n   morphology2 [image]\n"
-    "This program also shows use of rect, elipse and cross kernels\n\n");
-printf( "Hot keys: \n"
-    "\tESC - quit the program\n"
-    "\tr - use rectangle structuring element\n"
-    "\te - use elliptic structuring element\n"
-    "\tc - use cross-shaped structuring element\n"
-    "\tSPACE - loop through all the options\n" );
-}
+    int run();
 
-GpuMat src, dst;
+private:
+    void help();
 
-int element_shape = MORPH_RECT;
+    void OpenClose();
+    void ErodeDilate();
 
-//the address of variable which receives trackbar position update
-int max_iters = 10;
-int open_close_pos = 0;
-int erode_dilate_pos = 0;
+    static void OpenCloseCallback(int, void*);
+    static void ErodeDilateCallback(int, void*);
 
-// callback function for open/close trackbar
-static void OpenClose(int, void*)
+    gpu::GpuMat src, dst;
+
+    int element_shape;
+
+    int max_iters;
+    int open_close_pos;
+    int erode_dilate_pos;
+};
+
+App::App(int argc, const char* argv[])
 {
-    int n = open_close_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::morphologyEx(src, dst, MORPH_OPEN, element);
-    else
-        cv::gpu::morphologyEx(src, dst, MORPH_CLOSE, element);
-    imshow("Open/Close",(Mat)dst);
-}
+    element_shape = MORPH_RECT;
+    open_close_pos = erode_dilate_pos = max_iters = 10;
 
-// callback function for erode/dilate trackbar
-static void ErodeDilate(int, void*)
-{
-    int n = erode_dilate_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::erode(src, dst, element);
-    else
-        cv::gpu::dilate(src, dst, element);
-    imshow("Erode/Dilate",(Mat)dst);
-}
-
-
-int main( int argc, char** argv )
-{
-    char* filename = argc == 2 ? argv[1] : (char*)"baboon.jpg";
-    if (string(argv[1]) == "--help")
+    if (argc == 2 && String(argv[1]) == "--help")
     {
         help();
-        return -1;
+        exit(0);
     }
 
-    src.upload(imread(filename, 1));
-    if (src.empty())
+    String filename = argc == 2 ? argv[1] : "baboon.jpg";
+
+    Mat img = imread(filename);
+    if (img.empty())
     {
-        help();
-        return -1;
+        cerr << "Can't open image " << filename.c_str() << endl;
+        exit(-1);
     }
 
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    help();
-
-
+    src.upload(img);
     if (src.channels() == 3)
     {
         // gpu support only 4th channel images
-        GpuMat src4ch;
-        cv::gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
+        gpu::GpuMat src4ch;
+        gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
         src = src4ch;
     }
 
-    //create windows for output images
-    namedWindow("Open/Close",1);
-    namedWindow("Erode/Dilate",1);
+    help();
 
-    open_close_pos = erode_dilate_pos = max_iters;
-    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
-    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
+    gpu::printShortCudaDeviceInfo(gpu::getDevice());
+}
+
+int App::run()
+{
+    // create windows for output images
+    namedWindow("Open/Close");
+    namedWindow("Erode/Dilate");
+
+    createTrackbar("iterations", "Open/Close", &open_close_pos, max_iters * 2 + 1, OpenCloseCallback, this);
+    createTrackbar("iterations", "Erode/Dilate", &erode_dilate_pos, max_iters * 2 + 1, ErodeDilateCallback, this);
 
     for(;;)
     {
-        int c;
+        OpenClose();
+        ErodeDilate();
 
-        OpenClose(open_close_pos, 0);
-        ErodeDilate(erode_dilate_pos, 0);
-        c = waitKey();
+        char c = (char) waitKey();
 
-        if( (char)c == 27 )
+        switch (c)
+        {
+        case 27:
+            return 0;
             break;
-        if( (char)c == 'e' )
+
+        case 'e':
             element_shape = MORPH_ELLIPSE;
-        else if( (char)c == 'r' )
+            break;
+
+        case 'r':
             element_shape = MORPH_RECT;
-        else if( (char)c == 'c' )
+            break;
+
+        case 'c':
             element_shape = MORPH_CROSS;
-        else if( (char)c == ' ' )
+            break;
+
+        case ' ':
             element_shape = (element_shape + 1) % 3;
+            break;
+        }
+    }
+}
+
+void App::help()
+{
+    cout << "Show off image morphology: erosion, dialation, open and close \n";
+    cout << "Call: \n";
+    cout << "   gpu-example-morphology [image] \n";
+    cout << "This program also shows use of rect, elipse and cross kernels \n" << endl;
+
+    cout << "Hot keys: \n";
+    cout << "\tESC - quit the program \n";
+    cout << "\tr - use rectangle structuring element \n";
+    cout << "\te - use elliptic structuring element \n";
+    cout << "\tc - use cross-shaped structuring element \n";
+    cout << "\tSPACE - loop through all the options \n" << endl;
+}
+
+void App::OpenClose()
+{
+    int n = open_close_pos - max_iters;
+    int an = n > 0 ? n : -n;
+
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> openFilter = gpu::createMorphologyFilter(MORPH_OPEN, src.type(), element);
+        openFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> closeFilter = gpu::createMorphologyFilter(MORPH_CLOSE, src.type(), element);
+        closeFilter->apply(src, dst);
     }
 
-    return 0;
+    Mat h_dst(dst);
+    imshow("Open/Close", h_dst);
+}
+
+void App::ErodeDilate()
+{
+    int n = erode_dilate_pos - max_iters;
+    int an = n > 0 ? n : -n;
+
+    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an));
+
+    if (n < 0)
+    {
+        Ptr<gpu::Filter> erodeFilter = gpu::createMorphologyFilter(MORPH_ERODE, src.type(), element);
+        erodeFilter->apply(src, dst);
+    }
+    else
+    {
+        Ptr<gpu::Filter> dilateFilter = gpu::createMorphologyFilter(MORPH_DILATE, src.type(), element);
+        dilateFilter->apply(src, dst);
+    }
+
+    Mat h_dst(dst);
+    imshow("Erode/Dilate", h_dst);
+}
+
+void App::OpenCloseCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->OpenClose();
+}
+
+void App::ErodeDilateCallback(int, void* data)
+{
+    App* thiz = (App*) data;
+    thiz->ErodeDilate();
+}
+
+int main(int argc, const char* argv[])
+{
+    App app(argc, argv);
+    return app.run();
 }
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 97eb7a82aa..4333b76257 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -746,10 +746,12 @@ TEST(erode)
 
         d_src.upload(src);
 
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        Ptr<gpu::Filter> erode = gpu::createMorphologyFilter(MORPH_ERODE, d_src.type(), ker);
+
+        erode->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        erode->apply(d_src, d_dst);
         GPU_OFF;
     }
 }
@@ -929,10 +931,12 @@ TEST(GaussianBlur)
         gpu::GpuMat d_dst(src.size(), src.type());
         gpu::GpuMat d_buf;
 
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(3, 3), 1);
+
+        gauss->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        gauss->apply(d_src, d_dst);
         GPU_OFF;
     }
 }
@@ -961,10 +965,11 @@ TEST(filter2D)
             gpu::GpuMat d_src(src);
             gpu::GpuMat d_dst;
 
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            Ptr<gpu::Filter> filter2D = gpu::createLinearFilter(d_src.type(), -1, kernel);
+            filter2D->apply(d_src, d_dst);
 
             GPU_ON;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            filter2D->apply(d_src, d_dst);
             GPU_OFF;
         }
     }