diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 0626d8938a..0cb8129b58 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -47,37 +47,33 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"
 
-using namespace cv::gpu;
-using namespace cv::gpu::device;
-
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
-
     //////////////////////////////////////////////////////////////////////////////////////
     // Compare
 
-    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
+    template <typename T> struct NotEqual : binary_function<T, T, uchar>
     {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
+        __device__ __forceinline__ uchar operator()(T src1, T src2) const
         {
             return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
         }
     };
 
-    template <typename T1, typename T2>
+    template <typename T>
     inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
+        NotEqual<T> op;
+        transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
     }
 
     void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<uint, uint>(src1, src2, dst, stream);
+        compare_ne<uint>(src1, src2, dst, stream);
     }
     void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<float, float>(src1, src2, dst, stream);
+        compare_ne<float>(src1, src2, dst, stream);
     }
 
 
@@ -354,6 +350,35 @@ namespace cv { namespace gpu { namespace mathfunc
 
     //////////////////////////////////////////////////////////////////////////
     // min/max
+
+    namespace detail
+    {
+        template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
+    {
+    };
     
     template <typename T>
     void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
@@ -413,7 +438,39 @@ namespace cv { namespace gpu { namespace mathfunc
 
     
     //////////////////////////////////////////////////////////////////////////
-    // threshold  
+    // threshold
+
+    namespace detail
+    {
+        template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
+    {
+    };
 
     template <template <typename> class Op, typename T>
     void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, 
@@ -454,8 +511,13 @@ namespace cv { namespace gpu { namespace mathfunc
     //////////////////////////////////////////////////////////////////////////
     // subtract
 
-    template <typename T>
-    void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits< minus<short> > : DefaultTransformFunctorTraits< minus<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <typename T> void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
     {
         transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
     }
@@ -499,10 +561,35 @@ namespace cv { namespace gpu { namespace mathfunc
 
         __device__ __forceinline__ float operator()(const float& e) const
         {
-            return __powf(fabs(e), power);
+            return __powf(::fabs(e), power);
         }
     };
 
+    namespace detail
+    {
+        template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+        };
+        template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 8 };
+        };
+        template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+
+    template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
+    {
+    };
+
     template<typename T>
     void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream)
     {
@@ -514,6 +601,5 @@ namespace cv { namespace gpu { namespace mathfunc
     template void pow_caller<short>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
     template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
     template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
-    template void pow_caller<uint>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
     template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 }}}
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index b5ee973038..3e93aa5d0a 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -40,14 +40,9 @@
 //
 //M*/
 
-#include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"
 
 using namespace cv::gpu;
-using namespace cv::gpu::device;
 
 #ifndef CV_PI
 #define CV_PI   3.1415926535897932384626433832795f
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 3636853b4d..f6ba358444 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -45,9 +45,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 
-using namespace cv::gpu::device;
-
-namespace cv { namespace gpu { namespace matrix_operations {
+namespace cv { namespace gpu { namespace device {
 
     template <typename T> struct shift_and_sizeof;
     template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@@ -249,7 +247,55 @@ namespace cv { namespace gpu { namespace matrix_operations {
 
         const double alpha, beta;
     };
-    
+
+    namespace detail
+    {
+        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 8 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+
+        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 2 };
+        };
+
+        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 2 };
+        };
+
+        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
+        {
+        };
+    }
+
+    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    {
+    };
+        
     template<typename T, typename D>
     void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
     {
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index b1dd03f763..b00391e2aa 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -71,23 +71,16 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
 
 #include "opencv2/gpu/stream_accessor.hpp"
 
-namespace cv 
-{
-    namespace gpu
-    {
-        namespace matrix_operations
-        {            
-            void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
+namespace cv { namespace gpu { namespace device {            
+    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
 
-            template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
-            template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
+    template <typename T>
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
 
-            void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
-        }
-    }
-}
+    void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
+}}}
 
 struct Stream::Impl
 {
@@ -108,14 +101,14 @@ namespace
     void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
     {
         Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, src.channels(), stream);
     }
 
     template <typename T>
     void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
     {
         Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
     }
 }
 
@@ -262,7 +255,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
         psrc = &(temp = src);
 
     dst.create( src.size(), rtype );
-    matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
+    device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
 }
 
 cv::gpu::Stream::operator bool() const
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 640bae71df..62d0ea861d 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -67,7 +67,6 @@ void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
 double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;}
-
 void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&)  { throw_nogpu(); }
 
 #else
@@ -180,7 +179,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
     nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
 }
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     template <typename T>
     void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
@@ -192,7 +191,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stre
     {
         CV_Assert(src1.size() == src2.size());
         dst.create(src1.size(), src1.type());
-        mathfunc::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+        device::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
     }
     else
         nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
@@ -338,7 +337,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
 //////////////////////////////////////////////////////////////////////////////
 // Comparison of two matrixes
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
     void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -375,7 +374,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            mathfunc::compare_ne_8uc4(src1, src2, dst, stream);
+            device::compare_ne_8uc4(src1, src2, dst, stream);
         }
     }
     else
@@ -393,7 +392,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            mathfunc::compare_ne_32f(src1, src2, dst, stream);
+            device::compare_ne_32f(src1, src2, dst, stream);
         }
     }
 }
@@ -402,7 +401,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
 
@@ -416,7 +415,7 @@ namespace
     {
         dst.create(src.size(), src.type());
 
-        cv::gpu::mathfunc::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), 
+        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), 
                                               dst.channels(), src, dst, stream);
     }
 
@@ -426,10 +425,10 @@ namespace
         using namespace cv::gpu;
 
         typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskNotCaller<unsigned char>, mathfunc::bitwiseMaskNotCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskNotCaller<unsigned short>, mathfunc::bitwiseMaskNotCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskNotCaller<unsigned int>, mathfunc::bitwiseMaskNotCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskNotCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>, 
+                                   device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,
+                                   device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,
+                                   device::bitwiseMaskNotCaller<unsigned int>};
 
         CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
         dst.create(src.size(), src.type());
@@ -456,7 +455,7 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
 
@@ -482,7 +481,7 @@ namespace
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
 
-        cv::gpu::mathfunc::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                              dst.channels(), src1, src2, dst, stream);
     }
 
@@ -492,10 +491,10 @@ namespace
         using namespace cv::gpu;
 
         typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskOrCaller<unsigned char>, mathfunc::bitwiseMaskOrCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskOrCaller<unsigned short>, mathfunc::bitwiseMaskOrCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskOrCaller<unsigned int>, mathfunc::bitwiseMaskOrCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskOrCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>, 
+                                   device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,
+                                   device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,
+                                   device::bitwiseMaskOrCaller<unsigned int>};
 
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
@@ -513,7 +512,7 @@ namespace
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
 
-        cv::gpu::mathfunc::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                               dst.channels(), src1, src2, dst, stream);
     }
 
@@ -523,10 +522,10 @@ namespace
         using namespace cv::gpu;
 
         typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskAndCaller<unsigned char>, mathfunc::bitwiseMaskAndCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskAndCaller<unsigned short>, mathfunc::bitwiseMaskAndCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskAndCaller<unsigned int>, mathfunc::bitwiseMaskAndCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskAndCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>, 
+                                   device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,
+                                   device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,
+                                   device::bitwiseMaskAndCaller<unsigned int>};
 
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
@@ -544,7 +543,7 @@ namespace
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
 
-        cv::gpu::mathfunc::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), 
+        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), 
                                               dst.channels(), src1, src2, dst, stream);
     }
 
@@ -554,10 +553,10 @@ namespace
         using namespace cv::gpu;
 
         typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-        static Caller callers[] = {mathfunc::bitwiseMaskXorCaller<unsigned char>, mathfunc::bitwiseMaskXorCaller<unsigned char>, 
-                                   mathfunc::bitwiseMaskXorCaller<unsigned short>, mathfunc::bitwiseMaskXorCaller<unsigned short>,
-                                   mathfunc::bitwiseMaskXorCaller<unsigned int>, mathfunc::bitwiseMaskXorCaller<unsigned int>,
-                                   mathfunc::bitwiseMaskXorCaller<unsigned int>};
+        static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>, 
+                                   device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,
+                                   device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,
+                                   device::bitwiseMaskXorCaller<unsigned int>};
 
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
@@ -601,7 +600,7 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
 //////////////////////////////////////////////////////////////////////////////
 // Minimum and maximum operations
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     template <typename T>
     void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
@@ -623,14 +622,14 @@ namespace
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+        device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
     }
 
     template <typename T>
     void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
     {
         dst.create(src1.size(), src1.type());
-        mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
+        device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
     }
     
     template <typename T>
@@ -638,14 +637,14 @@ namespace
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+        device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
     }
 
     template <typename T>
     void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
     {
         dst.create(src1.size(), src1.type());
-        mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
+        device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
     }
 }
 
@@ -709,7 +708,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // threshold
 
-namespace cv { namespace gpu { namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
     template <typename T>
     void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
@@ -718,24 +717,25 @@ namespace cv { namespace gpu { namespace mathfunc
 
 namespace
 {
-    template <typename T>
-    void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, 
-        cudaStream_t stream)
+    template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
     {
-        mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
+        device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
     }
 }
 
 double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
 {
+    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
+    CV_Assert(type <= THRESH_TOZERO_INV);
+
+    dst.create(src.size(), src.type());
+
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
     {
         NppStreamHandler h(stream);
 
-        dst.create(src.size(), src.type());
-
         NppiSize sz;
         sz.width  = src.cols;
         sz.height = src.rows;
@@ -761,12 +761,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
         };
 
-        CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
-        CV_Assert(type <= THRESH_TOZERO_INV);
-
-        dst.create(src.size(), src.type());
-
-        if (src.depth() != CV_32F)
+        if (src.depth() != CV_32F && src.depth() != CV_64F)
         {
             thresh = cvFloor(thresh);
             maxVal = cvRound(maxVal);
@@ -781,17 +776,11 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
 ////////////////////////////////////////////////////////////////////////
 // pow
 
-namespace cv
+namespace cv { namespace gpu { namespace device
 {
-    namespace gpu
-    {
-        namespace mathfunc
-        {
-            template<typename T>
-            void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
-        }
-    }
-}
+    template<typename T>
+    void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
+}}}
 
 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {    
@@ -802,9 +791,9 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 
     static const caller_t callers[] = 
     {
-        mathfunc::pow_caller<unsigned char>,  mathfunc::pow_caller<signed char>, 
-        mathfunc::pow_caller<unsigned short>, mathfunc::pow_caller<short>, 
-        mathfunc::pow_caller<int>, mathfunc::pow_caller<float>
+        device::pow_caller<unsigned char>,  device::pow_caller<signed char>, 
+        device::pow_caller<unsigned short>, device::pow_caller<short>, 
+        device::pow_caller<int>, device::pow_caller<float>
     };
 
     callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));    
diff --git a/modules/gpu/src/gpumat.cpp b/modules/gpu/src/gpumat.cpp
index ca4b147a40..ad189adac6 100644
--- a/modules/gpu/src/gpumat.cpp
+++ b/modules/gpu/src/gpumat.cpp
@@ -393,7 +393,7 @@ void cv::gpu::ensureSizeIsEnough(int, int, int, GpuMat&) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace device
 {
     void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
 
@@ -449,7 +449,7 @@ void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
     else
     {
         mat.create(size(), type());
-        cv::gpu::matrix_operations::copy_to_with_mask(*this, mat, depth(), mask, channels());
+        device::copy_to_with_mask(*this, mat, depth(), mask, channels());
     }
 }
 
@@ -508,7 +508,7 @@ namespace
 
     void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
     {
-        matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
+        device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
     }
 }
 
@@ -540,7 +540,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
     dst.create( size(), rtype );
 
     if (!noScale)
-        matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
+        device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
     else
     {
         typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
@@ -681,7 +681,7 @@ namespace
     void kernelSet(GpuMat& src, const Scalar& s)
     {
         Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, src.channels(), 0);
     }
 
     template<int SDEPTH, int SCN> struct NppSetMaskFunc
@@ -732,7 +732,7 @@ namespace
     void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
     {
         Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
     }
 }
 
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
index 22eb7ffada..96eca2ea13 100644
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
@@ -45,6 +45,7 @@
 
 #include "internal_shared.hpp"
 #include "../vec_traits.hpp"
+#include "../functional.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -68,51 +69,17 @@ namespace cv { namespace gpu { namespace device
 
         //! Read Write Traits
 
-        template <size_t src_elem_size, size_t dst_elem_size>
-        struct UnReadWriteTraits_
+        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
         {
-            enum { shift = 1 };
-        };
-        template <size_t src_elem_size>
-        struct UnReadWriteTraits_<src_elem_size, 1>
-        {
-            enum { shift = 4 };
-        };
-        template <size_t src_elem_size>
-        struct UnReadWriteTraits_<src_elem_size, 2>
-        {
-            enum { shift = 2 };
-        };
-        template <typename T, typename D> struct UnReadWriteTraits
-        {
-            enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
-            
             typedef typename TypeVec<T, shift>::vec_type read_type;
             typedef typename TypeVec<D, shift>::vec_type write_type;
         };
 
-        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
-        struct BinReadWriteTraits_
+        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
         {
-            enum { shift = 1 };
-        };
-        template <size_t src_elem_size1, size_t src_elem_size2>
-        struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
-        {
-            enum { shift = 4 };
-        };
-        template <size_t src_elem_size1, size_t src_elem_size2>
-        struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
-        {
-            enum { shift = 2 };
-        };
-        template <typename T1, typename T2, typename D> struct BinReadWriteTraits
-        {
-            enum {shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
-
             typedef typename TypeVec<T1, shift>::vec_type read_type1;
             typedef typename TypeVec<T2, shift>::vec_type read_type2;
-            typedef typename TypeVec<D , shift>::vec_type write_type;
+            typedef typename TypeVec<D, shift>::vec_type write_type;
         };
 
         //! Transform kernels
@@ -206,29 +173,73 @@ namespace cv { namespace gpu { namespace device
                     dst.w = op(src1.w, src2.w);
             }
         };
+        template <> struct OpUnroller<8>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src.a7);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src1.a0, src2.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src1.a1, src2.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src1.a2, src2.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src1.a3, src2.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src1.a4, src2.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src1.a5, src2.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src1.a6, src2.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src1.a7, src2.a7);
+            }
+        };
 
         template <typename T, typename D, typename UnOp, typename Mask>
         __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
         {
-            typedef typename UnReadWriteTraits<T, D>::read_type read_type;
-            typedef typename UnReadWriteTraits<T, D>::write_type write_type;
-            const int shift = UnReadWriteTraits<T, D>::shift;
+            typedef TransformFunctorTraits<UnOp> ft;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
 
             const int x = threadIdx.x + blockIdx.x * blockDim.x;
             const int y = threadIdx.y + blockIdx.y * blockDim.y;
-            const int x_shifted = x * shift;
+            const int x_shifted = x * ft::smart_shift;
 
             if (y < src_.rows)
             {
                 const T* src = src_.ptr(y);
                 D* dst = dst_.ptr(y);
 
-                if (x_shifted + shift - 1 < src_.cols)
+                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                 {
                     const read_type src_n_el = ((const read_type*)src)[x];
                     write_type dst_n_el;
 
-                    OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
+                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
 
                     ((write_type*)dst)[x] = dst_n_el;
                 }
@@ -259,14 +270,14 @@ namespace cv { namespace gpu { namespace device
         __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
             const Mask mask, const BinOp op)
         {
-            typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
-            typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
-            typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
-            const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+            typedef TransformFunctorTraits<BinOp> ft;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
 
             const int x = threadIdx.x + blockIdx.x * blockDim.x;
             const int y = threadIdx.y + blockIdx.y * blockDim.y;
-            const int x_shifted = x * shift;
+            const int x_shifted = x * ft::smart_shift;
 
             if (y < src1_.rows)
             {
@@ -274,13 +285,13 @@ namespace cv { namespace gpu { namespace device
                 const T2* src2 = src2_.ptr(y);
                 D* dst = dst_.ptr(y);
 
-                if (x_shifted + shift - 1 < src1_.cols)
+                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                 {
                     const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                     const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                     write_type dst_n_el;
                     
-                    OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
+                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
 
                     ((write_type*)dst)[x] = dst_n_el;
                 }
@@ -308,7 +319,7 @@ namespace cv { namespace gpu { namespace device
                 const T2 src2_data = src2.ptr(y)[x];
                 dst.ptr(y)[x] = op(src1_data, src2_data);
             }
-        }        
+        }
 
         template <bool UseSmart> struct TransformDispatcher;
         template<> struct TransformDispatcher<false>
@@ -316,11 +327,10 @@ namespace cv { namespace gpu { namespace device
             template <typename T, typename D, typename UnOp, typename Mask>
             static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
             {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                typedef TransformFunctorTraits<UnOp> ft;
 
-                grid.x = divUp(src.cols, threads.x);
-                grid.y = divUp(src.rows, threads.y);        
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);     
 
                 transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                 cudaSafeCall( cudaGetLastError() );
@@ -332,11 +342,10 @@ namespace cv { namespace gpu { namespace device
             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
             static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
             {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                typedef TransformFunctorTraits<BinOp> ft;
 
-                grid.x = divUp(src1.cols, threads.x);
-                grid.y = divUp(src1.rows, threads.y);        
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);     
 
                 transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                 cudaSafeCall( cudaGetLastError() );
@@ -350,13 +359,12 @@ namespace cv { namespace gpu { namespace device
             template <typename T, typename D, typename UnOp, typename Mask>
             static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
             {
-                const int shift = UnReadWriteTraits<T, D>::shift;
+                typedef TransformFunctorTraits<UnOp> ft;
 
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);            
+                StaticAssert<ft::smart_shift != 1>::check();
 
-                grid.x = divUp(src.cols, threads.x * shift);
-                grid.y = divUp(src.rows, threads.y);        
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);      
 
                 transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                 cudaSafeCall( cudaGetLastError() );
@@ -368,13 +376,12 @@ namespace cv { namespace gpu { namespace device
             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
             static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
             {
-                const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+                typedef TransformFunctorTraits<BinOp> ft;
 
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
+                StaticAssert<ft::smart_shift != 1>::check();
 
-                grid.x = divUp(src1.cols, threads.x * shift);
-                grid.y = divUp(src1.rows, threads.y);        
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);    
 
                 transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                 cudaSafeCall( cudaGetLastError() );
@@ -382,44 +389,20 @@ namespace cv { namespace gpu { namespace device
                 if (stream == 0)
                     cudaSafeCall( cudaDeviceSynchronize() );            
             }
-        };
-
-        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
-        {
-            static const bool value = false;
-        };
-        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
-        {
-            static const bool value = UnReadWriteTraits<T, D>::shift != 1;
-        };
-        template <typename T, typename D> struct UseSmartUn
-        {
-            static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
-        };
-
-        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
-        {
-            static const bool value = false;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
-        {
-            static const bool value = BinReadWriteTraits<T1, T2, D>::shift != 1;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin
-        {
-            static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
-        };
+        };        
 
         template <typename T, typename D, typename UnOp, typename Mask>
         static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
         {
-            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
+            typedef TransformFunctorTraits<UnOp> ft;
+            TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
         }
 
         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
         static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
         {
-            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
+            typedef TransformFunctorTraits<BinOp> ft;
+            TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
         }
     }
 }}}
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp
index b3abdf64a1..be3ea7d278 100644
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -46,18 +46,25 @@
 #include <thrust/functional.h>
 #include "internal_shared.hpp"
 #include "saturate_cast.hpp"
+#include "vec_traits.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
+    // Function Objects
+
     using thrust::unary_function;
     using thrust::binary_function;
 
+    // Arithmetic Operations
+
     using thrust::plus;
     using thrust::minus;
     using thrust::multiplies;
     using thrust::divides;
     using thrust::modulus;
     using thrust::negate;
+
+    // Comparison Operations
     
     using thrust::equal_to;
     using thrust::not_equal_to;
@@ -65,11 +72,15 @@ namespace cv { namespace gpu { namespace device
     using thrust::less;
     using thrust::greater_equal;
     using thrust::less_equal;
+
+    // Logical Operations
     
     using thrust::logical_and;
     using thrust::logical_or;
     using thrust::logical_not;
 
+    // Bitwise Operations
+
     using thrust::bit_and;
     using thrust::bit_or;
     using thrust::bit_xor;
@@ -78,7 +89,13 @@ namespace cv { namespace gpu { namespace device
         __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
     };
 
-    using thrust::identity;
+    // Generalized Identity Operations
+
+    using thrust::identity;    
+    using thrust::project1st;
+    using thrust::project2nd;
+
+    // Min/Max Operations
 
 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
     template <> struct name<type> : binary_function<type, type, type> \
@@ -115,15 +132,8 @@ namespace cv { namespace gpu { namespace device
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)
 
 #undef OPENCV_GPU_IMPLEMENT_MINMAX
-    
-    using thrust::project1st;
-    using thrust::project2nd;
 
-    using thrust::unary_negate;
-    using thrust::not1;
-
-    using thrust::binary_negate;
-    using thrust::not2;
+    // Math functions
 
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
     template <typename T> struct func ## _func : unary_function<T, float> \
@@ -192,6 +202,8 @@ namespace cv { namespace gpu { namespace device
         }
     };
 
+    // Saturate Cast Functor
+
     template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
     {
         __forceinline__ __device__ D operator ()(const T& v) const
@@ -200,6 +212,8 @@ namespace cv { namespace gpu { namespace device
         }
     };
 
+    // Threshold Functors
+
     template <typename T> struct thresh_binary_func : unary_function<T, T>
     {
         __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -256,7 +270,15 @@ namespace cv { namespace gpu { namespace device
         }
 
         const T thresh;
-    };
+    };    
+
+    // Function Object Adaptors
+
+    using thrust::unary_negate;
+    using thrust::not1;
+
+    using thrust::binary_negate;
+    using thrust::not2;
 
     template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> 
     {
@@ -291,46 +313,77 @@ namespace cv { namespace gpu { namespace device
         return binder2nd<Op>(op, typename Op::second_argument_type(x));
     }
 
-    template <typename T1, typename T2> struct BinOpTraits
+    // Functor Traits
+
+    template <typename F> struct IsUnaryFunction
     {
-        typedef int argument_type;
+        struct Yes {};
+        struct No {Yes a[2];};
+
+        template <typename T, typename D> static Yes check(unary_function<T, D>*);
+        static No check(...);
+
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
     };
-    template <typename T> struct BinOpTraits<T, T>
+
+    template <typename F> struct IsBinaryFunction
     {
-        typedef T argument_type;
+        struct Yes {};
+        struct No {Yes a[2];};
+
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
+        static No check(...);
+
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
     };
-    template <typename T> struct BinOpTraits<T, double>
+
+    namespace detail
     {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<double, T>
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
+
+        template <typename T, typename D> struct DefaultUnaryShift
+        {
+            enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
+        };
+        
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
+
+        template <typename T1, typename T2, typename D> struct DefaultBinaryShift
+        {
+            enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+        };
+
+        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
+        template <typename Func> struct ShiftDispatcher<Func, true>
+        {
+            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
+        };
+        template <typename Func> struct ShiftDispatcher<Func, false>
+        {
+            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
+        };
+    }
+
+    template <typename Func> struct DefaultTransformShift
     {
-        typedef double argument_type;
+        enum { shift = detail::ShiftDispatcher<Func>::shift };
     };
-    template <> struct BinOpTraits<double, double>
+
+    template <typename Func> struct DefaultTransformFunctorTraits
     {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, float>
-    {
-        typedef float argument_type;
-    };
-    template <typename T> struct BinOpTraits<float, T>
-    {
-        typedef float argument_type;
-    };
-    template <> struct BinOpTraits<float, float>
-    {
-        typedef float argument_type;
-    };
-    template <> struct BinOpTraits<double, float>
-    {
-        typedef double argument_type;
-    };
-    template <> struct BinOpTraits<float, double>
-    {
-        typedef double argument_type;
+        enum { simple_block_dim_x = 16 };
+        enum { simple_block_dim_y = 16 };
+
+        enum { smart_block_dim_x = 16 };
+        enum { smart_block_dim_y = 16 };
+        enum { smart_shift = DefaultTransformShift<Func>::shift };
     };
+
+    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
 }}}
 
 #endif // __OPENCV_GPU_FUNCTIONAL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
index 238f5c771e..5c0051c29a 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -150,6 +150,50 @@ namespace cv {  namespace gpu { namespace device
         return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
     }
 
+    namespace detail
+    {    
+        template <typename T1, typename T2> struct BinOpTraits
+        {
+            typedef int argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, T>
+        {
+            typedef T argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, double>
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<double, T>
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<double, double>
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, float>
+        {
+            typedef float argument_type;
+        };
+        template <typename T> struct BinOpTraits<float, T>
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<float, float>
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<double, float>
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<float, double>
+        {
+            typedef double argument_type;
+        };
+    }
+
 #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
     static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
     { \
@@ -157,16 +201,16 @@ namespace cv {  namespace gpu { namespace device
         return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
     } \
     static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
     { \
@@ -174,16 +218,16 @@ namespace cv {  namespace gpu { namespace device
         return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
     } \
     static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
     { \
@@ -191,16 +235,16 @@ namespace cv {  namespace gpu { namespace device
         return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
     } \
     static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
     { \
@@ -208,16 +252,16 @@ namespace cv {  namespace gpu { namespace device
         return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
     { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
     } \
     template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
     { \
-        func<typename BinOpTraits<T, type>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+        func<typename detail::BinOpTraits<T, type>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
     }
 
 #define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
index 62d3710620..c1e2d81a5b 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -49,6 +49,79 @@ namespace cv { namespace gpu { namespace device
 {
     template<typename T, int N> struct TypeVec;
 
+    struct __align__(8) uchar8
+    {
+        uchar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
+    {
+        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(8) char8
+    {
+        schar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
+    {
+        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) ushort8
+    {
+        ushort a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
+    {
+        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) short8
+    {
+        short a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
+    {
+        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) uint8
+    {
+        uint a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
+    {
+        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) int8
+    {
+        int a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
+    {
+        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) float8
+    {
+        float a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
+    {
+        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct double8
+    {
+        double a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
+    {
+        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+
 #define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
     template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
     template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
@@ -57,7 +130,9 @@ namespace cv { namespace gpu { namespace device
     template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
     template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
     template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
-    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
+    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
 
     OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
     OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
@@ -74,11 +149,13 @@ namespace cv { namespace gpu { namespace device
     template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
     template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
     template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
 
     template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
     template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
     template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
     template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
 
     template<typename T> struct VecTraits;
 
@@ -87,36 +164,43 @@ namespace cv { namespace gpu { namespace device
     { \
         typedef type elem_type; \
         enum {cn=1}; \
-        static __device__ __host__ type all(type v) {return v;} \
-        static __device__ __host__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
+        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
     }; \
     template<> struct VecTraits<type ## 1> \
     { \
         typedef type elem_type; \
         enum {cn=1}; \
-        static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
-        static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
     }; \
     template<> struct VecTraits<type ## 2> \
     { \
         typedef type elem_type; \
         enum {cn=2}; \
-        static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
-        static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
     }; \
     template<> struct VecTraits<type ## 3> \
     { \
         typedef type elem_type; \
         enum {cn=3}; \
-        static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
-        static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
     }; \
     template<> struct VecTraits<type ## 4> \
     { \
         typedef type elem_type; \
         enum {cn=4}; \
-        static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
-        static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+    }; \
+    template<> struct VecTraits<type ## 8> \
+    { \
+        typedef type elem_type; \
+        enum {cn=8}; \
+        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
     };
 
     OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
@@ -134,8 +218,8 @@ namespace cv { namespace gpu { namespace device
     { 
         typedef schar elem_type; 
         enum {cn=1}; 
-        static __device__ __host__ schar all(schar v) {return v;}
-        static __device__ __host__ schar make(schar x) {return x;}
+        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
+        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
     };
 }}}
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index ebc2dad2ad..21e9766ff7 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
 {
     // Init CPU matcher
 
-    int desc_len = 64;
+    int desc_len = 128;
 
     BruteForceMatcher< L2<float> > matcher;
 
@@ -328,7 +328,7 @@ TEST(BruteForceMatcher)
     d_matcher.knnMatch(d_query, d_train, d_matches, knn);
     GPU_OFF;
 
-    /*SUBTEST << "radiusMatch";
+    SUBTEST << "radiusMatch";
     float max_distance = 3.8f;
 
     CPU_ON;
@@ -337,7 +337,7 @@ TEST(BruteForceMatcher)
 
     GPU_ON;
     d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
-    GPU_OFF;*/
+    GPU_OFF;
 }
 
 
@@ -689,26 +689,7 @@ TEST(threshold)
     Mat src, dst;
     gpu::GpuMat d_src, d_dst;
 
-    for (int size = 2000; size <= 4000; size += 1000)
-    {
-        SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
-
-        gen(src, size, size, CV_8U, 0, 100);
-        dst.create(size, size, CV_8U);
-
-        CPU_ON; 
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
-        CPU_OFF;
-
-        d_src = src;
-        d_dst.create(size, size, CV_8U);
-
-        GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        GPU_OFF;
-    }
-
-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
     {
         SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
 
@@ -727,22 +708,47 @@ TEST(threshold)
         GPU_OFF;
     }
 
-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
     {
-        SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
+        SUBTEST << "size " << size << ", 32F, THRESH_BINARY";
 
         gen(src, size, size, CV_32F, 0, 100);
         dst.create(size, size, CV_32F);
 
         CPU_ON; 
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
         CPU_OFF;
 
         d_src = src;
         d_dst.create(size, size, CV_32F);
 
         GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        GPU_OFF;
+    }
+}
+
+TEST(pow)
+{
+    Mat src, dst;
+    gpu::GpuMat d_src, d_dst;
+
+    for (int size = 1000; size <= 4000; size += 1000)
+    {
+        SUBTEST << "size " << size << ", 32F";
+
+        gen(src, size, size, CV_32F, 0, 100);
+        dst.create(size, size, CV_32F);
+
+        CPU_ON;
+        pow(src, -2.0, dst);
+        CPU_OFF;
+
+        d_src = src;
+        d_dst.create(size, size, CV_32F);
+
+        GPU_ON;
+        gpu::pow(d_src, -2.0, d_dst);
         GPU_OFF;
     }
 }