diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index a7b39dd1c9..3be2e052b2 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -2034,7 +2034,6 @@ namespace cv { namespace gpu { namespace device
 
 void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
 {
-#ifdef _WIN32
     using namespace cv::gpu::device::pyramid;
 
     typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
@@ -2081,14 +2080,10 @@ void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stre
 
         szLastLayer = szCurLayer;
     }
-#else
-    throw_nogpu();
-#endif
 }
 
 void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
 {
-#ifdef _WIN32
     using namespace cv::gpu::device::pyramid;
 
     typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
@@ -2145,9 +2140,6 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream
     CV_Assert(func != 0);
 
     func(lastLayer, outImg, StreamAccessor::getStream(stream));
-#else
-    throw_nogpu();
-#endif
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/nvidia/core/NCVPyramid.cu b/modules/gpu/src/nvidia/core/NCVPyramid.cu
index a75f3b0024..c9f5971338 100644
--- a/modules/gpu/src/nvidia/core/NCVPyramid.cu
+++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu
@@ -48,8 +48,6 @@
 #include "NCVPixelOperations.hpp"
 #include "opencv2/gpu/device/common.hpp"
 
-#ifdef _WIN32
-
 template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};
 
 template<typename T> struct __average4_CN<T, 1> {
@@ -179,37 +177,6 @@ template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(cons
 }
 
 
-template<typename T>
-static T _interpLinear(const T &a, const T &b, Ncv32f d)
-{
-    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
-    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
-    return _pixDemoteClampZ<TVFlt, T>(tmp);
-}
-
-
-template<typename T>
-static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
-{
-    Ncv32u xl = (Ncv32u)x;
-    Ncv32u xh = xl+1;
-    Ncv32f dx = x - xl;
-    Ncv32u yl = (Ncv32u)y;
-    Ncv32u yh = yl+1;
-    Ncv32f dy = y - yl;
-    T p00, p01, p10, p11;
-    p00 = refLayer.at(xl, yl);
-    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
-    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
-    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
-    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
-    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
-    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
-    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
-    return _pixDemoteClampZ<TVFlt, T>(mixture);
-}
-
-
 template<typename T>
 __global__ void kernelDownsampleX2(T *d_src,
                                    Ncv32u srcPitch,
@@ -342,6 +309,38 @@ namespace cv { namespace gpu { namespace device
 }}}
 
 
+#ifdef _WIN32
+
+template<typename T> // Interpolates between pixels a and b with weight d; result is returned as type T.
+static T _interpLinear(const T &a, const T &b, Ncv32f d)
+{
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt; // Ncv32f-based vector type; NC(T) is presumably T's channel count (TODO confirm)
+    TVFlt tmp = _lerp<T, TVFlt>(a, b, d); // the interpolation itself is delegated to _lerp and kept in TVFlt
+    return _pixDemoteClampZ<TVFlt, T>(tmp); // convert TVFlt back to T; clamping presumably happens in the helper (TODO confirm)
+}
+
+
+template<typename T> // Samples refLayer at fractional position (x, y) by blending the surrounding 2x2 pixels.
+static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
+{
+    Ncv32u xl = (Ncv32u)x; // integer part of x (cast truncates): lower x index of the 2x2 patch
+    Ncv32u xh = xl+1; // upper x index; may equal width(), handled below
+    Ncv32f dx = x - xl; // fractional part of x, used as the horizontal weight
+    Ncv32u yl = (Ncv32u)y; // integer part of y: lower y index of the patch
+    Ncv32u yh = yl+1; // upper y index; may equal height(), handled below
+    Ncv32f dy = y - yl; // fractional part of y, used as the vertical weight
+    T p00, p01, p10, p11; // p<row><col> of the patch: p00=(xl,yl) p01=(xh,yl) p10=(xl,yh) p11=(xh,yh)
+    p00 = refLayer.at(xl, yl); // no range check on (xl, yl) in this function; caller must pass in-range x, y
+    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00; // past the last column: reuse p00
+    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00; // past the last row: reuse p00
+    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00; // NOTE(review): uses p00 even if only one of xh/yh is out of range (not p01/p10) - confirm intended
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt; // Ncv32f-based vector type used for the intermediate blends
+    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx); // blend along x on row yl
+    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx); // blend along x on row yh
+    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy); // blend the two row results along y
+    return _pixDemoteClampZ<TVFlt, T>(mixture); // convert TVFlt back to T; clamping presumably happens in the helper (TODO confirm)
+}
+
 template <class T>
 NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
                                     Ncv8u numLayers,
diff --git a/modules/gpu/src/orb.cpp b/modules/gpu/src/orb.cpp
index 0a02c3b3b0..d038efabc5 100644
--- a/modules/gpu/src/orb.cpp
+++ b/modules/gpu/src/orb.cpp
@@ -577,7 +577,8 @@ void cv::gpu::ORB_GPU::computeKeyPointsPyramid()
 
         ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
 
-        keyPointsCount_[level] = fastDetector_.getKeyPoints(keyPointsPyr_[level].rowRange(0, 2));
+        GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
+        keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
 
         int n_features = n_features_per_level_[level];
         
@@ -664,7 +665,8 @@ void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat& keypoints)
 
         mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
 
-        keyPointsPyr_[level].rowRange(1, 3).copyTo(keyPointsRange.rowRange(2, 4));
+        GpuMat range = keyPointsRange.rowRange(2, 4);
+        keyPointsPyr_[level].rowRange(1, 3).copyTo(range);
         
         keyPointsRange.row(4).setTo(Scalar::all(level));
         keyPointsRange.row(5).setTo(Scalar::all(params_.patch_size_ * sf));