From 0e45a637c4f3d9fdf2d3f27eeb217681b762867a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 20 Dec 2010 12:49:40 +0000
Subject: [PATCH] added SURF_GPU.

added support of CV_32FC1 type to gpu copyMakeBorder.
---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  96 ++-
 modules/gpu/src/brute_force_matcher.cpp |   4 +-
 modules/gpu/src/cuda/surf.cu            | 966 ++++++++++++++++++++++++
 modules/gpu/src/cuda/surf_key_point.h   |  76 ++
 modules/gpu/src/imgproc_gpu.cpp         |  24 +-
 modules/gpu/src/precomp.hpp             |   1 +
 modules/gpu/src/surf.cpp                | 356 +++++++++
 7 files changed, 1511 insertions(+), 12 deletions(-)
 create mode 100644 modules/gpu/src/cuda/surf.cu
 create mode 100644 modules/gpu/src/cuda/surf_key_point.h
 create mode 100644 modules/gpu/src/surf.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index a8c3a146ed..3e0c0de2c6 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -591,7 +591,7 @@ namespace cv
         CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR);

         //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-        //! supports CV_8UC1, CV_8UC4, CV_32SC1 types
+        //! supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types
         CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar());

         //! computes the integral image and integral for the squared image
@@ -1366,6 +1366,100 @@ namespace cv
             Ptr<CvHaarClassifierCascade> oldCascade;
         };

+        ////////////////////////////////// SURF //////////////////////////////////////////
+
+        struct CV_EXPORTS SURFParams_GPU
+        {
+            SURFParams_GPU() :
+                threshold(0.1f),
+                nOctaves(4),
+                nIntervals(4),
+                initialScale(2.f),
+
+                l1(3.f/1.5f),
+                l2(5.f/1.5f),
+                l3(3.f/1.5f),
+                l4(1.f/1.5f),
+                edgeScale(0.81f),
+                initialStep(1),
+
+                extended(true),
+
+                featuresRatio(0.01f)
+            {
+            }
+
+            //! The interest operator threshold
+            float threshold;
+            //! The number of octaves to process
+            int nOctaves;
+            //! The number of intervals in each octave
+            int nIntervals;
+            //! The scale associated with the first interval of the first octave
+            float initialScale;
+
+            //! mask parameter l_1
+            float l1;
+            //! mask parameter l_2
+            float l2;
+            //! mask parameter l_3
+            float l3;
+            //! mask parameter l_4
+            float l4;
+            //! The amount to scale the edge rejection mask
+            float edgeScale;
+            //! The initial sampling step in pixels.
+            int initialStep;
+
+            //! true, if 128-element extended descriptors are computed; false for 64-element descriptors
+            bool extended;
+
+            //! max features = featuresRatio * img.size().area()
+            float featuresRatio;
+        };
+
+        class CV_EXPORTS SURF_GPU : public SURFParams_GPU
+        {
+        public:
+            //! returns the descriptor size in floats (64 or 128)
+            int descriptorSize() const;
+
+            //! upload host keypoints to device memory
+            static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
+            //! download keypoints from device to host memory
+            static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
+
+            //! download descriptors from device to host memory
+            static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
+
+            //! finds the keypoints using the fast hessian detector used in SURF
+            //! supports CV_8UC1 (0..255) and CV_32FC1 (0..1) images
+            //! keypoints will have 1 row and type CV_32FC(6)
+            //! the i-th column contains the i-th keypoint in the format: (x, y, size, response, angle, octave)
+            void operator()(const GpuMat& img, GpuMat& keypoints);
+            //! finds the keypoints and computes their descriptors.
+            //! Optionally it can compute descriptors for user-provided keypoints and recompute the keypoints' orientation
+            void operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints);
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            GpuMat img_float;
+            GpuMat img_float_tr;
+
+            GpuMat sum;
+
+            GpuMat hessianBuffer;
+            GpuMat maxPosBuffer;
+            GpuMat featuresBuffer;
+        };
+
     }

     //! Speckle filtering - filters small connected components on diparity image.
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 626dfa0577..3469e5be44 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -493,9 +493,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
         }
     };

-    int major, minor;
-    getComputeCapability(getDevice(), major, minor);
-    CV_Assert(100 * major + 10 * minor >= 110); // works onle on device with CC >= 1.1
+    CV_Assert(hasAtomicsSupport(getDevice()));

     const int nQuery = queryDescs.rows;
     const int nTrain = trainDescs.rows;
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
new file mode 100644
index 0000000000..29b86ee2bb
--- /dev/null
+++ b/modules/gpu/src/cuda/surf.cu
@@ -0,0 +1,966 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
+//
+// The original code was written by Paul Furgale and Chi Hay Tong
+// and later optimized and prepared for integration into OpenCV by Itseez.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "surf_key_point.h"
+#include "opencv2/gpu/device/limits_gpu.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+#define CV_PI 3.1415926535897932384626433832795f
+
+namespace cv { namespace gpu { namespace surf
+{
+    ////////////////////////////////////////////////////////////////////////
+    // Help funcs
+
+    // Wrapper for host reference to pass into kernel
+    template <typename T>
+    class DeviceReference
+    {
+    public:
+        explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
+        {
+            cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
+            cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
+        }
+
+        ~DeviceReference()
+        {
+            cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaFree(d_ptr) );
+        }
+
+        // Casting to device pointer
+        operator T*() {return d_ptr;}
+        operator const T*() const {return d_ptr;}
+    private:
+        T* d_ptr;
+        T* h_ptr;
+    };
+
+    __device__ void clearLastBit(int* f)
+    {
+        *f &= ~0x1;
+    }
+    __device__ void clearLastBit(float& f)
+    {
+        clearLastBit((int*)&f);
+    }
+
+    __device__ void setLastBit(int* f)
+    {
+        *f |= 0x1;
+    }
+    __device__ void setLastBit(float& f)
+    {
+        setLastBit((int*)&f);
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Global parameters
+
+    // The maximum number of features (before subpixel interpolation) that memory is reserved for.
+    __constant__ int c_max_candidates;
+    // The maximum number of features that memory is reserved for.
+    __constant__ int c_max_features;
+    // The number of intervals in the octave.
+    __constant__ int c_nIntervals;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_mask_width;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_mask_height;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_center_offset;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_half_width;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_scale;
+    // The scale associated with the first interval of the first octave
+    __constant__ float c_initialScale;
+    //! The interest operator threshold
+    __constant__ float c_threshold;
+
+    // The octave
+    __constant__ int c_octave;
+    // The width of the octave buffer.
+    __constant__ int c_x_size;
+    // The height of the octave buffer.
+    __constant__ int c_y_size;
+    // The size of the octave border in pixels.
+    __constant__ int c_border;
+    // The step size used in this octave in pixels.
+    __constant__ int c_step;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Integral image texture
+
+    texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp);
+
+    __device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)
+    {
+        float result = 0.f;
+
+        result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);
+        result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);
+        result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);
+        result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);
+
+        return result;
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Hessian
+
+    __device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)
+    {
+        float Dyy = 0.f;
+
+        Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);
+        Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);
+
+        Dyy *= 1.0f / (fscale * fscale);
+
+        return Dyy;
+    }
+
+    __device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
+    {
+        float Dxx = 0.f;
+
+        Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
+        Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale  , mask_width);
+
+        Dxx *= 1.0f / (fscale * fscale);
+
+        return Dxx;
+    }
+
+    __device__ float evalDxy(float x, float y, float fscale)
+    {
+        float center_offset = c_dxy_center_offset * fscale;
+        float half_width = c_dxy_half_width * fscale;
+
+        float Dxy = 0.f;
+
+        Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
+        Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
+        Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
+        Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
+
+        Dxy *= 1.0f / (fscale * fscale);
+
+        return Dxy;
+    }
+
+    __device__ float calcScale(int hidx_z)
+    {
+        float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);
+        return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;
+    }
+
+    __global__ void fasthessian(PtrStepf hessianBuffer)
+    {
+        // Determine the indices in the Hessian buffer
+        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
+        int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
+        int hidx_z = threadIdx.z;
+
+        float fscale = calcScale(hidx_z);
+
+        // Compute the lookup location of the mask center
+        float x = hidx_x * c_step + c_border;
+        float y = hidx_y * c_step + c_border;
+
+        // Scale the mask dimensions according to the scale
+        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
+        {
+            float mask_width = c_mask_width * fscale;
+            float mask_height = c_mask_height * fscale;
+
+            // Compute the filter responses
+            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
+            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
+            float Dxy = evalDxy(x, y, fscale);
+
+            // Combine the responses and store the Laplacian sign
+            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
+
+            if (Dxx + Dyy > 0.f)
+                setLastBit(result);
+            else
+                clearLastBit(result);
+
+            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
+        }
+    }
+
+    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
+    {
+        dim3 threads;
+        threads.x = 16;
+        threads.y = 8;
+        threads.z = nIntervals;
+
+        dim3 grid;
+        grid.x = divUp(x_size, threads.x);
+        grid.y = divUp(y_size, threads.y);
+        grid.z = 1;
+
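+        // One thread per (x, y, interval) sample of this octave; each thread
+        // writes a single Hessian response into the octave buffer.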
+        fasthessian<<<grid, threads>>>(hessianBuffer);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // NONMAX
+
+    __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
+    {
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+
+        extern __shared__ float fh_vals[];
+
+        // The hidx variables are the indices to the hessian buffer.
+        int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);
+        int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);
+        int hidx_z = threadIdx.z;
+        int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+        // Is this thread within the hessian buffer?
+        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
+        {
+            fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
+        }
+        __syncthreads();
+
+        // Is this location one of the ones being processed for nonmax suppression.
+        // Blocks overlap by one so we don't process the border threads.
+        bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1
+                      && threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
+                      && threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
+
+        float val = fh_vals[localLin];
+
+        if (inBounds2 && val >= c_threshold)
+        {
+            // Check to see if we have a max (in its 26 neighbours)
+            int zoff = blockDim.x * blockDim.y;
+            bool condmax = val > fh_vals[localLin + 1]
+                        && val > fh_vals[localLin - 1]
+                        && val > fh_vals[localLin - blockDim.x + 1]
+                        && val > fh_vals[localLin - blockDim.x    ]
+                        && val > fh_vals[localLin - blockDim.x - 1]
+                        && val > fh_vals[localLin + blockDim.x + 1]
+                        && val > fh_vals[localLin + blockDim.x    ]
+                        && val > fh_vals[localLin + blockDim.x - 1]
+
+                        && val > fh_vals[localLin - zoff + 1]
+                        && val > fh_vals[localLin - zoff    ]
+                        && val > fh_vals[localLin - zoff - 1]
+                        && val > fh_vals[localLin - zoff - blockDim.x + 1]
+                        && val > fh_vals[localLin - zoff - blockDim.x    ]
+                        && val > fh_vals[localLin - zoff - blockDim.x - 1]
+                        && val > fh_vals[localLin - zoff + blockDim.x + 1]
+                        && val > fh_vals[localLin - zoff + blockDim.x    ]
+                        && val > fh_vals[localLin - zoff + blockDim.x - 1]
+
+                        && val > fh_vals[localLin + zoff + 1]
+                        && val > fh_vals[localLin + zoff    ]
+                        && val > fh_vals[localLin + zoff - 1]
+                        && val > fh_vals[localLin + zoff - blockDim.x + 1]
+                        && val > fh_vals[localLin + zoff - blockDim.x    ]
+                        && val > fh_vals[localLin + zoff - blockDim.x - 1]
+                        && val > fh_vals[localLin + zoff + blockDim.x + 1]
+                        && val > fh_vals[localLin + zoff + blockDim.x    ]
+                        && val > fh_vals[localLin + zoff + blockDim.x - 1]
+                        ;
+
+            if(condmax)
+            {
+                unsigned int i = atomicInc(maxCounter,(unsigned int) -1);
+
+                if (i < c_max_candidates)
+                {
+                    int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};
+                    maxPosBuffer[i] = f;
+                }
+            }
+        }
+
+        #endif
+    }
+
+    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
+                        int nIntervals, int x_size, int y_size)
+    {
+        dim3 threads;
+        threads.x = 16;
+        threads.y = 8;
+        threads.z = nIntervals;
+
+        dim3 grid;
+        grid.x = divUp(x_size, threads.x - 2);
+        grid.y = divUp(y_size, threads.y - 2);
+        grid.z = 1;
+
+        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
+
+        DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
+
+        nonmaxonly<<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // INTERPOLATION
+
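+    // Each detected maximum is refined with one 3x3x3 thread block: fit a
+    // quadratic to its (x, y, scale) neighbourhood, solve H * x = -dD for the
+    // subpixel offset, and keep the feature only if the offset stays within one cell.
+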
#define MID_IDX 1 + __global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, + KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + + int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x; + int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y; + int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z; + + __shared__ float fh_vals[3][3][3]; + __shared__ KeyPoint_GPU p; + + fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x]; + __syncthreads(); + + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + __shared__ float H[3][3]; + + //dxx + H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]; + + //dyy + H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]; + + //dss + H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]; + + //dxy + H[0][1]= 0.25f* + (fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] + + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]); + + //dxs + H[0][2]= 0.25f* + (fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] - + fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] - + fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] + + fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]); + + //dys + H[1][2]= 0.25f* + (fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] - + fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] - + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] + + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]); + + //dyx = dxy + H[1][0] = H[0][1]; + + //dsx = dxs + H[2][0] = H[0][2]; + + //dsy = dys + H[2][1] = H[1][2]; + + __shared__ float dD[3]; + + //dx + dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] - + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]); + //dy + dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]); + //ds + dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] - + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]); + + __shared__ float invdet; + invdet = 1.f / + ( + H[0][0]*H[1][1]*H[2][2] + + H[0][1]*H[1][2]*H[2][0] + + H[0][2]*H[1][0]*H[2][1] + - H[0][0]*H[1][2]*H[2][1] + - H[0][1]*H[1][0]*H[2][2] + - H[0][2]*H[1][1]*H[2][0] + ); + + // // 1-based entries of a 3x3 inverse + // /* [ |a22 a23| |a12 a13| |a12 a13|] */ + // /* [ |a32 a33| -|a32 a33| |a22 a23|] */ + // /* [ ] */ + // /* [ |a21 a23| |a11 a13| |a11 a13|] */ + // /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */ + // /* [ ] */ + // /* [ |a21 a22| |a11 a12| |a11 a12|] */ + // /* [ |a31 a32| -|a31 a32| |a21 a22|] */ + + __shared__ float Hinv[3][3]; + Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]); + Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]); + Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]); + + Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]); + Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]); + Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]); + + Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]); + Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]); + Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]); + + __shared__ float x[3]; + + x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2])); + x[1] = 
-(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
+            x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
+
+            if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f)
+            {
+                // if the step is within the interpolation region, perform it
+
+                // Get a new feature index.
+                unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
+
+                if (i < c_max_features)
+                {
+                    p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;
+                    p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
+
+                    if (x[2] > 0)
+                    {
+                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
+                        float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
+
+                        p.size = (1.f - x[2]) * a + x[2] * b;
+                    }
+                    else
+                    {
+                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
+                        float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
+
+                        p.size = (1.f + x[2]) * a - x[2] * b;
+                    }
+
+                    p.octave = c_octave;
+
+                    p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];
+
+                    // Should we split up this transfer over many threads?
+                    featuresBuffer[i] = p;
+                }
+            } // If the subpixel interpolation worked
+        } // If this is thread 0.
+
+        #endif
+    }
+    #undef MID_IDX
+
+    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
+                                KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
+    {
+        dim3 threads;
+        threads.x = 3;
+        threads.y = 3;
+        threads.z = 3;
+
+        dim3 grid;
+        grid.x = maxCounter;
+        grid.y = 1;
+        grid.z = 1;
+
+        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
+
+        fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Orientation
+
+    // precomputed values for a Gaussian with a standard deviation of 2
+    __constant__ float c_gauss1D[13] =
+    {
+        0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f,
+        0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f,
+        0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f,
+        0.002215924206f
+    };
+
+    __global__ void find_orientation(KeyPoint_GPU* features)
+    {
+        int tid = threadIdx.y * 17 + threadIdx.x;
+        int tid2 = numeric_limits_gpu<int>::max();
+
+        if (threadIdx.x < 13 && threadIdx.y < 13)
+        {
+            tid2 = threadIdx.y * 13 + threadIdx.x;
+        }
+
+        __shared__ float texLookups[17][17];
+
+        __shared__ float Edx[13*13];
+        __shared__ float Edy[13*13];
+        __shared__ float xys[3];
+
+        // Read my x, y, size.
+        if (tid < 3)
+        {
+            xys[tid] = ((float*)(&features[blockIdx.x]))[tid];
+        }
+        __syncthreads();
+
+        // Read all texture locations into memory
+        // Maybe I should use __mul24 here?
+        texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE],
+                                                            xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
+
+        __syncthreads();
+
+        float dx = 0.f;
+        float dy = 0.f;
+
+        // Computes lookups for all points in a 13x13 lattice.
+ // - SURF says to only use a circle, but the branching logic would slow it down + // - Gaussian weighting should reduce the effects of the outer points anyway + if (tid2 < 169) + { + dx -= texLookups[threadIdx.x ][threadIdx.y ]; + dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; + dx -= texLookups[threadIdx.x + 4][threadIdx.y ]; + dx += texLookups[threadIdx.x ][threadIdx.y + 4]; + dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4]; + dx += texLookups[threadIdx.x + 4][threadIdx.y + 4]; + + dy -= texLookups[threadIdx.x ][threadIdx.y ]; + dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2]; + dy -= texLookups[threadIdx.x ][threadIdx.y + 4]; + dy += texLookups[threadIdx.x + 4][threadIdx.y ]; + dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2]; + dy += texLookups[threadIdx.x + 4][threadIdx.y + 4]; + + float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y]; + + Edx[tid2] = dx * g; + Edy[tid2] = dy * g; + } + + __syncthreads(); + + // This is a scan to get the summed dx, dy values. + // Gets 128-168 + if (tid < 41) + { + Edx[tid] += Edx[tid + 128]; + } + __syncthreads(); + if (tid < 64) + { + Edx[tid] += Edx[tid + 64]; + } + __syncthreads(); + if (tid < 32) + { + volatile float* smem = Edx; + + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + // Gets 128-168 + if (tid < 41) + { + Edy[tid] += Edy[tid + 128]; + } + __syncthreads(); + if (tid < 64) + { + Edy[tid] += Edy[tid + 64]; + } + __syncthreads(); + if (tid < 32) + { + volatile float* smem = Edy; + + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + // Thread 0 saves back the result. 
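+        // The dominant orientation is the angle of the summed, Gaussian-weighted
+        // Haar responses; it is negated and converted to degrees for KeyPoint::angle.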
+        if (tid == 0)
+        {
+            features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);
+        }
+    }
+
+    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures)
+    {
+        dim3 threads;
+        threads.x = 17;
+        threads.y = 17;
+
+        dim3 grid;
+        grid.x = nFeatures;
+        grid.y = 1;
+        grid.z = 1;
+
+        find_orientation<<<grid, threads>>>(features);
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Descriptors
+
+    // precomputed values for a Gaussian with a standard deviation of 3.3
+    // - it appears SURF uses a different value, but not sure what it is
+    __constant__ float c_3p3gauss1D[20] =
+    {
+        0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
+        0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
+        0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
+        0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
+    };
+
+    template <int BLOCK_DIM_X>
+    __global__ void normalize_descriptors(PtrStepf descriptors)
+    {
+        // no need for thread ID
+        float* descriptor_base = descriptors.ptr(blockIdx.x);
+
+        // read in the unnormalized descriptor values (squared)
+        __shared__ float sqDesc[BLOCK_DIM_X];
+        const float lookup = descriptor_base[threadIdx.x];
+        sqDesc[threadIdx.x] = lookup * lookup;
+        __syncthreads();
+
+        if (BLOCK_DIM_X >= 128)
+        {
+            if (threadIdx.x < 64)
+                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
+            __syncthreads();
+        }
+
+        // reduction to get total
+        if (threadIdx.x < 32)
+        {
+            volatile float* smem = sqDesc;
+
+            smem[threadIdx.x] += smem[threadIdx.x + 32];
+            smem[threadIdx.x] += smem[threadIdx.x + 16];
+            smem[threadIdx.x] += smem[threadIdx.x + 8];
+            smem[threadIdx.x] += smem[threadIdx.x + 4];
+            smem[threadIdx.x] += smem[threadIdx.x + 2];
+            smem[threadIdx.x] += smem[threadIdx.x + 1];
+        }
+
+        // compute length (square root)
+        __shared__ float len;
+        if (threadIdx.x == 0)
+        {
+            len = sqrtf(sqDesc[0]);
+        }
+        __syncthreads();
+
+        // normalize and store in output
+        descriptor_base[threadIdx.x] = lookup / len;
+    }
+
+    __device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
+    {
+        // get the interest point parameters (x, y, size, response, angle)
+        __shared__ float ipt[5];
+        if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0)
+        {
+            ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
+        }
+        __syncthreads();
+
+        float sin_theta, cos_theta;
+        sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
+
+        // Compute sampling points
+        // since grids are 2D, need to compute xBlock and yBlock indices
+        const int xIndex = threadIdx.y * 5 + threadIdx.x % 5;
+        const int yIndex = threadIdx.z * 5 + threadIdx.x / 5;
+
+        // Compute rotated sampling points
+        // (clockwise rotation since we are rotating the lattice)
+        // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
+        const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+            + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
+        const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+            + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
+
+        // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
+        //    a b c
+        //    d   f
+        //    g h i
+
+        const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
+        const
float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]); + const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); + const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y); + const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y); + const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); + const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]); + const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); + + // compute axis-aligned HaarX, HaarY + // (could group the additions together into multiplications) + const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular) + const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx + const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy + + // rotate responses (store all dxs then all dys) + // - counterclockwise rotation to rotate back to zero orientation + sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx + sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy + } + + __device__ void reduce_sum(float sdata1[4][4][25], float sdata2[4][4][25], float sdata3[4][4][25], + float sdata4[4][4][25]) + { + // first step is to reduce from 25 to 16 + if (threadIdx.x < 9) // use 9 threads + { + sdata1[threadIdx.z][threadIdx.y][threadIdx.x] += sdata1[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata2[threadIdx.z][threadIdx.y][threadIdx.x] += sdata2[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata3[threadIdx.z][threadIdx.y][threadIdx.x] += sdata3[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata4[threadIdx.z][threadIdx.y][threadIdx.x] += sdata4[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + } + __syncthreads(); + + // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) + if (threadIdx.x < 16) + { + volatile float* smem = sdata1[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata2[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata3[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata4[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + } + } + + // Spawn 16 blocks per interest point + // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location + __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) + { + // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region) + __shared__ float sdx[4][4][25]; + __shared__ float sdy[4][4][25]; + + calc_dx_dy(sdx, sdy, features); + __syncthreads(); + + __shared__ float sdxabs[4][4][25]; + __shared__ float sdyabs[4][4][25]; + + sdxabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); // |dx| array + 
sdyabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); // |dy| array + __syncthreads(); + + reduce_sum(sdx, sdy, sdxabs, sdyabs); + + float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 16 + threadIdx.y * 4; + + // write dx, dy, |dx|, |dy| + if (threadIdx.x == 0) + { + descriptors_block[0] = sdx[threadIdx.z][threadIdx.y][0]; + descriptors_block[1] = sdy[threadIdx.z][threadIdx.y][0]; + descriptors_block[2] = sdxabs[threadIdx.z][threadIdx.y][0]; + descriptors_block[3] = sdyabs[threadIdx.z][threadIdx.y][0]; + } + } + + // Spawn 16 blocks per interest point + // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location + __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) + { + // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) + __shared__ float sdx[4][4][25]; + __shared__ float sdy[4][4][25]; + + calc_dx_dy(sdx, sdy, features); + __syncthreads(); + + // sum (reduce) 5x5 area response + __shared__ float sd1[4][4][25]; + __shared__ float sd2[4][4][25]; + __shared__ float sdabs1[4][4][25]; + __shared__ float sdabs2[4][4][25]; + + if (sdy[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + } + else + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); + } + __syncthreads(); + + reduce_sum(sd1, sd2, sdabs1, sdabs2); + + float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 32 + threadIdx.y * 8; + + // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) + if (threadIdx.x == 0) + { + descriptors_block[0] = sd1[threadIdx.z][threadIdx.y][0]; + descriptors_block[1] = sdabs1[threadIdx.z][threadIdx.y][0]; + descriptors_block[2] = sd2[threadIdx.z][threadIdx.y][0]; + descriptors_block[3] = sdabs2[threadIdx.z][threadIdx.y][0]; + } + __syncthreads(); + + if (sdx[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + } + else + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); + } + __syncthreads(); + + reduce_sum(sd1, sd2, sdabs1, sdabs2); + + // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) + if (threadIdx.x == 0) + { + descriptors_block[4] = sd1[threadIdx.z][threadIdx.y][0]; + descriptors_block[5] = sdabs1[threadIdx.z][threadIdx.y][0]; + descriptors_block[6] = sd2[threadIdx.z][threadIdx.y][0]; + descriptors_block[7] = sdabs2[threadIdx.z][threadIdx.y][0]; + } + } + + void compute_descriptors_gpu(const DevMem2Df& descriptors, const 
KeyPoint_GPU* features, int nFeatures)
+    {
+        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+        // NOTE: the launch geometry below is inferred from the kernels' thread indexing
+        // (one block per feature, 25x4x4 threads for the descriptor kernels).
+        if (descriptors.cols == 64)
+        {
+            compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
+            cudaSafeCall( cudaThreadSynchronize() );
+
+            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+        else
+        {
+            compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
+            cudaSafeCall( cudaThreadSynchronize() );
+
+            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+    }
+}}}
diff --git a/modules/gpu/src/cuda/surf_key_point.h b/modules/gpu/src/cuda/surf_key_point.h
new file mode 100644
index 0000000000..aa3612e186
--- /dev/null
+++ b/modules/gpu/src/cuda/surf_key_point.h
@@ -0,0 +1,76 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_SURF_KEY_POINT_H__
+#define __OPENCV_SURF_KEY_POINT_H__
+
+namespace cv
+{
+    namespace gpu
+    {
+        namespace surf
+        {
+            struct KeyPoint_GPU
+            {
+                float x;
+                float y;
+                float size;
+                float response;
+                float angle;
+                float octave;
+            };
+
+            enum KeypointLayout
+            {
+                SF_X,
+                SF_Y,
+                SF_SIZE,
+                SF_RESPONSE,
+                SF_ANGLE,
+                SF_OCTAVE,
+                SF_FEATURE_STRIDE
+            };
+        }
+    }
+}
+
+#endif // __OPENCV_SURF_KEY_POINT_H__
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 337af7aea8..48f77eee1e 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -304,7 +304,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub

 void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value)
 {
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1);
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1);

     dst.create(src.rows + top + bottom, src.cols + left + right, src.type());

@@ -318,26 +318,34 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
     switch (src.type())
     {
     case CV_8UC1:
-        {
+        {
             Npp8u nVal = static_cast<Npp8u>(value[0]);
             nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), src.step, srcsz,
-                dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
+                dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
     case CV_8UC4:
-        {
+        {
             Npp8u nVal[] = {static_cast<Npp8u>(value[0]), static_cast<Npp8u>(value[1]), static_cast<Npp8u>(value[2]), static_cast<Npp8u>(value[3])};
             nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), src.step, srcsz,
                 dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
     case CV_32SC1:
-        {
+        {
             Npp32s nVal = static_cast<Npp32s>(value[0]);
             nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
                 dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
+    case CV_32FC1:
+        {
+            float val = static_cast<float>(value[0]);
+            Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
+            nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
+                dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
+            break;
+        }
     default:
         CV_Assert(!"Unsupported source type");
     }
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index faf9a18e85..195deaff3c 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -67,6 +67,7 @@
 #include "opencv2/gpu/stream_accessor.hpp"
 #include "npp.h"
 #include "npp_staging.h"
+#include "surf_key_point.h"

 #define CUDART_MINIMUM_REQUIRED_VERSION 3020
 #define NPP_MINIMUM_REQUIRED_VERSION 3216
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
new file mode 100644
index 0000000000..ab67ff3a82
--- /dev/null
+++ b/modules/gpu/src/surf.cpp
@@ -0,0 +1,356 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
+void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
+
+#else /* !defined (HAVE_CUDA) */
+
+namespace cv { namespace gpu { namespace surf
+{
+    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
+
+    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
+        int nIntervals, int x_size, int y_size);
+
+    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
+        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
+
+    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
+
+    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
+}}}
+
+using namespace cv::gpu::surf;
+
+namespace
+{
+    class SURF_GPU_Invoker : private SURFParams_GPU
+    {
+    public:
+        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img) :
+            SURFParams_GPU(surf),
+
+            img_float(surf.img_float), img_float_tr(surf.img_float_tr),
+
+            sum(surf.sum),
+
+            hessianBuffer(surf.hessianBuffer),
+            maxPosBuffer(surf.maxPosBuffer),
+            featuresBuffer(surf.featuresBuffer),
+
+            img_cols(img.cols), img_rows(img.rows),
+
+            mask_width(0), mask_height(0),
+            featureCounter(0), maxCounter(0)
+        {
+            CV_Assert((img.type() == CV_8UC1 || img.type() == CV_32FC1) && nOctaves > 0 && nIntervals > 2);
+            CV_Assert(hasAtomicsSupport(getDevice()));
+
+            max_features = static_cast<int>(img.size().area() * featuresRatio);
+            max_candidates = static_cast<int>(1.5 * max_features);
+
+            featuresBuffer.create(1, max_features, CV_32FC(6));
+            maxPosBuffer.create(1, max_candidates, CV_32SC4);
+
+            mask_width = l2 * 0.5f;
+            mask_height = 1.0f + l1;
+
+            // Dxy gap half-width
+            float dxy_center_offset = 0.5f * (l4 + l3);
+            // Dxy squares half-width
+            float dxy_half_width = 0.5f * l3;
+
+            // rescale edge_scale to fit with the filter dimensions
+            float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);
+
+            // Compute border required such that the filters don't overstep the image boundaries
+            float smax0 = 2.0f * initialScale + 0.5f;
+            int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
+
+            int width0 = (img_cols - 2 * border0) / initialStep;
+            int height0 = (img_rows - 2 * border0) / initialStep;
+
+            uploadConstant("cv::gpu::surf::c_max_candidates",    max_candidates);
+            uploadConstant("cv::gpu::surf::c_max_features",      max_features);
+            uploadConstant("cv::gpu::surf::c_nIntervals",        nIntervals);
+            uploadConstant("cv::gpu::surf::c_mask_width",        mask_width);
+            uploadConstant("cv::gpu::surf::c_mask_height",       mask_height);
+            uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);
+            uploadConstant("cv::gpu::surf::c_dxy_half_width",    dxy_half_width);
+            uploadConstant("cv::gpu::surf::c_dxy_scale",         dxy_scale);
+            uploadConstant("cv::gpu::surf::c_initialScale",      initialScale);
+            uploadConstant("cv::gpu::surf::c_threshold",         threshold);
+
+            hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
+
+            if (img.type() == CV_32FC1)
+                img_float = img;
+            else
+                img.convertTo(img_float, CV_32F, 1.0 / 255.0);
+
+            transpose(img_float, img_float_tr);
+            columnSum(img_float_tr, img_float_tr);
+            transpose(img_float_tr, sum);
+            columnSum(sum, sum);
+
+            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sum);
+        }
+
+        ~SURF_GPU_Invoker()
+        {
+            unbindTexture("cv::gpu::surf::sumTex");
+        }
+
+        void detectKeypoints(GpuMat& keypoints)
+        {
+            for(int octave = 0; octave < nOctaves; ++octave)
+            {
+                int step = initialStep * (1 << octave);
+
+                // Compute border required such that the filters don't overstep the image boundaries
+                float d = (initialScale * (1 << octave)) / (nIntervals - 2);
+                float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f;
+                int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
+
+                int x_size = (img_cols - 2 * border) / step;
+                int y_size = (img_rows - 2 * border) / step;
+
+                if (x_size <= 0 || y_size <= 0)
+                    break;
+
+                uploadConstant("cv::gpu::surf::c_octave", octave);
+                uploadConstant("cv::gpu::surf::c_x_size", x_size);
+                uploadConstant("cv::gpu::surf::c_y_size", y_size);
+                uploadConstant("cv::gpu::surf::c_border", border);
+                uploadConstant("cv::gpu::surf::c_step",   step);
+
+                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
+
+                // Reset the candidate count.
+                maxCounter = 0;
+
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size);
+
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
+
+                fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
+                    featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
+
+                featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
+            }
+
+            featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
+        }
+
+        void findOrientation(GpuMat& keypoints)
+        {
+            if (keypoints.cols > 0)
+                find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+        }
+
+        void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
+        {
+            if (keypoints.cols > 0)
+            {
+                descriptors.create(keypoints.cols, descriptorSize, CV_32F);
+                compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+            }
+        }
+
+    private:
+        GpuMat& img_float;
+        GpuMat& img_float_tr;
+
+        GpuMat& sum;
+
+        GpuMat& hessianBuffer;
+        GpuMat& maxPosBuffer;
+        GpuMat& featuresBuffer;
+
+        int img_cols, img_rows;
+
+        float mask_width, mask_height;
+
+        unsigned int featureCounter;
+        unsigned int maxCounter;
+
+        int max_candidates;
+        int max_features;
+    };
+}
+
+int cv::gpu::SURF_GPU::descriptorSize() const
+{
+    return extended ? 128 : 64;
+}
+
+void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
+{
+    Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
+
+    const KeyPoint* keypoints_ptr = &keypoints[0];
+    KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
+    for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+    {
+        const KeyPoint& kp = *keypoints_ptr;
+        KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+
+        gkp.x = kp.pt.x;
+        gkp.y = kp.pt.y;
+
+        gkp.size = kp.size;
+
+        gkp.octave = static_cast<float>(kp.octave);
+        gkp.angle = kp.angle;
+        gkp.response = kp.response;
+    }
+
+    keypointsGPU.upload(keypointsCPU);
+}
+
+void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
+{
+    CV_Assert(keypointsGPU.type() == CV_32FC(6) && keypointsGPU.rows == 1);
+
+    Mat keypointsCPU = keypointsGPU;
+    keypoints.resize(keypointsGPU.cols);
+
+    KeyPoint* keypoints_ptr = &keypoints[0];
+    const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
+    for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+    {
+        KeyPoint& kp = *keypoints_ptr;
+        const KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+
+        kp.pt.x = gkp.x;
+        kp.pt.y = gkp.y;
+
+        kp.size = gkp.size;
+
+        kp.octave = static_cast<int>(gkp.octave);
+        kp.angle = gkp.angle;
+        kp.response = gkp.response;
+    }
+}
+
+void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors)
+{
+    CV_Assert(descriptorsGPU.type() == CV_32F);
+
+    descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
+    Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
+    descriptorsGPU.download(descriptorsCPU);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints)
+{
+    SURF_GPU_Invoker surf(*this, img);
+
+    surf.detectKeypoints(keypoints);
+
+    surf.findOrientation(keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    SURF_GPU_Invoker surf(*this, img);
+
+    if (!useProvidedKeypoints)
+        surf.detectKeypoints(keypoints);
+
+    if (calcOrientation)
+        surf.findOrientation(keypoints);
+
+    surf.computeDescriptors(keypoints, descriptors, descriptorSize());
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints)
+{
+    GpuMat keypointsGPU;
+
+    (*this)(img, keypointsGPU);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, GpuMat& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    GpuMat keypointsGPU;
+
+    if (useProvidedKeypoints)
+        uploadKeypoints(keypoints, keypointsGPU);
+
+    (*this)(img, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, vector<float>& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    GpuMat descriptorsGPU;
+
+    (*this)(img, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
+
+    downloadDescriptors(descriptorsGPU, descriptors);
+}
+
+#endif /* !defined (HAVE_CUDA) */
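
A minimal usage sketch for the API this patch adds (illustrative only, not part of
the patch): the image path is a placeholder, error handling is omitted, and the
parameter values shown are just the defaults declared in SURFParams_GPU.

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    int main()
    {
        cv::Mat img = cv::imread("scene.png", 0);   // CV_8UC1 input (0..255)
        cv::gpu::GpuMat d_img(img);                 // upload the image to the device

        cv::gpu::SURF_GPU surf;
        surf.threshold = 0.1f;                      // interest operator threshold
        surf.extended = false;                      // 64-element descriptors

        // detect keypoints and compute their descriptors in one call
        std::vector<cv::KeyPoint> keypoints;
        cv::gpu::GpuMat d_descriptors;
        surf(d_img, keypoints, d_descriptors);

        // bring the descriptors back to the host
        std::vector<float> descriptors;
        cv::gpu::SURF_GPU::downloadDescriptors(d_descriptors, descriptors);

        return 0;
    }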