From 0e45a637c4f3d9fdf2d3f27eeb217681b762867a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 20 Dec 2010 12:49:40 +0000
Subject: [PATCH] added SURF_GPU.

added support of CV_32FC1 type to gpu copyMakeBorder.
---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  96 ++-
 modules/gpu/src/brute_force_matcher.cpp |   4 +-
 modules/gpu/src/cuda/surf.cu            | 966 ++++++++++++++++++++++++
 modules/gpu/src/cuda/surf_key_point.h   |  76 ++
 modules/gpu/src/imgproc_gpu.cpp         |  24 +-
 modules/gpu/src/precomp.hpp             |   1 +
 modules/gpu/src/surf.cpp                | 356 +++++++++
 7 files changed, 1511 insertions(+), 12 deletions(-)
 create mode 100644 modules/gpu/src/cuda/surf.cu
 create mode 100644 modules/gpu/src/cuda/surf_key_point.h
 create mode 100644 modules/gpu/src/surf.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index a8c3a146ed..3e0c0de2c6 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -591,7 +591,7 @@ namespace cv
         CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR);

         //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-        //! supports CV_8UC1, CV_8UC4, CV_32SC1 types
+        //! supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types
         CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar());

         //! computes the integral image and integral for the squared image
@@ -1366,6 +1366,100 @@ namespace cv
             Ptr<CvHaarClassifierCascade> oldCascade;
         };

+        ////////////////////////////////// SURF //////////////////////////////////////////
+
+        struct CV_EXPORTS SURFParams_GPU
+        {
+            SURFParams_GPU() :
+                threshold(0.1f),
+                nOctaves(4),
+                nIntervals(4),
+                initialScale(2.f),
+
+                l1(3.f/1.5f),
+                l2(5.f/1.5f),
+                l3(3.f/1.5f),
+                l4(1.f/1.5f),
+                edgeScale(0.81f),
+                initialStep(1),
+
+                extended(true),
+
+                featuresRatio(0.01f)
+            {
+            }
+
+            //! The interest operator threshold
+            float threshold;
+            //! The number of octaves to process
+            int nOctaves;
+            //! The number of intervals in each octave
+            int nIntervals;
+            //! The scale associated with the first interval of the first octave
+            float initialScale;
+
+            //! mask parameter l_1
+            float l1;
+            //! mask parameter l_2
+            float l2;
+            //! mask parameter l_3
+            float l3;
+            //! mask parameter l_4
+            float l4;
+            //! The amount to scale the edge rejection mask
+            float edgeScale;
+            //! The initial sampling step in pixels.
+            int initialStep;
+
+            //! true, if 128-element extended descriptors are computed; false for 64-element descriptors
+            bool extended;
+
+            //! max features = featuresRatio * img.size().area()
+            float featuresRatio;
+        };
+
+        class CV_EXPORTS SURF_GPU : public SURFParams_GPU
+        {
+        public:
+            //! returns the descriptor size in floats (64 or 128)
+            int descriptorSize() const;
+
+            //! upload host keypoints to device memory
+            static void uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
+            //! download keypoints from device to host memory
+            static void downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints);
+
+            //! download descriptors from device to host memory
+            static void downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors);
+
+            //! finds the keypoints using the fast hessian detector used in SURF
+            //! supports CV_8UC1 (0..255) and CV_32FC1 (0..1) images
+            //! keypoints will have 1 row and type CV_32FC(6)
+            //! the i-th column contains the i-th keypoint in the format: (x, y, size, response, angle, octave)
+            void operator()(const GpuMat& img, GpuMat& keypoints);
+            //! finds the keypoints and computes their descriptors.
+            //! Optionally it can compute descriptors for user-provided keypoints and recompute the keypoints' orientation
+            void operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints);
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            void operator()(const GpuMat& img, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+                bool useProvidedKeypoints = false, bool calcOrientation = true);
+
+            GpuMat img_float;
+            GpuMat img_float_tr;
+
+            GpuMat sum;
+
+            GpuMat hessianBuffer;
+            GpuMat maxPosBuffer;
+            GpuMat featuresBuffer;
+        };
+
     }

     //! Speckle filtering - filters small connected components on diparity image.
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 626dfa0577..3469e5be44 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -493,9 +493,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
         }
     };

-    int major, minor;
-    getComputeCapability(getDevice(), major, minor);
-    CV_Assert(100 * major + 10 * minor >= 110); // works onle on device with CC >= 1.1
+    CV_Assert(hasAtomicsSupport(getDevice()));

     const int nQuery = queryDescs.rows;
     const int nTrain = trainDescs.rows;
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
new file mode 100644
index 0000000000..29b86ee2bb
--- /dev/null
+++ b/modules/gpu/src/cuda/surf.cu
@@ -0,0 +1,966 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
+//
+// The original code was written by Paul Furgale and Chi Hay Tong
+// and later optimized and prepared for integration into OpenCV by Itseez.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "surf_key_point.h"
+#include "opencv2/gpu/device/limits_gpu.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+#define CV_PI 3.1415926535897932384626433832795f
+
+namespace cv { namespace gpu { namespace surf
+{
+    ////////////////////////////////////////////////////////////////////////
+    // Help funcs
+
+    // Wrapper for host reference to pass into kernel
+    template <typename T>
+    class DeviceReference
+    {
+    public:
+        explicit DeviceReference(T& host_val) : d_ptr(0), h_ptr(&host_val)
+        {
+            cudaSafeCall( cudaMalloc((void**)&d_ptr, sizeof(T)) );
+            cudaSafeCall( cudaMemcpy(d_ptr, h_ptr, sizeof(T), cudaMemcpyHostToDevice) );
+        }
+
+        ~DeviceReference()
+        {
+            cudaSafeCall( cudaMemcpy(h_ptr, d_ptr, sizeof(T), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaFree(d_ptr) );
+        }
+
+        // Casting to device pointer
+        operator T*() {return d_ptr;}
+        operator const T*() const {return d_ptr;}
+    private:
+        T* d_ptr;
+        T* h_ptr;
+    };
+
+    __device__ void clearLastBit(int* f)
+    {
+        *f &= ~0x1;
+    }
+    __device__ void clearLastBit(float& f)
+    {
+        clearLastBit((int*)&f);
+    }
+
+    __device__ void setLastBit(int* f)
+    {
+        *f |= 0x1;
+    }
+    __device__ void setLastBit(float& f)
+    {
+        setLastBit((int*)&f);
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Global parameters
+
+    // The maximum number of features (before subpixel interpolation) that memory is reserved for.
+    __constant__ int c_max_candidates;
+    // The maximum number of features that memory is reserved for.
+    __constant__ int c_max_features;
+    // The number of intervals in the octave.
+    __constant__ int c_nIntervals;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_mask_width;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_mask_height;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_center_offset;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_half_width;
+    // Mask sizes derived from the mask parameters
+    __constant__ float c_dxy_scale;
+    // The scale associated with the first interval of the first octave
+    __constant__ float c_initialScale;
+    //! The interest operator threshold
+    __constant__ float c_threshold;
+
+    // The octave
+    __constant__ int c_octave;
+    // The width of the octave buffer.
+    __constant__ int c_x_size;
+    // The height of the octave buffer.
+    __constant__ int c_y_size;
+    // The size of the octave border in pixels.
+    __constant__ int c_border;
+    // The step size used in this octave in pixels.
+    __constant__ int c_step;
+
+    ////////////////////////////////////////////////////////////////////////
+    // Integral image texture
+
+    texture<float, 2, cudaReadModeElementType> sumTex(0, cudaFilterModeLinear, cudaAddressModeClamp);
+
+    __device__ float iiAreaLookupCDHalfWH(float cx, float cy, float halfWidth, float halfHeight)
+    {
+        float result = 0.f;
+
+        result += tex2D(sumTex, cx - halfWidth, cy - halfHeight);
+        result -= tex2D(sumTex, cx + halfWidth, cy - halfHeight);
+        result -= tex2D(sumTex, cx - halfWidth, cy + halfHeight);
+        result += tex2D(sumTex, cx + halfWidth, cy + halfHeight);
+
+        return result;
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Hessian
+
+    __device__ float evalDyy(float x, float y, float t, float mask_width, float mask_height, float fscale)
+    {
+        float Dyy = 0.f;
+
+        Dyy += iiAreaLookupCDHalfWH(x, y, mask_width, mask_height);
+        Dyy -= t * iiAreaLookupCDHalfWH(x, y, mask_width, fscale);
+
+        Dyy *= 1.0f / (fscale * fscale);
+
+        return Dyy;
+    }
+
+    __device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
+    {
+        float Dxx = 0.f;
+
+        Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
+        Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale  , mask_width);
+
+        Dxx *= 1.0f / (fscale * fscale);
+
+        return Dxx;
+    }
+
+    __device__ float evalDxy(float x, float y, float fscale)
+    {
+        float center_offset = c_dxy_center_offset * fscale;
+        float half_width = c_dxy_half_width * fscale;
+
+        float Dxy = 0.f;
+
+        Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
+        Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
+        Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
+        Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);
+
+        Dxy *= 1.0f / (fscale * fscale);
+
+        return Dxy;
+    }
+
+    __device__ float calcScale(int hidx_z)
+    {
+        float d = (c_initialScale * (1 << c_octave)) / (c_nIntervals - 2);
+        return c_initialScale * (1 << c_octave) + d * (hidx_z - 1.0f) + 0.5f;
+    }
+
+    __global__ void fasthessian(PtrStepf hessianBuffer)
+    {
+        // Determine the indices in the Hessian buffer
+        int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
+        int hidx_y = threadIdx.y + blockIdx.y * blockDim.y;
+        int hidx_z = threadIdx.z;
+
+        float fscale = calcScale(hidx_z);
+
+        // Compute the lookup location of the mask center
+        float x = hidx_x * c_step + c_border;
+        float y = hidx_y * c_step + c_border;
+
+        // Scale the mask dimensions according to the scale
+        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
+        {
+            float mask_width = c_mask_width * fscale;
+            float mask_height = c_mask_height * fscale;
+
+            // Compute the filter responses
+            float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
+            float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
+            float Dxy = evalDxy(x, y, fscale);
+
+            // Combine the responses and store the Laplacian sign
+            float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
+
+            if (Dxx + Dyy > 0.f)
+                setLastBit(result);
+            else
+                clearLastBit(result);
+
+            hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
+        }
+    }
+
+    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)
+    {
+        dim3 threads;
+        threads.x = 16;
+        threads.y = 8;
+        threads.z = nIntervals;
+
+        dim3 grid;
+        grid.x = divUp(x_size, threads.x);
+        grid.y = divUp(y_size, threads.y);
+        grid.z = 1;
+
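+        // One thread per (x, y, interval) sample of this octave; each thread
+        // writes a single Hessian response into the octave buffer.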
+        fasthessian<<<grid, threads>>>(hessianBuffer);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // NONMAX
+
+    __global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)
+    {
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+
+        extern __shared__ float fh_vals[];
+
+        // The hidx variables are the indices to the hessian buffer.
+        int hidx_x = threadIdx.x + blockIdx.x * (blockDim.x - 2);
+        int hidx_y = threadIdx.y + blockIdx.y * (blockDim.y - 2);
+        int hidx_z = threadIdx.z;
+        int localLin = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+        // Is this thread within the hessian buffer?
+        if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
+        {
+            fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];
+        }
+        __syncthreads();
+
+        // Is this location one of the ones being processed for nonmax suppression.
+        // Blocks overlap by one so we don't process the border threads.
+        bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1
+                      && threadIdx.y > 0 && threadIdx.y < blockDim.y-1 && hidx_y < c_y_size - 1
+                      && threadIdx.z > 0 && threadIdx.z < blockDim.z-1;
+
+        float val = fh_vals[localLin];
+
+        if (inBounds2 && val >= c_threshold)
+        {
+            // Check to see if we have a max (in its 26 neighbours)
+            int zoff = blockDim.x * blockDim.y;
+            bool condmax = val > fh_vals[localLin + 1]
+                        && val > fh_vals[localLin - 1]
+                        && val > fh_vals[localLin - blockDim.x + 1]
+                        && val > fh_vals[localLin - blockDim.x    ]
+                        && val > fh_vals[localLin - blockDim.x - 1]
+                        && val > fh_vals[localLin + blockDim.x + 1]
+                        && val > fh_vals[localLin + blockDim.x    ]
+                        && val > fh_vals[localLin + blockDim.x - 1]
+
+                        && val > fh_vals[localLin - zoff + 1]
+                        && val > fh_vals[localLin - zoff    ]
+                        && val > fh_vals[localLin - zoff - 1]
+                        && val > fh_vals[localLin - zoff - blockDim.x + 1]
+                        && val > fh_vals[localLin - zoff - blockDim.x    ]
+                        && val > fh_vals[localLin - zoff - blockDim.x - 1]
+                        && val > fh_vals[localLin - zoff + blockDim.x + 1]
+                        && val > fh_vals[localLin - zoff + blockDim.x    ]
+                        && val > fh_vals[localLin - zoff + blockDim.x - 1]
+
+                        && val > fh_vals[localLin + zoff + 1]
+                        && val > fh_vals[localLin + zoff    ]
+                        && val > fh_vals[localLin + zoff - 1]
+                        && val > fh_vals[localLin + zoff - blockDim.x + 1]
+                        && val > fh_vals[localLin + zoff - blockDim.x    ]
+                        && val > fh_vals[localLin + zoff - blockDim.x - 1]
+                        && val > fh_vals[localLin + zoff + blockDim.x + 1]
+                        && val > fh_vals[localLin + zoff + blockDim.x    ]
+                        && val > fh_vals[localLin + zoff + blockDim.x - 1]
+                        ;
+
+            if(condmax)
+            {
+                unsigned int i = atomicInc(maxCounter,(unsigned int) -1);
+
+                if (i < c_max_candidates)
+                {
+                    int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};
+                    maxPosBuffer[i] = f;
+                }
+            }
+        }
+
+        #endif
+    }
+
+    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
+                        int nIntervals, int x_size, int y_size)
+    {
+        dim3 threads;
+        threads.x = 16;
+        threads.y = 8;
+        threads.z = nIntervals;
+
+        dim3 grid;
+        grid.x = divUp(x_size, threads.x - 2);
+        grid.y = divUp(y_size, threads.y - 2);
+        grid.z = 1;
+
+        const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);
+
+        DeviceReference<unsigned int> maxCounterWrapper(maxCounter);
+
+        nonmaxonly<<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // INTERPOLATION
+
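+    // Each detected maximum is refined with one 3x3x3 thread block: fit a
+    // quadratic to its (x, y, scale) neighbourhood, solve H * x = -dD for the
+    // subpixel offset, and keep the feature only if the offset stays within one cell.
+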
#define MID_IDX 1 + __global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, + KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter) + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 + + int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x; + int hidx_y = maxPosBuffer[blockIdx.x].y - 1 + threadIdx.y; + int hidx_z = maxPosBuffer[blockIdx.x].z - 1 + threadIdx.z; + + __shared__ float fh_vals[3][3][3]; + __shared__ KeyPoint_GPU p; + + fh_vals[threadIdx.z][threadIdx.y][threadIdx.x] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x]; + __syncthreads(); + + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) + { + __shared__ float H[3][3]; + + //dxx + H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]; + + //dyy + H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]; + + //dss + H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] + - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ] + + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]; + + //dxy + H[0][1]= 0.25f* + (fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] + + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]); + + //dxs + H[0][2]= 0.25f* + (fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] - + fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] - + fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] + + fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]); + + //dys + H[1][2]= 0.25f* + (fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] - + fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] - + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] + + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]); + + //dyx = dxy + H[1][0] = H[0][1]; + + //dsx = dxs + H[2][0] = H[0][2]; + + //dsy = dys + H[2][1] = H[1][2]; + + __shared__ float dD[3]; + + //dx + dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] - + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]); + //dy + dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] - + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]); + //ds + dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] - + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]); + + __shared__ float invdet; + invdet = 1.f / + ( + H[0][0]*H[1][1]*H[2][2] + + H[0][1]*H[1][2]*H[2][0] + + H[0][2]*H[1][0]*H[2][1] + - H[0][0]*H[1][2]*H[2][1] + - H[0][1]*H[1][0]*H[2][2] + - H[0][2]*H[1][1]*H[2][0] + ); + + // // 1-based entries of a 3x3 inverse + // /* [ |a22 a23| |a12 a13| |a12 a13|] */ + // /* [ |a32 a33| -|a32 a33| |a22 a23|] */ + // /* [ ] */ + // /* [ |a21 a23| |a11 a13| |a11 a13|] */ + // /* A^(-1) = [-|a31 a33| |a31 a33| -|a21 a23|] / d */ + // /* [ ] */ + // /* [ |a21 a22| |a11 a12| |a11 a12|] */ + // /* [ |a31 a32| -|a31 a32| |a21 a22|] */ + + __shared__ float Hinv[3][3]; + Hinv[0][0] = invdet*(H[1][1]*H[2][2]-H[1][2]*H[2][1]); + Hinv[0][1] = -invdet*(H[0][1]*H[2][2]-H[0][2]*H[2][1]); + Hinv[0][2] = invdet*(H[0][1]*H[1][2]-H[0][2]*H[1][1]); + + Hinv[1][0] = -invdet*(H[1][0]*H[2][2]-H[1][2]*H[2][0]); + Hinv[1][1] = invdet*(H[0][0]*H[2][2]-H[0][2]*H[2][0]); + Hinv[1][2] = -invdet*(H[0][0]*H[1][2]-H[0][2]*H[1][0]); + + Hinv[2][0] = invdet*(H[1][0]*H[2][1]-H[1][1]*H[2][0]); + Hinv[2][1] = -invdet*(H[0][0]*H[2][1]-H[0][1]*H[2][0]); + Hinv[2][2] = invdet*(H[0][0]*H[1][1]-H[0][1]*H[1][0]); + + __shared__ float x[3]; + + x[0] = -(Hinv[0][0]*(dD[0]) + Hinv[0][1]*(dD[1]) + Hinv[0][2]*(dD[2])); + x[1] = 
-(Hinv[1][0]*(dD[0]) + Hinv[1][1]*(dD[1]) + Hinv[1][2]*(dD[2]));
+            x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));
+
+            if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f)
+            {
+                // if the step is within the interpolation region, perform it
+
+                // Get a new feature index.
+                unsigned int i = atomicInc(featureCounter, (unsigned int)-1);
+
+                if (i < c_max_features)
+                {
+                    p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;
+                    p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;
+
+                    if (x[2] > 0)
+                    {
+                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
+                        float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);
+
+                        p.size = (1.f - x[2]) * a + x[2] * b;
+                    }
+                    else
+                    {
+                        float a = calcScale(maxPosBuffer[blockIdx.x].z);
+                        float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);
+
+                        p.size = (1.f + x[2]) * a - x[2] * b;
+                    }
+
+                    p.octave = c_octave;
+
+                    p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];
+
+                    // Should we split up this transfer over many threads?
+                    featuresBuffer[i] = p;
+                }
+            } // If the subpixel interpolation worked
+        } // If this is thread 0.
+
+        #endif
+    }
+    #undef MID_IDX
+
+    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
+                                KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter)
+    {
+        dim3 threads;
+        threads.x = 3;
+        threads.y = 3;
+        threads.z = 3;
+
+        dim3 grid;
+        grid.x = maxCounter;
+        grid.y = 1;
+        grid.z = 1;
+
+        DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
+
+        fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Orientation
+
+    // precomputed values for a Gaussian with a standard deviation of 2
+    __constant__ float c_gauss1D[13] =
+    {
+        0.002215924206f, 0.008764150247f, 0.026995483257f, 0.064758797833f,
+        0.120985362260f, 0.176032663382f, 0.199471140201f, 0.176032663382f,
+        0.120985362260f, 0.064758797833f, 0.026995483257f, 0.008764150247f,
+        0.002215924206f
+    };
+
+    __global__ void find_orientation(KeyPoint_GPU* features)
+    {
+        int tid = threadIdx.y * 17 + threadIdx.x;
+        int tid2 = numeric_limits_gpu<int>::max();
+
+        if (threadIdx.x < 13 && threadIdx.y < 13)
+        {
+            tid2 = threadIdx.y * 13 + threadIdx.x;
+        }
+
+        __shared__ float texLookups[17][17];
+
+        __shared__ float Edx[13*13];
+        __shared__ float Edy[13*13];
+        __shared__ float xys[3];
+
+        // Read my x, y, size.
+        if (tid < 3)
+        {
+            xys[tid] = ((float*)(&features[blockIdx.x]))[tid];
+        }
+        __syncthreads();
+
+        // Read all texture locations into memory
+        // Maybe I should use __mul24 here?
+        texLookups[threadIdx.x][threadIdx.y] = tex2D(sumTex, xys[SF_X] + ((int)threadIdx.x - 8) * xys[SF_SIZE],
+                                                            xys[SF_Y] + ((int)threadIdx.y - 8) * xys[SF_SIZE]);
+
+        __syncthreads();
+
+        float dx = 0.f;
+        float dy = 0.f;
+
+        // Computes lookups for all points in a 13x13 lattice.
+ // - SURF says to only use a circle, but the branching logic would slow it down + // - Gaussian weighting should reduce the effects of the outer points anyway + if (tid2 < 169) + { + dx -= texLookups[threadIdx.x ][threadIdx.y ]; + dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ]; + dx -= texLookups[threadIdx.x + 4][threadIdx.y ]; + dx += texLookups[threadIdx.x ][threadIdx.y + 4]; + dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4]; + dx += texLookups[threadIdx.x + 4][threadIdx.y + 4]; + + dy -= texLookups[threadIdx.x ][threadIdx.y ]; + dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2]; + dy -= texLookups[threadIdx.x ][threadIdx.y + 4]; + dy += texLookups[threadIdx.x + 4][threadIdx.y ]; + dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2]; + dy += texLookups[threadIdx.x + 4][threadIdx.y + 4]; + + float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y]; + + Edx[tid2] = dx * g; + Edy[tid2] = dy * g; + } + + __syncthreads(); + + // This is a scan to get the summed dx, dy values. + // Gets 128-168 + if (tid < 41) + { + Edx[tid] += Edx[tid + 128]; + } + __syncthreads(); + if (tid < 64) + { + Edx[tid] += Edx[tid + 64]; + } + __syncthreads(); + if (tid < 32) + { + volatile float* smem = Edx; + + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + // Gets 128-168 + if (tid < 41) + { + Edy[tid] += Edy[tid + 128]; + } + __syncthreads(); + if (tid < 64) + { + Edy[tid] += Edy[tid + 64]; + } + __syncthreads(); + if (tid < 32) + { + volatile float* smem = Edy; + + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + // Thread 0 saves back the result. 
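+        // The dominant orientation is the angle of the summed, Gaussian-weighted
+        // Haar responses; it is negated and converted to degrees for KeyPoint::angle.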
+        if (tid == 0)
+        {
+            features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);
+        }
+    }
+
+    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures)
+    {
+        dim3 threads;
+        threads.x = 17;
+        threads.y = 17;
+
+        dim3 grid;
+        grid.x = nFeatures;
+        grid.y = 1;
+        grid.z = 1;
+
+        find_orientation<<<grid, threads>>>(features);
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    ////////////////////////////////////////////////////////////////////////
+    // Descriptors
+
+    // precomputed values for a Gaussian with a standard deviation of 3.3
+    // - it appears SURF uses a different value, but not sure what it is
+    __constant__ float c_3p3gauss1D[20] =
+    {
+        0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
+        0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
+        0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
+        0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
+    };
+
+    template <int BLOCK_DIM_X>
+    __global__ void normalize_descriptors(PtrStepf descriptors)
+    {
+        // no need for thread ID
+        float* descriptor_base = descriptors.ptr(blockIdx.x);
+
+        // read in the unnormalized descriptor values (squared)
+        __shared__ float sqDesc[BLOCK_DIM_X];
+        const float lookup = descriptor_base[threadIdx.x];
+        sqDesc[threadIdx.x] = lookup * lookup;
+        __syncthreads();
+
+        if (BLOCK_DIM_X >= 128)
+        {
+            if (threadIdx.x < 64)
+                sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
+            __syncthreads();
+        }
+
+        // reduction to get total
+        if (threadIdx.x < 32)
+        {
+            volatile float* smem = sqDesc;
+
+            smem[threadIdx.x] += smem[threadIdx.x + 32];
+            smem[threadIdx.x] += smem[threadIdx.x + 16];
+            smem[threadIdx.x] += smem[threadIdx.x + 8];
+            smem[threadIdx.x] += smem[threadIdx.x + 4];
+            smem[threadIdx.x] += smem[threadIdx.x + 2];
+            smem[threadIdx.x] += smem[threadIdx.x + 1];
+        }
+
+        // compute length (square root)
+        __shared__ float len;
+        if (threadIdx.x == 0)
+        {
+            len = sqrtf(sqDesc[0]);
+        }
+        __syncthreads();
+
+        // normalize and store in output
+        descriptor_base[threadIdx.x] = lookup / len;
+    }
+
+    __device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)
+    {
+        // get the interest point parameters (x, y, size, response, angle)
+        __shared__ float ipt[5];
+        if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0)
+        {
+            ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
+        }
+        __syncthreads();
+
+        float sin_theta, cos_theta;
+        sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);
+
+        // Compute sampling points
+        // since grids are 2D, need to compute xBlock and yBlock indices
+        const int xIndex = threadIdx.y * 5 + threadIdx.x % 5;
+        const int yIndex = threadIdx.z * 5 + threadIdx.x / 5;
+
+        // Compute rotated sampling points
+        // (clockwise rotation since we are rotating the lattice)
+        // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
+        const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+            + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
+        const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+            + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
+
+        // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
+        //    a b c
+        //    d   f
+        //    g h i
+
+        const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
+        const
float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]); + const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]); + const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y); + const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y); + const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); + const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]); + const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); + + // compute axis-aligned HaarX, HaarY + // (could group the additions together into multiplications) + const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular) + const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx + const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy + + // rotate responses (store all dxs then all dys) + // - counterclockwise rotation to rotate back to zero orientation + sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx + sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy + } + + __device__ void reduce_sum(float sdata1[4][4][25], float sdata2[4][4][25], float sdata3[4][4][25], + float sdata4[4][4][25]) + { + // first step is to reduce from 25 to 16 + if (threadIdx.x < 9) // use 9 threads + { + sdata1[threadIdx.z][threadIdx.y][threadIdx.x] += sdata1[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata2[threadIdx.z][threadIdx.y][threadIdx.x] += sdata2[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata3[threadIdx.z][threadIdx.y][threadIdx.x] += sdata3[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + sdata4[threadIdx.z][threadIdx.y][threadIdx.x] += sdata4[threadIdx.z][threadIdx.y][threadIdx.x + 16]; + } + __syncthreads(); + + // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) + if (threadIdx.x < 16) + { + volatile float* smem = sdata1[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata2[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata3[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + + smem = sdata4[threadIdx.z][threadIdx.y]; + + smem[threadIdx.x] += smem[threadIdx.x + 8]; + smem[threadIdx.x] += smem[threadIdx.x + 4]; + smem[threadIdx.x] += smem[threadIdx.x + 2]; + smem[threadIdx.x] += smem[threadIdx.x + 1]; + } + } + + // Spawn 16 blocks per interest point + // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location + __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) + { + // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region) + __shared__ float sdx[4][4][25]; + __shared__ float sdy[4][4][25]; + + calc_dx_dy(sdx, sdy, features); + __syncthreads(); + + __shared__ float sdxabs[4][4][25]; + __shared__ float sdyabs[4][4][25]; + + sdxabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); // |dx| array + 
sdyabs[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); // |dy| array + __syncthreads(); + + reduce_sum(sdx, sdy, sdxabs, sdyabs); + + float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 16 + threadIdx.y * 4; + + // write dx, dy, |dx|, |dy| + if (threadIdx.x == 0) + { + descriptors_block[0] = sdx[threadIdx.z][threadIdx.y][0]; + descriptors_block[1] = sdy[threadIdx.z][threadIdx.y][0]; + descriptors_block[2] = sdxabs[threadIdx.z][threadIdx.y][0]; + descriptors_block[3] = sdyabs[threadIdx.z][threadIdx.y][0]; + } + } + + // Spawn 16 blocks per interest point + // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location + __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) + { + // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) + __shared__ float sdx[4][4][25]; + __shared__ float sdy[4][4][25]; + + calc_dx_dy(sdx, sdy, features); + __syncthreads(); + + // sum (reduce) 5x5 area response + __shared__ float sd1[4][4][25]; + __shared__ float sd2[4][4][25]; + __shared__ float sdabs1[4][4][25]; + __shared__ float sdabs2[4][4][25]; + + if (sdy[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + } + else + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdx[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdx[threadIdx.z][threadIdx.y][threadIdx.x]); + } + __syncthreads(); + + reduce_sum(sd1, sd2, sdabs1, sdabs2); + + float* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.z * 32 + threadIdx.y * 8; + + // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) + if (threadIdx.x == 0) + { + descriptors_block[0] = sd1[threadIdx.z][threadIdx.y][0]; + descriptors_block[1] = sdabs1[threadIdx.z][threadIdx.y][0]; + descriptors_block[2] = sd2[threadIdx.z][threadIdx.y][0]; + descriptors_block[3] = sdabs2[threadIdx.z][threadIdx.y][0]; + } + __syncthreads(); + + if (sdx[threadIdx.z][threadIdx.y][threadIdx.x] >= 0) + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + } + else + { + sd1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sdabs1[threadIdx.z][threadIdx.y][threadIdx.x] = 0; + sd2[threadIdx.z][threadIdx.y][threadIdx.x] = sdy[threadIdx.z][threadIdx.y][threadIdx.x]; + sdabs2[threadIdx.z][threadIdx.y][threadIdx.x] = fabs(sdy[threadIdx.z][threadIdx.y][threadIdx.x]); + } + __syncthreads(); + + reduce_sum(sd1, sd2, sdabs1, sdabs2); + + // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) + if (threadIdx.x == 0) + { + descriptors_block[4] = sd1[threadIdx.z][threadIdx.y][0]; + descriptors_block[5] = sdabs1[threadIdx.z][threadIdx.y][0]; + descriptors_block[6] = sd2[threadIdx.z][threadIdx.y][0]; + descriptors_block[7] = sdabs2[threadIdx.z][threadIdx.y][0]; + } + } + + void compute_descriptors_gpu(const DevMem2Df& descriptors, const 
KeyPoint_GPU* features, int nFeatures)
+    {
+        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+        // NOTE: the launch geometry below is inferred from the kernels' thread indexing
+        // (one block per feature, 25x4x4 threads for the descriptor kernels).
+        if (descriptors.cols == 64)
+        {
+            compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
+            cudaSafeCall( cudaThreadSynchronize() );
+
+            normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+        else
+        {
+            compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
+            cudaSafeCall( cudaThreadSynchronize() );
+
+            normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+    }
+}}}
diff --git a/modules/gpu/src/cuda/surf_key_point.h b/modules/gpu/src/cuda/surf_key_point.h
new file mode 100644
index 0000000000..aa3612e186
--- /dev/null
+++ b/modules/gpu/src/cuda/surf_key_point.h
@@ -0,0 +1,76 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_SURF_KEY_POINT_H__
+#define __OPENCV_SURF_KEY_POINT_H__
+
+namespace cv
+{
+    namespace gpu
+    {
+        namespace surf
+        {
+            struct KeyPoint_GPU
+            {
+                float x;
+                float y;
+                float size;
+                float response;
+                float angle;
+                float octave;
+            };
+
+            enum KeypointLayout
+            {
+                SF_X,
+                SF_Y,
+                SF_SIZE,
+                SF_RESPONSE,
+                SF_ANGLE,
+                SF_OCTAVE,
+                SF_FEATURE_STRIDE
+            };
+        }
+    }
+}
+
+#endif // __OPENCV_SURF_KEY_POINT_H__
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 337af7aea8..48f77eee1e 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -304,7 +304,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub

 void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value)
 {
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1);
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1);

     dst.create(src.rows + top + bottom, src.cols + left + right, src.type());

@@ -318,26 +318,34 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
     switch (src.type())
     {
     case CV_8UC1:
-        {
+        {
             Npp8u nVal = static_cast<Npp8u>(value[0]);
             nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), src.step, srcsz,
-                dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
+                dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
     case CV_8UC4:
-        {
+        {
             Npp8u nVal[] = {static_cast<Npp8u>(value[0]), static_cast<Npp8u>(value[1]), static_cast<Npp8u>(value[2]), static_cast<Npp8u>(value[3])};
             nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), src.step, srcsz,
                 dst.ptr<Npp8u>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
     case CV_32SC1:
-        {
+        {
             Npp32s nVal = static_cast<Npp32s>(value[0]);
             nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
                 dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
             break;
-        }
+        }
+    case CV_32FC1:
+        {
+            float val = static_cast<float>(value[0]);
+            Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
+            nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), src.step, srcsz,
+                dst.ptr<Npp32s>(), dst.step, dstsz, top, left, nVal) );
+            break;
+        }
     default:
         CV_Assert(!"Unsupported source type");
     }
diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp
index faf9a18e85..195deaff3c 100644
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -67,6 +67,7 @@
 #include "opencv2/gpu/stream_accessor.hpp"
 #include "npp.h"
 #include "npp_staging.h"
+#include "surf_key_point.h"

 #define CUDART_MINIMUM_REQUIRED_VERSION 3020
 #define NPP_MINIMUM_REQUIRED_VERSION 3216
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
new file mode 100644
index 0000000000..ab67ff3a82
--- /dev/null
+++ b/modules/gpu/src/surf.cpp
@@ -0,0 +1,356 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
+void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, GpuMat&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, GpuMat&, bool, bool) { throw_nogpu(); }
+void cv::gpu::SURF_GPU::operator()(const GpuMat&, vector<KeyPoint>&, vector<float>&, bool, bool) { throw_nogpu(); }
+
+#else /* !defined (HAVE_CUDA) */
+
+namespace cv { namespace gpu { namespace surf
+{
+    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);
+
+    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter,
+        int nIntervals, int x_size, int y_size);
+
+    void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter,
+        KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);
+
+    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);
+
+    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
+}}}
+
+using namespace cv::gpu::surf;
+
+namespace
+{
+    class SURF_GPU_Invoker : private SURFParams_GPU
+    {
+    public:
+        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img) :
+            SURFParams_GPU(surf),
+
+            img_float(surf.img_float), img_float_tr(surf.img_float_tr),
+
+            sum(surf.sum),
+
+            hessianBuffer(surf.hessianBuffer),
+            maxPosBuffer(surf.maxPosBuffer),
+            featuresBuffer(surf.featuresBuffer),
+
+            img_cols(img.cols), img_rows(img.rows),
+
+            mask_width(0), mask_height(0),
+            featureCounter(0), maxCounter(0)
+        {
+            CV_Assert((img.type() == CV_8UC1 || img.type() == CV_32FC1) && nOctaves > 0 && nIntervals > 2);
+            CV_Assert(hasAtomicsSupport(getDevice()));
+
+            max_features = static_cast<int>(img.size().area() * featuresRatio);
+            max_candidates = static_cast<int>(1.5 * max_features);
+
+            featuresBuffer.create(1, max_features, CV_32FC(6));
+            maxPosBuffer.create(1, max_candidates, CV_32SC4);
+
+            mask_width = l2 * 0.5f;
+            mask_height = 1.0f + l1;
+
+            // Dxy gap half-width
+            float dxy_center_offset = 0.5f * (l4 + l3);
+            // Dxy squares half-width
+            float dxy_half_width = 0.5f * l3;
+
+            // rescale edge_scale to fit with the filter dimensions
+            float dxy_scale = edgeScale * std::pow((2.f + 2.f * l1) * l2 / (4.f * l3 * l3), 2.f);
+
+            // Compute border required such that the filters don't overstep the image boundaries
+            float smax0 = 2.0f * initialScale + 0.5f;
+            int border0 = static_cast<int>(std::ceil(smax0 * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
+
+            int width0 = (img_cols - 2 * border0) / initialStep;
+            int height0 = (img_rows - 2 * border0) / initialStep;
+
+            uploadConstant("cv::gpu::surf::c_max_candidates",    max_candidates);
+            uploadConstant("cv::gpu::surf::c_max_features",      max_features);
+            uploadConstant("cv::gpu::surf::c_nIntervals",        nIntervals);
+            uploadConstant("cv::gpu::surf::c_mask_width",        mask_width);
+            uploadConstant("cv::gpu::surf::c_mask_height",       mask_height);
+            uploadConstant("cv::gpu::surf::c_dxy_center_offset", dxy_center_offset);
+            uploadConstant("cv::gpu::surf::c_dxy_half_width",    dxy_half_width);
+            uploadConstant("cv::gpu::surf::c_dxy_scale",         dxy_scale);
+            uploadConstant("cv::gpu::surf::c_initialScale",      initialScale);
+            uploadConstant("cv::gpu::surf::c_threshold",         threshold);
+
+            hessianBuffer.create(height0 * nIntervals, width0, CV_32F);
+
+            if (img.type() == CV_32FC1)
+                img_float = img;
+            else
+                img.convertTo(img_float, CV_32F, 1.0 / 255.0);
+
+            transpose(img_float, img_float_tr);
+            columnSum(img_float_tr, img_float_tr);
+            transpose(img_float_tr, sum);
+            columnSum(sum, sum);
+
+            bindTexture("cv::gpu::surf::sumTex", (DevMem2Df)sum);
+        }
+
+        ~SURF_GPU_Invoker()
+        {
+            unbindTexture("cv::gpu::surf::sumTex");
+        }
+
+        void detectKeypoints(GpuMat& keypoints)
+        {
+            for(int octave = 0; octave < nOctaves; ++octave)
+            {
+                int step = initialStep * (1 << octave);
+
+                // Compute border required such that the filters don't overstep the image boundaries
+                float d = (initialScale * (1 << octave)) / (nIntervals - 2);
+                float smax = initialScale * (1 << octave) + d * (nIntervals - 2.0f) + 0.5f;
+                int border = static_cast<int>(std::ceil(smax * std::max(std::max(mask_width, mask_height), l3 + l4 * 0.5f)));
+
+                int x_size = (img_cols - 2 * border) / step;
+                int y_size = (img_rows - 2 * border) / step;
+
+                if (x_size <= 0 || y_size <= 0)
+                    break;
+
+                uploadConstant("cv::gpu::surf::c_octave", octave);
+                uploadConstant("cv::gpu::surf::c_x_size", x_size);
+                uploadConstant("cv::gpu::surf::c_y_size", y_size);
+                uploadConstant("cv::gpu::surf::c_border", border);
+                uploadConstant("cv::gpu::surf::c_step",   step);
+
+                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);
+
+                // Reset the candidate count.
+                maxCounter = 0;
+
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size);
+
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));
+
+                fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter,
+                    featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
+
+                featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
+            }
+
+            featuresBuffer.colRange(0, featureCounter).copyTo(keypoints);
+        }
+
+        void findOrientation(GpuMat& keypoints)
+        {
+            if (keypoints.cols > 0)
+                find_orientation_gpu(keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+        }
+
+        void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
+        {
+            if (keypoints.cols > 0)
+            {
+                descriptors.create(keypoints.cols, descriptorSize, CV_32F);
+                compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
+            }
+        }
+
+    private:
+        GpuMat& img_float;
+        GpuMat& img_float_tr;
+
+        GpuMat& sum;
+
+        GpuMat& hessianBuffer;
+        GpuMat& maxPosBuffer;
+        GpuMat& featuresBuffer;
+
+        int img_cols, img_rows;
+
+        float mask_width, mask_height;
+
+        unsigned int featureCounter;
+        unsigned int maxCounter;
+
+        int max_candidates;
+        int max_features;
+    };
+}
+
+int cv::gpu::SURF_GPU::descriptorSize() const
+{
+    return extended ? 128 : 64;
+}
+
+void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
+{
+    Mat keypointsCPU(1, keypoints.size(), CV_32FC(6));
+
+    const KeyPoint* keypoints_ptr = &keypoints[0];
+    KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
+    for (size_t i = 0; i < keypoints.size(); ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+    {
+        const KeyPoint& kp = *keypoints_ptr;
+        KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+
+        gkp.x = kp.pt.x;
+        gkp.y = kp.pt.y;
+
+        gkp.size = kp.size;
+
+        gkp.octave = static_cast<float>(kp.octave);
+        gkp.angle = kp.angle;
+        gkp.response = kp.response;
+    }
+
+    keypointsGPU.upload(keypointsCPU);
+}
+
+void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
+{
+    CV_Assert(keypointsGPU.type() == CV_32FC(6) && keypointsGPU.rows == 1);
+
+    Mat keypointsCPU = keypointsGPU;
+    keypoints.resize(keypointsGPU.cols);
+
+    KeyPoint* keypoints_ptr = &keypoints[0];
+    const KeyPoint_GPU* keypointsCPU_ptr = keypointsCPU.ptr<KeyPoint_GPU>();
+    for (int i = 0; i < keypointsGPU.cols; ++i, ++keypoints_ptr, ++keypointsCPU_ptr)
+    {
+        KeyPoint& kp = *keypoints_ptr;
+        const KeyPoint_GPU& gkp = *keypointsCPU_ptr;
+
+        kp.pt.x = gkp.x;
+        kp.pt.y = gkp.y;
+
+        kp.size = gkp.size;
+
+        kp.octave = static_cast<int>(gkp.octave);
+        kp.angle = gkp.angle;
+        kp.response = gkp.response;
+    }
+}
+
+void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors)
+{
+    CV_Assert(descriptorsGPU.type() == CV_32F);
+
+    descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
+    Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
+    descriptorsGPU.download(descriptorsCPU);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints)
+{
+    SURF_GPU_Invoker surf(*this, img);
+
+    surf.detectKeypoints(keypoints);
+
+    surf.findOrientation(keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, GpuMat& keypoints, GpuMat& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    SURF_GPU_Invoker surf(*this, img);
+
+    if (!useProvidedKeypoints)
+        surf.detectKeypoints(keypoints);
+
+    if (calcOrientation)
+        surf.findOrientation(keypoints);
+
+    surf.computeDescriptors(keypoints, descriptors, descriptorSize());
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints)
+{
+    GpuMat keypointsGPU;
+
+    (*this)(img, keypointsGPU);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, GpuMat& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    GpuMat keypointsGPU;
+
+    if (useProvidedKeypoints)
+        uploadKeypoints(keypoints, keypointsGPU);
+
+    (*this)(img, keypointsGPU, descriptors, useProvidedKeypoints, calcOrientation);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, vector<KeyPoint>& keypoints, vector<float>& descriptors,
+    bool useProvidedKeypoints, bool calcOrientation)
+{
+    GpuMat descriptorsGPU;
+
+    (*this)(img, keypoints, descriptorsGPU, useProvidedKeypoints, calcOrientation);
+
+    downloadDescriptors(descriptorsGPU, descriptors);
+}
+
+#endif /* !defined (HAVE_CUDA) */
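
A minimal usage sketch for the API this patch adds (illustrative only, not part of
the patch): the image path is a placeholder, error handling is omitted, and the
parameter values shown are just the defaults declared in SURFParams_GPU.

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    int main()
    {
        cv::Mat img = cv::imread("scene.png", 0);   // CV_8UC1 input (0..255)
        cv::gpu::GpuMat d_img(img);                 // upload the image to the device

        cv::gpu::SURF_GPU surf;
        surf.threshold = 0.1f;                      // interest operator threshold
        surf.extended = false;                      // 64-element descriptors

        // detect keypoints and compute their descriptors in one call
        std::vector<cv::KeyPoint> keypoints;
        cv::gpu::GpuMat d_descriptors;
        surf(d_img, keypoints, d_descriptors);

        // bring the descriptors back to the host
        std::vector<float> descriptors;
        cv::gpu::SURF_GPU::downloadDescriptors(d_descriptors, descriptors);

        return 0;
    }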