From 7f97fb481cbdd3b2a432332afecb6ae7ca421d8a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 14:14:48 +0400
Subject: [PATCH] FastNonLocalMeans

---
 modules/gpu/src/cuda/nlm.cu | 116 ++++++++++++++++++++++++++++--------
 1 file changed, 90 insertions(+), 26 deletions(-)

diff --git a/modules/gpu/src/cuda/nlm.cu b/modules/gpu/src/cuda/nlm.cu
index e267c733e0..cd3f0b5c3a 100644
--- a/modules/gpu/src/cuda/nlm.cu
+++ b/modules/gpu/src/cuda/nlm.cu
@@ -43,11 +43,11 @@
 
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
-
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/block.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
 using namespace cv::gpu;
@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
+
+        template <int cn> struct Unroll;
+        template <> struct Unroll<1>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
+            {
+                return thrust::tie(val1, val2);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op);
+            }
+        };
+        template <> struct Unroll<2>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op);
+            }
+        };
+        template <> struct Unroll<3>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op);
+            }
+        };
+        template <> struct Unroll<4>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op, op);
+            }
+        };
+
         __device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
         __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
         __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
                     sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
                 }
 
-                volatile __shared__ float cta_buffer[CTA_SIZE];
+                __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
 
-                int tid = threadIdx.x;
+                reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
+                                 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
+                                 threadIdx.x,
+                                 Unroll<VecTraits<T>::cn>::op());
 
-                cta_buffer[tid] = weights_sum;
-                __syncthreads();
-                Block::reduce<CTA_SIZE>(cta_buffer, plus<float>());
-                weights_sum = cta_buffer[0];
-
-                __syncthreads();
-
-
-                for(int n = 0; n < VecTraits<T>::cn; ++n)
-                {
-                    cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
-                    __syncthreads();
-                    Block::reduce<CTA_SIZE>(cta_buffer, plus<float>());
-                    reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
-
-                    __syncthreads();
-                }
-
-                if (tid == 0)
-                    dst = saturate_cast<T>(sum/weights_sum);
+                if (threadIdx.x == 0)
+                    dst = saturate_cast<T>(sum / weights_sum);
             }
 
             __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
@@ -503,4 +567,4 @@ namespace cv { namespace gpu { namespace device
 
 }}}
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
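
The core of the patch is the switch from one Block::reduce pass per channel (each with its own staging stores and barriers) to a single fused reduction: Unroll<cn> packs the scalar weights_sum and the cn channels of sum into thrust tuples, so the generic reduce<CTA_SIZE> folds all cn + 1 values through one shared-memory tree. The standalone kernel below is a minimal sketch of that idea for a 3-channel pixel plus a weight; CTA_SIZE, fusedBlockReduce, weightedSum, and the float4 input layout are illustrative assumptions for this sketch, not OpenCV API.

// Sketch only: fuses four block-wide sum reductions into one tree, the way
// the patched kernel fuses weights_sum with the channels of sum.
#include <cuda_runtime.h>

#define CTA_SIZE 256

__device__ void fusedBlockReduce(volatile float* s0, volatile float* s1,
                                 volatile float* s2, volatile float* s3,
                                 float& v0, float& v1, float& v2, float& v3,
                                 unsigned int tid)
{
    // Stage all four partial values, then run a single reduction tree over
    // them together: one __syncthreads() per step covers every channel,
    // instead of a full tree (and full set of barriers) per channel.
    s0[tid] = v0; s1[tid] = v1; s2[tid] = v2; s3[tid] = v3;
    __syncthreads();

    for (unsigned int s = CTA_SIZE / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            s0[tid] += s0[tid + s];
            s1[tid] += s1[tid + s];
            s2[tid] += s2[tid + s];
            s3[tid] += s3[tid + s];
        }
        __syncthreads();
    }
    v0 = s0[0]; v1 = s1[0]; v2 = s2[0]; v3 = s3[0];
}

// Launch as weightedSum<<<1, CTA_SIZE>>>(d_in, n, d_out): each input holds a
// color in .x/.y/.z and its weight in .w; the output is the weighted mean.
__global__ void weightedSum(const float4* in, int n, float4* out)
{
    // Same shared-memory layout as the patch: one CTA_SIZE-sized slice per
    // reduced value (the weight sum plus each color channel).
    __shared__ float smem[CTA_SIZE * 4];

    float w = 0.f, x = 0.f, y = 0.f, z = 0.f;
    for (int i = threadIdx.x; i < n; i += CTA_SIZE)
    {
        w += in[i].w;               // accumulated weight
        x += in[i].w * in[i].x;     // weighted color channels
        y += in[i].w * in[i].y;
        z += in[i].w * in[i].z;
    }

    fusedBlockReduce(smem, smem + CTA_SIZE, smem + 2 * CTA_SIZE,
                     smem + 3 * CTA_SIZE, w, x, y, z, threadIdx.x);

    if (threadIdx.x == 0)
        *out = make_float4(x / w, y / w, z / w, w);
}

Compared with the removed per-channel loop, the fused tree performs its log2(CTA_SIZE) steps once rather than cn + 1 times, so the kernel pays for one set of barriers regardless of channel count; in the patch itself, the Unroll<cn> trait resolves the tuple arity at compile time so the same reduce call serves uchar, uchar2, uchar3, and uchar4 pixels.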