From c6a7432e929a1a2f38489d791722f344bb2ed37d Mon Sep 17 00:00:00 2001 From: Anton Obukhov Date: Tue, 14 Jun 2011 17:34:00 +0000 Subject: [PATCH] [*] Approach to the bug with integral image calculation on SM_2.0 (Fermi) --- .../gpu/src/nvidia/NCVHaarObjectDetection.cu | 17 +++++++++++++---- .../gpu/src/nvidia/NPP_staging/NPP_staging.cu | 17 +++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu index 87320557bc..03ecb57ec0 100644 --- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu @@ -71,6 +71,9 @@ //============================================================================== +NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive + + //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() //assuming size <= WARP_SIZE and size is power of 2 template @@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) pos += K_WARP_SIZE; s_Data[pos] = idata; - for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) - { - s_Data[pos] += s_Data[pos - offset]; - } + //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + //{ + // s_Data[pos] += s_Data[pos - offset]; + //} + + s_Data[pos] += s_Data[pos - 1]; + s_Data[pos] += s_Data[pos - 2]; + s_Data[pos] += s_Data[pos - 4]; + s_Data[pos] += s_Data[pos - 8]; + s_Data[pos] += s_Data[pos - 16]; return s_Data[pos]; } diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu index 3434f0d32c..f7cdfc6887 100644 --- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu +++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu @@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) //============================================================================== +NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive + + //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() //assuming size <= WARP_SIZE and size is power of 2 template @@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) pos += K_WARP_SIZE; s_Data[pos] = idata; - for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) - { - s_Data[pos] += s_Data[pos - offset]; - } + //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + //{ + // s_Data[pos] += s_Data[pos - offset]; + //} + + s_Data[pos] += s_Data[pos - 1]; + s_Data[pos] += s_Data[pos - 2]; + s_Data[pos] += s_Data[pos - 4]; + s_Data[pos] += s_Data[pos - 8]; + s_Data[pos] += s_Data[pos - 16]; return s_Data[pos]; }