diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu index 87320557bc..03ecb57ec0 100644 --- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu @@ -71,6 +71,9 @@ //============================================================================== +NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive + + //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() //assuming size <= WARP_SIZE and size is power of 2 template @@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) pos += K_WARP_SIZE; s_Data[pos] = idata; - for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) - { - s_Data[pos] += s_Data[pos - offset]; - } + //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + //{ + // s_Data[pos] += s_Data[pos - offset]; + //} + + s_Data[pos] += s_Data[pos - 1]; + s_Data[pos] += s_Data[pos - 2]; + s_Data[pos] += s_Data[pos - 4]; + s_Data[pos] += s_Data[pos - 8]; + s_Data[pos] += s_Data[pos - 16]; return s_Data[pos]; } diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu index 3434f0d32c..f7cdfc6887 100644 --- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu +++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu @@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) //============================================================================== +NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive + + //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() //assuming size <= WARP_SIZE and size is power of 2 template @@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) pos += K_WARP_SIZE; s_Data[pos] = idata; - for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) - { - s_Data[pos] += s_Data[pos - offset]; - } + //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + //{ + // s_Data[pos] += s_Data[pos - offset]; + //} + + s_Data[pos] += s_Data[pos - 1]; + s_Data[pos] += s_Data[pos - 2]; + s_Data[pos] += s_Data[pos - 4]; + s_Data[pos] += s_Data[pos - 8]; + s_Data[pos] += s_Data[pos - 16]; return s_Data[pos]; }