From c6a7432e929a1a2f38489d791722f344bb2ed37d Mon Sep 17 00:00:00 2001
From: Anton Obukhov <no@email>
Date: Tue, 14 Jun 2011 17:34:00 +0000
Subject: [PATCH] [*] Approach to the bug with integral image calculation on
 SM_2.0 (Fermi)

---
 .../gpu/src/nvidia/NCVHaarObjectDetection.cu    | 17 +++++++++++++----
 .../gpu/src/nvidia/NPP_staging/NPP_staging.cu   | 17 +++++++++++++----
 2 files changed, 26 insertions(+), 8 deletions(-)
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
index 87320557bc..03ecb57ec0 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -71,6 +71,9 @@
 //==============================================================================
 
 
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
+
+
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
 template <class T>
@@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
     pos += K_WARP_SIZE;
     s_Data[pos] = idata;
 
-    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    {
-        s_Data[pos] += s_Data[pos - offset];
-    }
+    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
+    //{
+    //    s_Data[pos] += s_Data[pos - offset];
+    //}
+
+    s_Data[pos] += s_Data[pos - 1];
+    s_Data[pos] += s_Data[pos - 2];
+    s_Data[pos] += s_Data[pos - 4];
+    s_Data[pos] += s_Data[pos - 8];
+    s_Data[pos] += s_Data[pos - 16];
 
     return s_Data[pos];
 }
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
index 3434f0d32c..f7cdfc6887 100644
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
 //==============================================================================
 
 
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
+
+
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
 template <class T>
@@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
     pos += K_WARP_SIZE;
     s_Data[pos] = idata;
 
-    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    {
-        s_Data[pos] += s_Data[pos - offset];
-    }
+    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
+    //{
+    //    s_Data[pos] += s_Data[pos - offset];
+    //}
+
+    s_Data[pos] += s_Data[pos - 1];
+    s_Data[pos] += s_Data[pos - 2];
+    s_Data[pos] += s_Data[pos - 4];
+    s_Data[pos] += s_Data[pos - 8];
+    s_Data[pos] += s_Data[pos - 16];
 
     return s_Data[pos];
 }