From cdd9234fae9c55b3754dc7425b82cf0ff5fb7c25 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Mon, 15 Jul 2013 15:13:09 +0800
Subject: [PATCH 1/3] Fix HOG mismatch on CPU OpenCL devices

---
 modules/ocl/src/hog.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 412afee8b8..2e2b3a9928 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -254,7 +254,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
 
     effect_size = Size(0, 0);
 
-	if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+    if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
         hog_device_cpu = true;
     else
         hog_device_cpu = false;
@@ -1758,8 +1758,20 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( smem, (void *)NULL));
 
-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
-        localThreads, args, -1, -1);
+
+    if(hog_device_cpu)
+    {
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+            localThreads, args, -1, -1, "-D CPU");
+    }else
+    {
+        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
+        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        char opt[32] = {0};
+        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
+        openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
+            localThreads, args, -1, -1, opt);
+    }
 }
 
 void cv::ocl::device::hog::normalize_hists(int nbins, 

From f1060ac5147b550cef256a1ef533390949f31379 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Wed, 17 Jul 2013 14:35:44 +0800
Subject: [PATCH 2/3] Simplify the pyrlk kernel a little

---
 modules/ocl/perf/perf_opticalflow.cpp |   4 +-
 modules/ocl/src/opencl/pyrlk.cl       | 943 +++++++++++---------------
 2 files changed, 413 insertions(+), 534 deletions(-)

diff --git a/modules/ocl/perf/perf_opticalflow.cpp b/modules/ocl/perf/perf_opticalflow.cpp
index 936d7a77fc..10763b5b0f 100644
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@@ -48,8 +48,8 @@
 ///////////// PyrLKOpticalFlow ////////////////////////
 PERFTEST(PyrLKOpticalFlow)
 {
-    std::string images1[] = {"rubberwhale1.png", "basketball1.png"};
-    std::string images2[] = {"rubberwhale2.png", "basketball2.png"};
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
 
     for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
     {
diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl
index 40a1993952..02cf3afa44 100644
--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/ocl/src/opencl/pyrlk.cl
@@ -17,6 +17,7 @@
 // @Authors
 //    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang, bitwangyaoyao@gmail.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -47,6 +48,7 @@
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 
 #define	BUFFER	64
+#define	BUFFER2	(BUFFER>>1)	// half of BUFFER; parenthesized so the shift cannot re-associate when the macro is used inside a larger expression
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
@@ -58,53 +60,16 @@ void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local
     smem3[tid] = val3;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
     {
-        smem1[tid] += smem1[tid + 32];
-        smem2[tid] += smem2[tid + 32];
-        smem3[tid] += smem3[tid + 32];
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+            smem3[tid] += smem3[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-        smem2[tid] += smem2[tid + 16];
-        smem3[tid] += smem3[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 8)
-    {
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-        smem3[tid] += smem3[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 4)
-    {
-        smem1[tid] += smem1[tid + 4];
-        smem2[tid] += smem2[tid + 4];
-        smem3[tid] += smem3[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 2)
-    {
-        smem1[tid] += smem1[tid + 2];
-        smem2[tid] += smem2[tid + 2];
-        smem3[tid] += smem3[tid + 2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
-        smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
@@ -113,47 +78,15 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
     smem2[tid] = val2;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
     {
-        smem1[tid] += smem1[tid + 32];
-        smem2[tid] += smem2[tid + 32];
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+            smem2[tid] += smem2[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-        smem2[tid] += smem2[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 8)
-    {
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 4)
-    {
-        smem1[tid] += smem1[tid + 4];
-        smem2[tid] += smem2[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 2)
-    {
-        smem1[tid] += smem1[tid + 2];
-        smem2[tid] += smem2[tid + 2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 void reduce1(float val1, volatile __local float* smem1, int tid)
@@ -161,45 +94,18 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (tid < 32)
+    for(int i = BUFFER2; i > 0; i >>= 1)
     {
-        smem1[tid] += smem1[tid + 32];
+        if(tid < i)
+        {
+            smem1[tid] += smem1[tid + i];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 16)
-    {
-        smem1[tid] += smem1[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 8)
-    {
-        smem1[tid] += smem1[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 4)
-    {
-        smem1[tid] += smem1[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 2)
-    {
-        smem1[tid] += smem1[tid + 2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid < 1)
-    {
-        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
-void reduce3(float val1, float val2, float val3, 
-__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
+void reduce3(float val1, float val2, float val3,
+             __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
@@ -212,15 +118,19 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f
         smem2[tid] += smem2[tid + 32];
         smem3[tid] += smem3[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 16) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
         smem1[tid] += smem1[tid + 16];
         smem2[tid] += smem2[tid + 16];
         smem3[tid] += smem3[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 8) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
         smem1[tid] += smem1[tid + 8];
         smem2[tid] += smem2[tid + 8];
@@ -238,6 +148,7 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f
         smem2[tid] += smem2[tid + 1];
         smem3[tid] += smem3[tid + 1];
     }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
@@ -251,14 +162,18 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
         smem1[tid] += smem1[tid + 32];
         smem2[tid] += smem2[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 16) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
         smem1[tid] += smem1[tid + 16];
         smem2[tid] += smem2[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 8) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
         smem1[tid] += smem1[tid + 8];
         smem2[tid] += smem2[tid + 8];
@@ -272,6 +187,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
         smem1[tid] += smem1[tid + 1];
         smem2[tid] += smem2[tid + 1];
     }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 void reduce1(float val1, __local volatile float* smem1, int tid)
@@ -283,19 +199,24 @@ void reduce1(float val1, __local volatile float* smem1, int tid)
     {
         smem1[tid] += smem1[tid + 32];
 #if WAVE_SIZE < 32
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 16) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
 #endif
         smem1[tid] += smem1[tid + 16];
 #if WAVE_SIZE <16
-	} barrier(CLK_LOCAL_MEM_FENCE);
-	if (tid < 8) {
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
 #endif
         smem1[tid] += smem1[tid + 8];
         smem1[tid] += smem1[tid + 4];
         smem1[tid] += smem1[tid + 2];
         smem1[tid] += smem1[tid + 1];
     }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #endif
 
@@ -306,106 +227,100 @@ void reduce1(float val1, __local volatile float* smem1, int tid)
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
 
 void SetPatch(image2d_t I, float x, float y,
-                                float* Pch, float* Dx, float* Dy,
-                                float* A11, float* A12, float* A22)
+              float* Pch, float* Dx, float* Dy,
+              float* A11, float* A12, float* A22)
 {
-            *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
+    *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
 
-            float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
-                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+    float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
 
-            float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
-                            (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+    float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
 
 
-            *Dx = dIdx;
-            *Dy = dIdy;
+    *Dx = dIdx;
+    *Dy = dIdy;
 
-            *A11 += dIdx * dIdx;
-            *A12 += dIdx * dIdy;
-            *A22 += dIdy * dIdy;
+    *A11 += dIdx * dIdx;
+    *A12 += dIdx * dIdy;
+    *A22 += dIdy * dIdy;
 }
 
 void GetPatch(image2d_t J, float x, float y,
-                                float* Pch, float* Dx, float* Dy,
-                                float* b1, float* b2)
+              float* Pch, float* Dx, float* Dy,
+              float* b1, float* b2)
 {
-                float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
-                float diff = (J_val - *Pch) * 32.0f;
-                *b1 += diff**Dx;
-                *b2 += diff**Dy;
+    float J_val = read_imagef(J, sampler, (float2)(x, y)).x;
+    float diff = (J_val - *Pch) * 32.0f;
+    *b1 += diff**Dx;
+    *b2 += diff**Dy;
 }
 
 void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
 {
-        float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
-        *errval += fabs(diff);
+    float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
+    *errval += fabs(diff);
 }
 
 void SetPatch4(image2d_t I, const float x, const float y,
-                                float4* Pch, float4* Dx, float4* Dy,
-                                float* A11, float* A12, float* A22)
+               float4* Pch, float4* Dx, float4* Dy,
+               float* A11, float* A12, float* A22)
 {
-            *Pch = read_imagef(I, sampler, (float2)(x, y));
+    *Pch = read_imagef(I, sampler, (float2)(x, y));
 
-            float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
-                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
+    float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
 
-            float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
-                            (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
+    float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
+                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
 
 
-            *Dx = dIdx;
-            *Dy = dIdy;
-                        float4 sqIdx = dIdx * dIdx;
-                        *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
-                        sqIdx = dIdx * dIdy;
-                        *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
-                        sqIdx = dIdy * dIdy;
-                        *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
+    *Dx = dIdx;
+    *Dy = dIdy;
+    float4 sqIdx = dIdx * dIdx;
+    *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdx * dIdy;
+    *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
+    sqIdx = dIdy * dIdy;
+    *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
 }
 
 void GetPatch4(image2d_t J, const float x, const float y,
-                                const float4* Pch, const float4* Dx, const float4* Dy,
-                                float* b1, float* b2)
+               const float4* Pch, const float4* Dx, const float4* Dy,
+               float* b1, float* b2)
 {
-                float4 J_val = read_imagef(J, sampler, (float2)(x, y));
-                float4 diff = (J_val - *Pch) * 32.0f;
-                                float4 xdiff = diff* *Dx;
-                                *b1 += xdiff.x + xdiff.y + xdiff.z;
-                                xdiff = diff* *Dy;
-                                *b2 += xdiff.x + xdiff.y + xdiff.z;
+    float4 J_val = read_imagef(J, sampler, (float2)(x, y));
+    float4 diff = (J_val - *Pch) * 32.0f;
+    float4 xdiff = diff* *Dx;
+    *b1 += xdiff.x + xdiff.y + xdiff.z;
+    xdiff = diff* *Dy;
+    *b2 += xdiff.x + xdiff.y + xdiff.z;
 }
 
 void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
 {
-        float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
-        *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
+    float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
+    *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
 }
 
 #define	GRIDSIZE	3
 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
-    __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
-        const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
+                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
-#ifdef CPU
-    __local float smem1[BUFFER+1];
-    __local float smem2[BUFFER+1];
-    __local float smem3[BUFFER+1];
-#else
     __local float smem1[BUFFER];
     __local float smem2[BUFFER];
     __local float smem3[BUFFER];
-#endif
 
-        unsigned int xid=get_local_id(0);
-        unsigned int yid=get_local_id(1);
-        unsigned int gid=get_group_id(0);
-        unsigned int xsize=get_local_size(0);
-        unsigned int ysize=get_local_size(1);
-        int xBase, yBase, i, j, k;
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    unsigned int gid=get_group_id(0);
+    unsigned int xsize=get_local_size(0);
+    unsigned int ysize=get_local_size(1);
+    int xBase, yBase, i, j, k;
 
-        float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
+    float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
 
     const int tid = mad24(yid, xsize, xid);
 
@@ -432,77 +347,71 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
     float dIdx_patch[GRIDSIZE][GRIDSIZE];
     float dIdy_patch[GRIDSIZE][GRIDSIZE];
 
-        yBase=yid;
-        {
-                xBase=xid;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
-                                        &A11, &A12, &A22);
+    yBase=yid;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                 &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                 &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
-                                        &A11, &A12, &A22);
-        }
-        yBase+=ysize;
-        {
-                xBase=xid;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                 &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                 &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
-                                        &A11, &A12, &A22);
-        }
-        yBase+=ysize;
-        if(yBase<c_winSize_y)
-        {
-                xBase=xid;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                     &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    if(yBase<c_winSize_y)
+    {
+        xBase=xid;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                 &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                 &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                        &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
-                                        &A11, &A12, &A22);
-        }
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                     &A11, &A12, &A22);
+    }
 
     reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
-    barrier(CLK_LOCAL_MEM_FENCE);
 
-#ifdef CPU
-    A11 = smem1[BUFFER];
-    A12 = smem2[BUFFER];
-    A22 = smem3[BUFFER];
-#else
     A11 = smem1[0];
     A12 = smem2[0];
     A22 = smem3[0];
-#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     float D = A11 * A22 - A12 * A12;
 
@@ -532,81 +441,76 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
         float b1 = 0;
         float b2 = 0;
 
-                yBase=yid;
-                {
-                        xBase=xid;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
-                                                &b1, &b2);
+        yBase=yid;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
+                     &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
+                     &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
-                                                &b1, &b2);
-                }
-                yBase+=ysize;
-                {
-                        xBase=xid;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
-                                                &b1, &b2);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
+                     &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
+                     &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
-                                                &b1, &b2);
-                }
-                yBase+=ysize;
-                if(yBase<c_winSize_y)
-                {
-                        xBase=xid;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
-                                                &b1, &b2);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                         &b1, &b2);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
+                     &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
+                     &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
-                                                &b1, &b2);
-                }
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                         &b1, &b2);
+        }
 
         reduce2(b1, b2, smem1, smem2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
 
-#ifdef CPU
-        b1 = smem1[BUFFER];
-        b2 = smem2[BUFFER];
-#else
         b1 = smem1[0];
         b2 = smem2[0];
-#endif
+        barrier(CLK_LOCAL_MEM_FENCE);
 
         float2 delta;
         delta.x = A12 * b2 - A22 * b1;
         delta.y = A12 * b1 - A11 * b2;
 
-                prevPt += delta;
+        prevPt += delta;
 
         if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
             break;
@@ -615,97 +519,87 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
     D = 0.0f;
     if (calcErr)
     {
-                yBase=yid;
-                {
-                        xBase=xid;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][0], &D);
+        yBase=yid;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][0], &D);
 
 
-                        xBase+=xsize;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][1], &D);
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[0][1], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[0][2], &D);
-                }
-                yBase+=ysize;
-                {
-                        xBase=xid;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][0], &D);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[0][2], &D);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][0], &D);
 
 
-                        xBase+=xsize;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][1], &D);
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[1][1], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[1][2], &D);
-                }
-                yBase+=ysize;
-                if(yBase<c_winSize_y)
-                {
-                        xBase=xid;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][0], &D);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[1][2], &D);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][0], &D);
 
 
-                        xBase+=xsize;
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][1], &D);
+            xBase+=xsize;
+            GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                     &I_patch[2][1], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                                                &I_patch[2][2], &D);
-                }
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                         &I_patch[2][2], &D);
+        }
 
         reduce1(D, smem1, tid);
     }
 
     if (tid == 0)
     {
-                prevPt += c_halfWin;
+        prevPt += c_halfWin;
 
         nextPts[gid] = prevPt;
 
         if (calcErr)
-#ifdef CPU
-            err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y);
-#else
             err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
-#endif
     }
 }
 
 
 __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
-    __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
-        const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
+                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
-#ifdef CPU
-     __local float smem1[BUFFER+1];
-     __local float smem2[BUFFER+1];
-     __local float smem3[BUFFER+1];
-#else
-     __local float smem1[BUFFER];
-     __local float smem2[BUFFER];
-     __local float smem3[BUFFER];
-#endif
+    __local float smem1[BUFFER];
+    __local float smem2[BUFFER];
+    __local float smem3[BUFFER];
 
-        unsigned int xid=get_local_id(0);
-        unsigned int yid=get_local_id(1);
-        unsigned int gid=get_group_id(0);
-        unsigned int xsize=get_local_size(0);
-        unsigned int ysize=get_local_size(1);
-        int xBase, yBase, i, j, k;
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    unsigned int gid=get_group_id(0);
+    unsigned int xsize=get_local_size(0);
+    unsigned int ysize=get_local_size(1);
+    int xBase, yBase, i, j, k;
 
-        float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
+    float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
 
     const int tid = mad24(yid, xsize, xid);
 
@@ -721,7 +615,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
         return;
     }
 
-        nextPt -= c_halfWin;
+    nextPt -= c_halfWin;
 
     // extract the patch from the first image, compute covariation matrix of derivatives
 
@@ -732,80 +626,74 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
     float4 I_patch[8];
     float4 dIdx_patch[8];
     float4 dIdy_patch[8];
-        float4 I_add,Dx_add,Dy_add;
+    float4 I_add,Dx_add,Dy_add;
 
-        yBase=yid;
-        {
-                xBase=xid;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
-                                        &A11, &A12, &A22);
+    yBase=yid;
+    {
+        xBase=xid;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
+                  &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
+                  &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
+                      &A11, &A12, &A22);
 
-        }
-        yBase+=ysize;
-        {
-                xBase=xid;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
-                                        &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    {
+        xBase=xid;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
+                  &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
+                  &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
-                                        &A11, &A12, &A22);
-        }
-        yBase+=ysize;
-        if(yBase<c_winSize_y)
-        {
-                xBase=xid;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
+                      &A11, &A12, &A22);
+    }
+    yBase+=ysize;
+    if(yBase<c_winSize_y)
+    {
+        xBase=xid;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
+                  &A11, &A12, &A22);
 
 
-                xBase+=xsize;
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
-                                        &A11, &A12, &A22);
+        xBase+=xsize;
+        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                  &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
+                  &A11, &A12, &A22);
 
-                xBase+=xsize;
-                if(xBase<c_winSize_x)
-                SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                        &I_add, &Dx_add, &Dy_add,
-                                        &A11, &A12, &A22);
-        }
+        xBase+=xsize;
+        if(xBase<c_winSize_x)
+            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_add, &Dx_add, &Dy_add,
+                      &A11, &A12, &A22);
+    }
 
     reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
-    barrier(CLK_LOCAL_MEM_FENCE);
 
-#ifdef CPU
-    A11 = smem1[BUFFER];
-    A12 = smem2[BUFFER];
-    A22 = smem3[BUFFER];
-#else
     A11 = smem1[0];
     A12 = smem2[0];
     A22 = smem3[0];
-#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     float D = A11 * A22 - A12 * A12;
 
@@ -821,7 +709,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
     A12 /= D;
     A22 /= D;
 
-        nextPt = nextPts[gid] * 2.0f - c_halfWin;
+    nextPt = nextPts[gid] * 2.0f - c_halfWin;
 
     for (k = 0; k < c_iters; ++k)
     {
@@ -835,81 +723,76 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
         float b1 = 0;
         float b2 = 0;
 
-                yBase=yid;
-                {
-                        xBase=xid;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
-                                                &b1, &b2);
+        yBase=yid;
+        {
+            xBase=xid;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
+                      &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
+                      &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
-                                                &b1, &b2);
-                }
-                yBase+=ysize;
-                {
-                        xBase=xid;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
-                                                &b1, &b2);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
+                          &b1, &b2);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
+                      &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
+                      &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
-                                                &b1, &b2);
-                }
-                yBase+=ysize;
-                if(yBase<c_winSize_y)
-                {
-                        xBase=xid;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
-                                                &b1, &b2);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
+                          &b1, &b2);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
+                      &b1, &b2);
 
 
-                        xBase+=xsize;
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
-                                                &b1, &b2);
+            xBase+=xsize;
+            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
+                      &b1, &b2);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_add, &Dx_add, &Dy_add,
-                                                &b1, &b2);
-                }
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_add, &Dx_add, &Dy_add,
+                          &b1, &b2);
+        }
 
         reduce2(b1, b2, smem1, smem2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
 
-#ifdef CPU
-        b1 = smem1[BUFFER];
-        b2 = smem2[BUFFER];
-#else
         b1 = smem1[0];
         b2 = smem2[0];
-#endif
+        barrier(CLK_LOCAL_MEM_FENCE);
 
         float2 delta;
         delta.x = A12 * b2 - A22 * b1;
         delta.y = A12 * b1 - A11 * b2;
 
-                nextPt +=delta;
+        nextPt +=delta;
 
         if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
             break;
@@ -918,78 +801,74 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
     D = 0.0f;
     if (calcErr)
     {
-                yBase=yid;
-                {
-                        xBase=xid;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[0], &D);
+        yBase=yid;
+        {
+            xBase=xid;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[0], &D);
 
 
-                        xBase+=xsize;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[1], &D);
+            xBase+=xsize;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[1], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[2], &D);
-                }
-                yBase+=ysize;
-                {
-                        xBase=xid;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[3], &D);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_patch[2], &D);
+        }
+        yBase+=ysize;
+        {
+            xBase=xid;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[3], &D);
 
 
-                        xBase+=xsize;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[4], &D);
+            xBase+=xsize;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[4], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[5], &D);
-                }
-                yBase+=ysize;
-                if(yBase<c_winSize_y)
-                {
-                        xBase=xid;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[6], &D);
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_patch[5], &D);
+        }
+        yBase+=ysize;
+        if(yBase<c_winSize_y)
+        {
+            xBase=xid;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[6], &D);
 
 
-                        xBase+=xsize;
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_patch[7], &D);
+            xBase+=xsize;
+            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                      &I_patch[7], &D);
 
-                        xBase+=xsize;
-                        if(xBase<c_winSize_x)
-                        GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                                                &I_add, &D);
-                }
+            xBase+=xsize;
+            if(xBase<c_winSize_x)
+                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
+                          &I_add, &D);
+        }
 
         reduce1(D, smem1, tid);
     }
 
     if (tid == 0)
     {
-                nextPt += c_halfWin;
+        nextPt += c_halfWin;
         nextPts[gid] = nextPt;
 
         if (calcErr)
-#ifdef CPU
-            err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y);
-#else
             err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
-#endif
     }
 }
 
 __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
-    const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+                            const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
-        int c_halfWin_x = (c_winSize_x - 1) / 2;
-        int c_halfWin_y = (c_winSize_y - 1) / 2;
+    int c_halfWin_x = (c_winSize_x - 1) / 2;
+    int c_halfWin_y = (c_winSize_y - 1) / 2;
 
     const int patchWidth  = get_local_size(0) + 2 * c_halfWin_x;
     const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
@@ -1003,7 +882,7 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
     const int xBase = get_group_id(0) * get_local_size(0);
     const int yBase = get_group_id(1) * get_local_size(1);
 
-        sampler_t sampleri    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+    sampler_t sampleri    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
 
     for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
     {
@@ -1017,10 +896,10 @@ __kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uSt
             // Sharr Deriv
 
             dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
-                                            (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
+                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
 
             dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
-                                            (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
+                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
         }
     }
     barrier(CLK_LOCAL_MEM_FENCE);

From 270b2c7918466a9470748cf5002507ca723298fc Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Fri, 26 Jul 2013 11:17:27 +0800
Subject: [PATCH 3/3] generate the HOG lookup table at runtime instead of hard-coding it

---
 modules/ocl/src/hog.cpp                 | 126 ++++--------------------
 modules/ocl/src/opencl/objdetect_hog.cl |  95 +-----------------
 2 files changed, 18 insertions(+), 203 deletions(-)

diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index 2e2b3a9928..c7ac4098f5 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -56,98 +56,6 @@ using namespace std;
 
 static oclMat gauss_w_lut;
 static bool hog_device_cpu;
-/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
-static const float gaussian_interp_lut[] = 
-{
-    /* gaussian lut */
-    0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, 
-    0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, 
-    0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, 
-    0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, 
-    0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, 
-    0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
-    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
-    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
-    0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 
-    0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 
-    0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, 
-    0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, 
-    0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, 
-    0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
-    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
-    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
-    0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 
-    0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 
-    0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, 
-    0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, 
-    0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, 
-    0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, 
-    0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, 
-    0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, 
-    0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 
-    0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 
-    0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, 
-    0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, 
-    0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, 
-    0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, 
-    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, 
-    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, 
-    0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 
-    0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 
-    0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, 
-    0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, 
-    0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, 
-    0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, 
-    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, 
-    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, 
-    0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 
-    0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 
-    0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, 
-    /* interp_weight lut */
-    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
-    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
-    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, 
-    0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 
-    0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 
-    0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, 
-    0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, 
-    0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, 
-    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
-    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
-    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, 
-    0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 
-    0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 
-    0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, 
-    0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, 
-    0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, 
-    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
-    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
-    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, 
-    0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 
-    0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 
-    0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, 
-    0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, 
-    0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, 
-    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, 
-    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, 
-    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, 
-    0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 
-    0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 
-    0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, 
-    0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, 
-    0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, 
-    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, 
-    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, 
-    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, 
-    0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 
-    0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 
-    0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, 
-    0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, 
-    0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, 
-    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, 
-    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, 
-    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
-};
 
 namespace cv
 {
@@ -180,7 +88,7 @@ namespace cv
                                       int nblocks_win_x, int nblocks_win_y);
 
                 void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, float sigma, const cv::ocl::oclMat &grad,
+                                   int height, int width, const cv::ocl::oclMat &grad,
                                    const cv::ocl::oclMat &qangle, 
                                    const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
 
@@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
     Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
     labels.create(1, wins_per_img.area(), CV_8U);
 
-    vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut + 
-        sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
-    Mat m_lut(v_lut);
-    gauss_w_lut.upload(m_lut.reshape(1,1));
+    float sigma = getWinSigma();
+    float scale = 1.f / (2.f * sigma * sigma);
+    Mat gaussian_lut(1, 512, CV_32FC1);
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+
+    gauss_w_lut.upload(gaussian_lut);
 }
 
 void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
     computeGradient(img, this->grad, this->qangle);
 
     hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
-        effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists);
 
     hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, 
         effect_size.width, block_hists, (float)threshold_L2hys);
@@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
 
 void cv::ocl::device::hog::compute_hists(int nbins, 
                                          int block_stride_x, int block_stride_y,
-                                         int height, int width, float sigma, 
+                                         int height, int width, 
                                          const cv::ocl::oclMat &grad, 
                                          const cv::ocl::oclMat &qangle, 
                                          const cv::ocl::oclMat &gauss_w_lut, 
@@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
 {
     Context *clCxt = Context::getContext();
     vector< pair<size_t, const void *> > args;
-    string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : 
-        "compute_hists_kernel";
+    string kernelName = "compute_hists_lut_kernel";
 
     int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) 
         / block_stride_x;
@@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     int grad_quadstep = grad.step >> 2;
     int qangle_step = qangle.step;
 
-    // Precompute gaussian spatial window parameter
-    float scale = 1.f / (2.f * sigma * sigma);
-
     int blocks_in_group = 4;
     size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
     size_t globalThreads[3] = { 
@@ -1751,14 +1663,10 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    if (kernelName.compare("compute_hists_lut_kernel") == 0)
-        args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
-    else
-        args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( make_pair( smem, (void *)NULL));
 
-
     if(hog_device_cpu)
     {
         openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, 
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl
index 8ca12704e5..0363227606 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -53,7 +53,7 @@
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
-// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+// Use pre-computed gaussian and interp_weight lookup tables
 __kernel void compute_hists_lut_kernel(
     const int cblock_stride_x, const int cblock_stride_y,
     const int cnbins, const int cblock_hist_size, const int img_block_width, 
@@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel(
     }
 }
 
-//----------------------------------------------------------------------------
-// Histogram computation
-// 12 threads for a cell, 12x4 threads per block
-__kernel void compute_hists_kernel(
-    const int cblock_stride_x, const int cblock_stride_y,
-    const int cnbins, const int cblock_hist_size, const int img_block_width, 
-    const int blocks_in_group, const int blocks_total,
-    const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
-    const float scale, __global float* block_hists, __local float* smem)
-{
-    const int lx = get_local_id(0);
-    const int lp = lx / 24; /* local group id */
-    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
-    const int gidY = gid / img_block_width;
-    const int gidX = gid - gidY * img_block_width;
-
-    const int lidX = lx - lp * 24;
-    const int lidY = get_local_id(1);
-
-    const int cell_x = lidX / 12;
-    const int cell_y = lidY;
-    const int cell_thread_x = lidX - cell_x * 12;
-
-    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
-        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
-    __local float* final_hist = hists + cnbins * 
-        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
-
-    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
-    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
-
-    __global const float* grad_ptr = (gid < blocks_total) ? 
-        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
-        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
-
-    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
-        cell_thread_x;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
-        hist[bin_id * 48] = 0.f;
-
-    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
-    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
-
-    const int dist_y_begin = -4 - 4 * lidY;
-    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
-    {
-        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
-
-        grad_ptr += grad_quadstep;
-        qangle_ptr += qangle_step;
-
-        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
-
-        float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * 
-            dist_center_x) * scale);
-        float interp_weight = (8.f - fabs(dist_y + 0.5f)) * 
-            (8.f - fabs(dist_x + 0.5f)) / 64.f;
-
-        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
-        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    volatile __local float* hist_ = hist;
-    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
-    {
-        if (cell_thread_x < 6)
-            hist_[0] += hist_[6];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (cell_thread_x < 3)
-            hist_[0] += hist_[3];
-#ifdef CPU
-        barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-        if (cell_thread_x == 0)
-            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
-                hist_[0] + hist_[1] + hist_[2];
-    }
-#ifdef CPU
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
-    if ((tid < cblock_hist_size) && (gid < blocks_total))
-    {
-        __global float* block_hist = block_hists + 
-            (gidY * img_block_width + gidX) * cblock_hist_size;
-        block_hist[tid] = final_hist[tid];
-    }
-}
-
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //  optimized for the case of 9 bins