From cdd9234fae9c55b3754dc7425b82cf0ff5fb7c25 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 15 Jul 2013 15:13:09 +0800 Subject: [PATCH 1/3] fix hog mismatch on cpu ocl --- modules/ocl/src/hog.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 412afee8b8..2e2b3a9928 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -254,7 +254,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo effect_size = Size(0, 0); - if (queryDeviceInfo()) + if (queryDeviceInfo()) hog_device_cpu = true; else hog_device_cpu = false; @@ -1758,8 +1758,20 @@ void cv::ocl::device::hog::compute_hists(int nbins, args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, - localThreads, args, -1, -1); + + if(hog_device_cpu) + { + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1, "-D CPU"); + }else + { + cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); + int wave_size = queryDeviceInfo(kernel); + char opt[32] = {0}; + sprintf(opt, "-D WAVE_SIZE=%d", wave_size); + openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, + localThreads, args, -1, -1, opt); + } } void cv::ocl::device::hog::normalize_hists(int nbins, From f1060ac5147b550cef256a1ef533390949f31379 Mon Sep 17 00:00:00 2001 From: yao Date: Wed, 17 Jul 2013 14:35:44 +0800 Subject: [PATCH 2/3] a little simplify to pyrlk kernel --- modules/ocl/perf/perf_opticalflow.cpp | 4 +- modules/ocl/src/opencl/pyrlk.cl | 943 +++++++++++--------------- 2 files changed, 413 insertions(+), 534 deletions(-) diff --git a/modules/ocl/perf/perf_opticalflow.cpp b/modules/ocl/perf/perf_opticalflow.cpp index 936d7a77fc..10763b5b0f 100644 --- a/modules/ocl/perf/perf_opticalflow.cpp +++ b/modules/ocl/perf/perf_opticalflow.cpp @@ -48,8 +48,8 @@ ///////////// PyrLKOpticalFlow //////////////////////// PERFTEST(PyrLKOpticalFlow) { - std::string images1[] = {"rubberwhale1.png", "basketball1.png"}; - std::string images2[] = {"rubberwhale2.png", "basketball2.png"}; + std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"}; + std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"}; for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++) { diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index 40a1993952..02cf3afa44 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -17,6 +17,7 @@ // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com // Yao Wang, bitwangyaoyao@gmail.com +// Xiaopeng Fu, fuxiaopeng2222@163.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -47,6 +48,7 @@ //#pragma OPENCL EXTENSION cl_amd_printf : enable #define BUFFER 64 +#define BUFFER2 BUFFER>>1 #ifndef WAVE_SIZE #define WAVE_SIZE 1 #endif @@ -58,53 +60,16 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local smem3[tid] = val3; barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) + for(int i = BUFFER2; i > 0; i >>= 1) { - smem1[tid] += smem1[tid + 32]; - smem2[tid] += smem2[tid + 32]; - smem3[tid] += smem3[tid + 32]; + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + smem2[tid] += smem2[tid + i]; + smem3[tid] += smem3[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) - { - smem1[tid] += smem1[tid + 16]; - smem2[tid] += smem2[tid + 16]; - smem3[tid] += smem3[tid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 8) - { - smem1[tid] += smem1[tid + 8]; - smem2[tid] += smem2[tid + 8]; - smem3[tid] += smem3[tid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 4) - { - smem1[tid] += smem1[tid + 4]; - smem2[tid] += smem2[tid + 4]; - smem3[tid] += smem3[tid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 2) - { - smem1[tid] += smem1[tid + 2]; - smem2[tid] += smem2[tid + 2]; - smem3[tid] += smem3[tid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 1) - { - smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; - smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; - smem3[BUFFER] = smem3[tid] + smem3[tid + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); } void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) @@ -113,47 +78,15 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l smem2[tid] = val2; barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) + for(int i = BUFFER2; i > 0; i >>= 1) { - smem1[tid] += smem1[tid + 32]; - smem2[tid] += smem2[tid + 32]; + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + smem2[tid] += smem2[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) - { - smem1[tid] += smem1[tid + 16]; - smem2[tid] += smem2[tid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 8) - { - smem1[tid] += smem1[tid + 8]; - smem2[tid] += smem2[tid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 4) - { - smem1[tid] += smem1[tid + 4]; - smem2[tid] += smem2[tid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 2) - { - smem1[tid] += smem1[tid + 2]; - smem2[tid] += smem2[tid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 1) - { - smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; - smem2[BUFFER] = smem2[tid] + smem2[tid + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); } void reduce1(float val1, volatile __local float* smem1, int tid) @@ -161,45 +94,18 @@ void reduce1(float val1, volatile __local float* smem1, int tid) smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 32) + for(int i = BUFFER2; i > 0; i >>= 1) { - smem1[tid] += smem1[tid + 32]; + if(tid < i) + { + smem1[tid] += smem1[tid + i]; + } + barrier(CLK_LOCAL_MEM_FENCE); } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) - { - smem1[tid] += smem1[tid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 8) - { - smem1[tid] += smem1[tid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 4) - { - smem1[tid] += smem1[tid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 2) - { - smem1[tid] += smem1[tid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 1) - { - smem1[BUFFER] = smem1[tid] + smem1[tid + 1]; - } - barrier(CLK_LOCAL_MEM_FENCE); } #else -void reduce3(float val1, float val2, float val3, -__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) +void reduce3(float val1, float val2, float val3, + __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -212,15 +118,19 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f smem2[tid] += smem2[tid + 32]; smem3[tid] += smem3[tid + 32]; #if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { #endif smem1[tid] += smem1[tid + 16]; smem2[tid] += smem2[tid + 16]; smem3[tid] += smem3[tid + 16]; #if WAVE_SIZE <16 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { #endif smem1[tid] += smem1[tid + 8]; smem2[tid] += smem2[tid + 8]; @@ -238,6 +148,7 @@ __local volatile float* smem1, __local volatile float* smem2, __local volatile f smem2[tid] += smem2[tid + 1]; smem3[tid] += smem3[tid + 1]; } + barrier(CLK_LOCAL_MEM_FENCE); } void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) @@ -251,14 +162,18 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola smem1[tid] += smem1[tid + 32]; smem2[tid] += smem2[tid + 32]; #if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { #endif smem1[tid] += smem1[tid + 16]; smem2[tid] += smem2[tid + 16]; #if WAVE_SIZE <16 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { #endif smem1[tid] += smem1[tid + 8]; smem2[tid] += smem2[tid + 8]; @@ -272,6 +187,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola smem1[tid] += smem1[tid + 1]; smem2[tid] += smem2[tid + 1]; } + barrier(CLK_LOCAL_MEM_FENCE); } void reduce1(float val1, __local volatile float* smem1, int tid) @@ -283,19 +199,24 @@ void reduce1(float val1, __local volatile float* smem1, int tid) { smem1[tid] += smem1[tid + 32]; #if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { #endif smem1[tid] += smem1[tid + 16]; #if WAVE_SIZE <16 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 8) { + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { #endif smem1[tid] += smem1[tid + 8]; smem1[tid] += smem1[tid + 4]; smem1[tid] += smem1[tid + 2]; smem1[tid] += smem1[tid + 1]; } + barrier(CLK_LOCAL_MEM_FENCE); } #endif @@ -306,106 +227,100 @@ void reduce1(float val1, __local volatile float* smem1, int tid) __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; void SetPatch(image2d_t I, float x, float y, - float* Pch, float* Dx, float* Dy, - float* A11, float* A12, float* A22) + float* Pch, float* Dx, float* Dy, + float* A11, float* A12, float* A22) { - *Pch = read_imagef(I, sampler, (float2)(x, y)).x; + *Pch = read_imagef(I, sampler, (float2)(x, y)).x; - float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); + float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); - float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); + float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); - *Dx = dIdx; - *Dy = dIdy; + *Dx = dIdx; + *Dy = dIdy; - *A11 += dIdx * dIdx; - *A12 += dIdx * dIdy; - *A22 += dIdy * dIdy; + *A11 += dIdx * dIdx; + *A12 += dIdx * dIdy; + *A22 += dIdy * dIdy; } void GetPatch(image2d_t J, float x, float y, - float* Pch, float* Dx, float* Dy, - float* b1, float* b2) + float* Pch, float* Dx, float* Dy, + float* b1, float* b2) { - float J_val = read_imagef(J, sampler, (float2)(x, y)).x; - float diff = (J_val - *Pch) * 32.0f; - *b1 += diff**Dx; - *b2 += diff**Dy; + float J_val = read_imagef(J, sampler, (float2)(x, y)).x; + float diff = (J_val - *Pch) * 32.0f; + *b1 += diff**Dx; + *b2 += diff**Dy; } void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval) { - float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; - *errval += fabs(diff); + float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; + *errval += fabs(diff); } void SetPatch4(image2d_t I, const float x, const float y, - float4* Pch, float4* Dx, float4* Dy, - float* A11, float* A12, float* A22) + float4* Pch, float4* Dx, float4* Dy, + float* A11, float* A12, float* A22) { - *Pch = read_imagef(I, sampler, (float2)(x, y)); + *Pch = read_imagef(I, sampler, (float2)(x, y)); - float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1))); + float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1))); - float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - - (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1))); + float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1))); - *Dx = dIdx; - *Dy = dIdy; - float4 sqIdx = dIdx * dIdx; - *A11 += sqIdx.x + sqIdx.y + sqIdx.z; - sqIdx = dIdx * dIdy; - *A12 += sqIdx.x + sqIdx.y + sqIdx.z; - sqIdx = dIdy * dIdy; - *A22 += sqIdx.x + sqIdx.y + sqIdx.z; + *Dx = dIdx; + *Dy = dIdy; + float4 sqIdx = dIdx * dIdx; + *A11 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdx * dIdy; + *A12 += sqIdx.x + sqIdx.y + sqIdx.z; + sqIdx = dIdy * dIdy; + *A22 += sqIdx.x + sqIdx.y + sqIdx.z; } void GetPatch4(image2d_t J, const float x, const float y, - const float4* Pch, const float4* Dx, const float4* Dy, - float* b1, float* b2) + const float4* Pch, const float4* Dx, const float4* Dy, + float* b1, float* b2) { - float4 J_val = read_imagef(J, sampler, (float2)(x, y)); - float4 diff = (J_val - *Pch) * 32.0f; - float4 xdiff = diff* *Dx; - *b1 += xdiff.x + xdiff.y + xdiff.z; - xdiff = diff* *Dy; - *b2 += xdiff.x + xdiff.y + xdiff.z; + float4 J_val = read_imagef(J, sampler, (float2)(x, y)); + float4 diff = (J_val - *Pch) * 32.0f; + float4 xdiff = diff* *Dx; + *b1 += xdiff.x + xdiff.y + xdiff.z; + xdiff = diff* *Dy; + *b2 += xdiff.x + xdiff.y + xdiff.z; } void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval) { - float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; - *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); + float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch; + *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); } #define GRIDSIZE 3 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, - __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, - const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) + __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, + const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { -#ifdef CPU - __local float smem1[BUFFER+1]; - __local float smem2[BUFFER+1]; - __local float smem3[BUFFER+1]; -#else __local float smem1[BUFFER]; __local float smem2[BUFFER]; __local float smem3[BUFFER]; -#endif - unsigned int xid=get_local_id(0); - unsigned int yid=get_local_id(1); - unsigned int gid=get_group_id(0); - unsigned int xsize=get_local_size(0); - unsigned int ysize=get_local_size(1); - int xBase, yBase, i, j, k; + unsigned int xid=get_local_id(0); + unsigned int yid=get_local_id(1); + unsigned int gid=get_group_id(0); + unsigned int xsize=get_local_size(0); + unsigned int ysize=get_local_size(1); + int xBase, yBase, i, j, k; - float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); + float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); const int tid = mad24(yid, xsize, xid); @@ -432,77 +347,71 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float dIdx_patch[GRIDSIZE][GRIDSIZE]; float dIdy_patch[GRIDSIZE][GRIDSIZE]; - yBase=yid; - { - xBase=xid; - SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, - &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], - &A11, &A12, &A22); + yBase=yid; + { + xBase=xid; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], + &A11, &A12, &A22); - xBase+=xsize; - SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, - &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], - &A11, &A12, &A22); + xBase+=xsize; + SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f, + &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], + &A11, &A12, &A22); - xBase+=xsize; - if(xBase>1, (c_winSize_y - 1)>>1); + float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); const int tid = mad24(yid, xsize, xid); @@ -721,7 +615,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, return; } - nextPt -= c_halfWin; + nextPt -= c_halfWin; // extract the patch from the first image, compute covariation matrix of derivatives @@ -732,80 +626,74 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, float4 I_patch[8]; float4 dIdx_patch[8]; float4 dIdy_patch[8]; - float4 I_add,Dx_add,Dy_add; + float4 I_add,Dx_add,Dy_add; - yBase=yid; - { - xBase=xid; - SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, - &I_patch[0], &dIdx_patch[0], &dIdy_patch[0], - &A11, &A12, &A22); + yBase=yid; + { + xBase=xid; + SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[0], &dIdx_patch[0], &dIdy_patch[0], + &A11, &A12, &A22); - xBase+=xsize; - SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, - &I_patch[1], &dIdx_patch[1], &dIdy_patch[1], - &A11, &A12, &A22); + xBase+=xsize; + SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f, + &I_patch[1], &dIdx_patch[1], &dIdy_patch[1], + &A11, &A12, &A22); - xBase+=xsize; - if(xBase Date: Fri, 26 Jul 2013 11:17:27 +0800 Subject: [PATCH 3/3] generating the lut table instead of hard coding one --- modules/ocl/src/hog.cpp | 126 ++++-------------------- modules/ocl/src/opencl/objdetect_hog.cl | 95 +----------------- 2 files changed, 18 insertions(+), 203 deletions(-) diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 2e2b3a9928..c7ac4098f5 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -56,98 +56,6 @@ using namespace std; static oclMat gauss_w_lut; static bool hog_device_cpu; -/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */ -static const float gaussian_interp_lut[] = -{ - /* gaussian lut */ - 0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f, - 0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f, - 0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f, - 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f, - 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f, - 0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, - 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, - 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, - 0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, - 0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, - 0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f, - 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f, - 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f, - 0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, - 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, - 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, - 0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, - 0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, - 0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f, - 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f, - 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f, - 0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f, - 0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f, - 0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f, - 0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, - 0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, - 0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f, - 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f, - 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f, - 0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f, - 0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f, - 0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f, - 0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, - 0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, - 0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f, - 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f, - 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f, - 0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f, - 0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f, - 0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f, - 0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, - 0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, - 0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f, - /* interp_weight lut */ - 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, - 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, - 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f, - 0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, - 0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, - 0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f, - 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f, - 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f, - 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, - 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, - 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f, - 0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, - 0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, - 0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f, - 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f, - 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f, - 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, - 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, - 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f, - 0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, - 0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, - 0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f, - 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f, - 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f, - 0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f, - 0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f, - 0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f, - 0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, - 0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, - 0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f, - 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f, - 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f, - 0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f, - 0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f, - 0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f, - 0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, - 0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, - 0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f, - 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f, - 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f, - 0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f, - 0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f, - 0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f -}; namespace cv { @@ -180,7 +88,7 @@ namespace cv int nblocks_win_x, int nblocks_win_y); void compute_hists(int nbins, int block_stride_x, int blovck_stride_y, - int height, int width, float sigma, const cv::ocl::oclMat &grad, + int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists); @@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride) Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride); labels.create(1, wins_per_img.area(), CV_8U); - vector v_lut = vector(gaussian_interp_lut, gaussian_interp_lut + - sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0])); - Mat m_lut(v_lut); - gauss_w_lut.upload(m_lut.reshape(1,1)); + float sigma = getWinSigma(); + float scale = 1.f / (2.f * sigma * sigma); + Mat gaussian_lut(1, 512, CV_32FC1); + int idx = 0; + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = std::exp(-(j * j + i * i) * scale); + for(int i=-8; i<8; i++) + for(int j=-8; j<8; j++) + gaussian_lut.at(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f; + + gauss_w_lut.upload(gaussian_lut); } void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle) @@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img) computeGradient(img, this->grad, this->qangle); hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, - effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists); + effect_size.width, grad, qangle, gauss_w_lut, block_hists); hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width, block_hists, (float)threshold_L2hys); @@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins, void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y, - int height, int width, float sigma, + int height, int width, const cv::ocl::oclMat &grad, const cv::ocl::oclMat &qangle, const cv::ocl::oclMat &gauss_w_lut, @@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, { Context *clCxt = Context::getContext(); vector< pair > args; - string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : - "compute_hists_kernel"; + string kernelName = "compute_hists_lut_kernel"; int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x; @@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins, int grad_quadstep = grad.step >> 2; int qangle_step = qangle.step; - // Precompute gaussian spatial window parameter - float scale = 1.f / (2.f * sigma * sigma); - int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; size_t globalThreads[3] = { @@ -1751,14 +1663,10 @@ void cv::ocl::device::hog::compute_hists(int nbins, args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step)); args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data)); - if (kernelName.compare("compute_hists_lut_kernel") == 0) - args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data)); - else - args.push_back( make_pair( sizeof(cl_float), (void *)&scale)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data)); args.push_back( make_pair( smem, (void *)NULL)); - if(hog_device_cpu) { openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 8ca12704e5..0363227606 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -53,7 +53,7 @@ //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block -// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f +// Use pre-computed gaussian and interp_weight lookup tables __kernel void compute_hists_lut_kernel( const int cblock_stride_x, const int cblock_stride_y, const int cnbins, const int cblock_hist_size, const int img_block_width, @@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel( } } -//---------------------------------------------------------------------------- -// Histogram computation -// 12 threads for a cell, 12x4 threads per block -__kernel void compute_hists_kernel( - const int cblock_stride_x, const int cblock_stride_y, - const int cnbins, const int cblock_hist_size, const int img_block_width, - const int blocks_in_group, const int blocks_total, - const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, - const float scale, __global float* block_hists, __local float* smem) -{ - const int lx = get_local_id(0); - const int lp = lx / 24; /* local group id */ - const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ - const int gidY = gid / img_block_width; - const int gidX = gid - gidY * img_block_width; - - const int lidX = lx - lp * 24; - const int lidY = get_local_id(1); - - const int cell_x = lidX / 12; - const int cell_y = lidY; - const int cell_thread_x = lidX - cell_x * 12; - - __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * - CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); - __local float* final_hist = hists + cnbins * - (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); - - const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; - const int offset_y = gidY * cblock_stride_y + (cell_y << 2); - - __global const float* grad_ptr = (gid < blocks_total) ? - grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? - qangle + offset_y * qangle_step + (offset_x << 1) : qangle; - - __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + - cell_thread_x; - for (int bin_id = 0; bin_id < cnbins; ++bin_id) - hist[bin_id * 48] = 0.f; - - const int dist_x = -4 + cell_thread_x - 4 * cell_x; - const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); - - const int dist_y_begin = -4 - 4 * lidY; - for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) - { - float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); - - grad_ptr += grad_quadstep; - qangle_ptr += qangle_step; - - int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); - - float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * - dist_center_x) * scale); - float interp_weight = (8.f - fabs(dist_y + 0.5f)) * - (8.f - fabs(dist_x + 0.5f)) / 64.f; - - hist[bin.x * 48] += gaussian * interp_weight * vote.x; - hist[bin.y * 48] += gaussian * interp_weight * vote.y; - } - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* hist_ = hist; - for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) - { - if (cell_thread_x < 6) - hist_[0] += hist_[6]; - barrier(CLK_LOCAL_MEM_FENCE); - if (cell_thread_x < 3) - hist_[0] += hist_[3]; -#ifdef CPU - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (cell_thread_x == 0) - final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = - hist_[0] + hist_[1] + hist_[2]; - } -#ifdef CPU - barrier(CLK_LOCAL_MEM_FENCE); -#endif - int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; - if ((tid < cblock_hist_size) && (gid < blocks_total)) - { - __global float* block_hist = block_hists + - (gidY * img_block_width + gidX) * cblock_hist_size; - block_hist[tid] = final_hist[tid]; - } -} - //------------------------------------------------------------- // Normalization of histograms via L2Hys_norm // optimized for the case of 9 bins