diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index 02b4cc8355..f1954cfe33 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -365,7 +365,7 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1 static bool ocl_moments( InputArray _src, Moments& m) { - const int TILE_SIZE = 16; + const int TILE_SIZE = 32; const int K = 10; ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE)); if( k.empty() ) @@ -378,10 +378,10 @@ static bool ocl_moments( InputArray _src, Moments& m) int ntiles = xtiles*ytiles; UMat umbuf(1, ntiles*K, CV_32S); - size_t globalsize[] = {xtiles, ytiles}; + size_t globalsize[] = {xtiles, sz.height}, localsize[] = {1, TILE_SIZE}; bool ok = k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(umbuf), - xtiles).run(2, globalsize, 0, true); + xtiles).run(2, globalsize, localsize, true); if(!ok) return false; Mat mbuf = umbuf.getMat(ACCESS_READ); diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index f6527b1657..0cf5b35440 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -1,32 +1,31 @@ /* See LICENSE file in the root OpenCV directory */ -#if TILE_SIZE > 16 -#error "TILE SIZE should be <= 16" +#if TILE_SIZE != 32 +#error "TILE SIZE should be 32" #endif __kernel void moments(__global const uchar* src, int src_step, int src_offset, int src_rows, int src_cols, __global int* mom0, int xtiles) { - int x = get_global_id(0); - int y = get_global_id(1); - int x_min = x*TILE_SIZE; - int y_min = y*TILE_SIZE; + int x0 = get_global_id(0); + int y0 = get_group_id(1); + int x, y = get_local_id(1); + int x_min = x0*TILE_SIZE; + int ypix = y0*TILE_SIZE + y; + __local int mom[TILE_SIZE][10]; - if( x_min < src_cols && y_min < src_rows ) + if( x_min < src_cols && y0*TILE_SIZE < src_rows ) { - int x_max = min(src_cols - x_min, TILE_SIZE); - int y_max = min(src_rows - y_min, TILE_SIZE); - int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; - __global const uchar* ptr = src + src_offset + y_min*src_step + x_min; - __global int* mom = mom0 + (xtiles*y + x)*10; - x = x_max & -4; - - for( y = 0; y < y_max; y++, ptr += src_step ) + if( ypix < src_rows ) { + int x_max = min(src_cols - x_min, TILE_SIZE); + __global const uchar* ptr = src + src_offset + ypix*src_step + x_min; int4 S = (int4)(0,0,0,0), p; #define SUM_ELEM(elem, ofs) \ - (int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem + (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem + + x = x_max & -4; if( x_max >= 4 ) { p = convert_int4(vload4(0, ptr)); @@ -51,6 +50,30 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } } + if( x_max >= 20 ) + { + p = convert_int4(vload4(0, ptr+16)); + S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19); + + if( x_max >= 24 ) + { + p = convert_int4(vload4(0, ptr+20)); + S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23); + + if( x_max >= 28 ) + { + p = convert_int4(vload4(0, ptr+24)); + S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27); + + if( x_max >= 32 ) + { + p = convert_int4(vload4(0, ptr+28)); + S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31); + } + } + } + } + if( x < x_max ) { int ps = ptr[x]; @@ -68,27 +91,57 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, } int sy = y*y; - m00 += S.s0; - m10 += S.s1; - m01 += y*S.s0; - m20 += S.s2; - m11 += y*S.s1; - m02 += sy*S.s0; - m30 += S.s3; - m21 += y*S.s2; - m12 += sy*S.s1; - m03 += y*sy*S.s0; - } - mom[0] = m00; - mom[1] = m10; - mom[2] = m01; - mom[3] = m20; - mom[4] = m11; - mom[5] = m02; - mom[6] = m30; - mom[7] = m21; - mom[8] = m12; - mom[9] = m03; + mom[y][0] = S.s0; + mom[y][1] = S.s1; + mom[y][2] = y*S.s0; + mom[y][3] = S.s2; + mom[y][4] = y*S.s1; + mom[y][5] = sy*S.s0; + mom[y][6] = S.s3; + mom[y][7] = y*S.s2; + mom[y][8] = sy*S.s1; + mom[y][9] = y*sy*S.s0; + } + else + mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] = + mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + #define REDUCE(d) \ + if( y < d ) \ + { \ + mom[y][0] += mom[y+d][0]; \ + mom[y][1] += mom[y+d][1]; \ + mom[y][2] += mom[y+d][2]; \ + mom[y][3] += mom[y+d][3]; \ + mom[y][4] += mom[y+d][4]; \ + mom[y][5] += mom[y+d][5]; \ + mom[y][6] += mom[y+d][6]; \ + mom[y][7] += mom[y+d][7]; \ + mom[y][8] += mom[y+d][8]; \ + mom[y][9] += mom[y+d][9]; \ + } \ + barrier(CLK_LOCAL_MEM_FENCE) + + REDUCE(16); + REDUCE(8); + REDUCE(4); + REDUCE(2); + + if( y == 0 ) + { + __global int* momout = mom0 + (y0*xtiles + x0)*10; + momout[0] = mom[0][0] + mom[1][0]; + momout[1] = mom[0][1] + mom[1][1]; + momout[2] = mom[0][2] + mom[1][2]; + momout[3] = mom[0][3] + mom[1][3]; + momout[4] = mom[0][4] + mom[1][4]; + momout[5] = mom[0][5] + mom[1][5]; + momout[6] = mom[0][6] + mom[1][6]; + momout[7] = mom[0][7] + mom[1][7]; + momout[8] = mom[0][8] + mom[1][8]; + momout[9] = mom[0][9] + mom[1][9]; + } } }