diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl index 3d913a8395..1919e8edd2 100644 --- a/modules/imgproc/src/opencl/match_template.cl +++ b/modules/imgproc/src/opencl/match_template.cl @@ -29,6 +29,14 @@ // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. +#if cn != 3 +#define loadpix(addr) *(__global const T *)(addr) +#define TSIZE (int)sizeof(T) +#else +#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) +#define TSIZE ((int)sizeof(T1)*3) +#endif + #define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset)) #define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset)) #define SUMS(ox, oy) mad24(y+oy, src_sums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sums_offset)) @@ -66,14 +74,6 @@ inline float normAcc_SQDIFF(float num, float denum) #error "cn should be 1-4" #endif -#if cn != 3 -#define loadpix(addr) *(__global const T *)(addr) -#define TSIZE (int)sizeof(T) -#else -#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) -#define TSIZE ((int)sizeof(T1)*3) -#endif - #ifdef CALC_SUM __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset, @@ -141,39 +141,7 @@ __kernel void extractFirstChannel( const __global uchar* img, int img_step, int #elif defined CCORR -#if cn==3 - -__kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset, - __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - WT sum = (WT)(0); - - for (int i = 0; i < template_rows; ++i) - { - for (int j = 0; j < template_cols; ++j) - { - T src = vload3(0, (__global const T1 *)(srcptr + mad24(y+i, src_step, mad24(x+j, (int)sizeof(T1)*cn, src_offset)))); - T template = vload3(0, (__global const T1 *)(templateptr + mad24(i, template_step, mad24(j, (int)sizeof(T1)*cn, template_offset)))); -#if wdepth == 4 - sum = mad24(convertToWT(src), convertToWT(template), sum); -#else - sum = mad(convertToWT(src), convertToWT(template), sum); -#endif - } - } - - int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - *(__global float *)(dst + dst_idx) = convertToDT(sum); - } -} - -#elif cn==1 && PIX_PER_WI_X==4 +#if cn==1 && PIX_PER_WI_X==4 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset, __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, @@ -256,47 +224,29 @@ __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_s __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) { - int x0 = get_global_id(0)*PIX_PER_WI_X; + int x = get_global_id(0); int y = get_global_id(1); - int step = src_step/(int)sizeof(T); - - if (y < dst_rows) + if (x < dst_cols && y < dst_rows) { - WT sum [PIX_PER_WI_X]; - #pragma unroll - for (int i=0; i < PIX_PER_WI_X; i++) - sum[i] = 0; - - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T), src_offset))); - __global const T * template = (__global const T *)(templateptr + template_offset); + WT sum = (WT)(0); for (int i = 0; i < template_rows; ++i) { for (int j = 0; j < template_cols; ++j) { - #pragma unroll - for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x) - { - + T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); + T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); #if wdepth == 4 - sum[cx] = mad24(convertToWT(src[j+cx]), convertToWT(template[j]), sum[cx]); + sum = mad24(convertToWT(src), convertToWT(template), sum); #else - sum[cx] = mad(convertToWT(src[j+cx]), convertToWT(template[j]), sum[cx]); + sum = mad(convertToWT(src), convertToWT(template), sum); #endif - } } - - src = (__global const T *)((__global const uchar *)src + src_step); - template = (__global const T *)((__global const uchar *)template + template_step); } - #pragma unroll - for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0) - { - int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset)); - *(__global float *)(dst + dst_idx) = convertToDT(sum[cx]); - } + int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); + *(__global float *)(dst + dst_idx) = convertToDT(sum); } } #endif @@ -327,8 +277,6 @@ __kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int #elif defined SQDIFF -#if cn==3 - __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset, __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) @@ -344,8 +292,8 @@ __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_ { for (int j = 0; j < template_cols; ++j) { - T src = vload3(0, (__global const T1 *)(srcptr + mad24(y+i, src_step, mad24(x+j, (int)sizeof(T1)*cn, src_offset)))); - T template = vload3(0, (__global const T1 *)(templateptr + mad24(i, template_step, mad24(j, (int)sizeof(T1)*cn, template_offset)))); + T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); + T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); value = convertToWT(src) - convertToWT(template); #if wdepth == 4 @@ -361,45 +309,6 @@ __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_ } } -#else - -__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset, - __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); - __global const T * template = (__global const T *)(templateptr + template_offset); - - WT sum = (WT)(0), value; - - for (int i = 0; i < template_rows; ++i) - { - for (int j = 0; j < template_cols; ++j) - { - value = convertToWT(src[j]) - convertToWT(template[j]); -#if wdepth == 4 - sum = mad24(value, value, sum); -#else - sum = mad(value, value, sum); -#endif - } - - src = (__global const T *)((__global const uchar *)src + src_step); - template = (__global const T *)((__global const uchar *)template + template_step); - } - - int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - *(__global float *)(dst + dst_idx) = convertToDT(sum); - } -} - -#endif - #elif defined SQDIFF_PREPARED __kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index 926015a9d3..164af425e3 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -108,14 +108,14 @@ static bool sumTemplate(InputArray _src, UMat & result) return k.run(1, &globalsize, &wgs, false); } -static bool useNaive(int method, Size size) +static bool useNaive(Size size) { - if(method == TM_CCORR || method == TM_SQDIFF ) - { - return size.height < 18 && size.width < 18; - } - else - return false; + if (!ocl::Device::getDefault().isIntel()) + return true; + + int dft_size = 18; + return size.height < dft_size && size.width < dft_size; + } struct ConvolveBuf @@ -261,14 +261,14 @@ static bool convolve_32F(InputArray _image, InputArray _templ, OutputArray _resu static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result) { int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKE_TYPE(wdepth, cn); + int wdepth = CV_32F, wtype = CV_MAKE_TYPE(wdepth, cn); ocl::Device dev = ocl::Device::getDefault(); - int pxPerWIx = (cn!=3 && dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; + int pxPerWIx = (cn==1 && dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; int rated_cn = cn; int wtype1 = wtype; - if (pxPerWIx!=1 && cn==1) + if (pxPerWIx!=1) { rated_cn = pxPerWIx; type = CV_MAKE_TYPE(depth, rated_cn); @@ -299,27 +299,26 @@ static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, Outpu static bool matchTemplate_CCORR(InputArray _image, InputArray _templ, OutputArray _result) +{ + if (useNaive(_templ.size())) + return( matchTemplateNaive_CCORR(_image, _templ, _result)); + else + { + if(_image.depth() == CV_8U) { - if (useNaive(TM_CCORR, _templ.size())) - return( matchTemplateNaive_CCORR(_image, _templ, _result)); - - else - { - if(_image.depth() == CV_8U && _templ.depth() == CV_8U) - { - UMat imagef, templf; - UMat image = _image.getUMat(); - UMat templ = _templ.getUMat(); - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); - return(convolve_32F(imagef, templf, _result)); - } - else - { - return(convolve_32F(_image, _templ, _result)); - } - } + UMat imagef, templf; + UMat image = _image.getUMat(); + UMat templ = _templ.getUMat(); + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + return(convolve_32F(imagef, templf, _result)); } + else + { + return(convolve_32F(_image, _templ, _result)); + } + } +} static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, OutputArray _result) { @@ -355,7 +354,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result) { int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKE_TYPE(wdepth, cn); + int wdepth = CV_32F, wtype = CV_MAKE_TYPE(wdepth, cn); char cvt[40]; ocl::Kernel k("matchTemplate_Naive_SQDIFF", ocl::imgproc::match_template_oclsrc, @@ -377,7 +376,7 @@ static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, Outp static bool matchTemplate_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result) { - if (useNaive(TM_SQDIFF, _templ.size())) + if (useNaive(_templ.size())) return( matchTemplateNaive_SQDIFF(_image, _templ, _result)); else { diff --git a/modules/imgproc/test/ocl/test_match_template.cpp b/modules/imgproc/test/ocl/test_match_template.cpp index 92ff9926a9..8c8a1238c7 100644 --- a/modules/imgproc/test/ocl/test_match_template.cpp +++ b/modules/imgproc/test/ocl/test_match_template.cpp @@ -71,7 +71,7 @@ PARAM_TEST_CASE(MatchTemplate, MatDepth, Channels, MatchTemplType, bool) type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1)); depth = GET_PARAM(0); method = GET_PARAM(2); - use_roi = false;//GET_PARAM(3); + use_roi = GET_PARAM(3); } virtual void generateTestData() @@ -116,7 +116,7 @@ OCL_TEST_P(MatchTemplate, Mat) } } -OCL_INSTANTIATE_TEST_CASE_P(ImageProc, MatchTemplate, Combine( +OCL_INSTANTIATE_TEST_CASE_P(ImageProc, MatchTemplate, Combine( Values(CV_8U, CV_32F), Values(1, 2, 3, 4), MatchTemplType::all(),