From c072c28e28846a6fe11720c3495814d78233f09b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 17:07:11 +0400 Subject: [PATCH 1/4] optimized cv::calcHist --- modules/imgproc/src/histogram.cpp | 21 ++++--- modules/imgproc/src/opencl/histogram.cl | 83 ++++++++++++++++--------- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 71bd0e7de2..92db679043 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1477,14 +1477,18 @@ enum static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S) { - int compunits = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); + const ocl::Device & dev = ocl::Device::getDefault(); + int compunits = dev.maxComputeUnits(); + size_t wgs = dev.maxWorkGroupSize(); Size size = _src.size(); bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0; + int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src)); ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc, - format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D cn=%d", - BINS, compunits, wgs, use16 ? 16 : 1)); + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s", + BINS, compunits, wgs, kercn, + kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)), + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); if (k1.empty()) return false; @@ -1492,18 +1496,21 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32 UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1), hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1); - k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); + k1.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); size_t globalsize = compunits * wgs; if (!k1.run(1, &globalsize, &wgs, false)) return false; ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc, - format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs)); + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", + BINS, compunits, (int)wgs)); if (k2.empty()) return false; - k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist)); + k2.args(ocl::KernelArg::PtrReadOnly(ghist), + ocl::KernelArg::PtrWriteOnly(hist)); if (!k2.run(1, &wgs, &wgs, false)) return false; diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl index c0247a5ba2..05cd42763f 100644 --- a/modules/imgproc/src/opencl/histogram.cl +++ b/modules/imgproc/src/opencl/histogram.cl @@ -37,58 +37,78 @@ // // -#ifndef cn -#define cn 1 +#ifndef kercn +#define kercn 1 #endif -#if cn == 16 -#define T uchar16 -#else +#ifndef T #define T uchar #endif __kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, - __global uchar * hist, int total) + __global uchar * histptr, int total) { int lid = get_local_id(0); - int id = get_global_id(0) * cn; + int id = get_global_id(0) * kercn; int gid = get_group_id(0); __local int localhist[BINS]; + #pragma unroll for (int i = lid; i < BINS; i += WGS) localhist[i] = 0; barrier(CLK_LOCAL_MEM_FENCE); - for (int grain = HISTS_COUNT * WGS * cn; id < total; id += grain) + int src_index; + + for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain) { - int src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols); -#if cn == 1 - atomic_inc(localhist + convert_int(src[src_index])); +#ifdef HAVE_SRC_CONT + src_index = id; #else + src_index = mad24(id / src_cols, src_step, src_offset + id % src_cols); +#endif + +#if kercn == 1 + atomic_inc(localhist + convert_int(src[src_index])); +#elif kercn == 4 + int value = *(__global const int *)(src + src_index); + atomic_inc(localhist + (value & 0xff)); + atomic_inc(localhist + ((value >> 8) & 0xff)); + atomic_inc(localhist + ((value >> 16) & 0xff)); + atomic_inc(localhist + ((value >> 24) & 0xff)); +#elif kercn >= 2 T value = *(__global const T *)(src + src_index); - atomic_inc(localhist + convert_int(value.s0)); - atomic_inc(localhist + convert_int(value.s1)); - atomic_inc(localhist + convert_int(value.s2)); - atomic_inc(localhist + convert_int(value.s3)); - atomic_inc(localhist + convert_int(value.s4)); - atomic_inc(localhist + convert_int(value.s5)); - atomic_inc(localhist + convert_int(value.s6)); - atomic_inc(localhist + convert_int(value.s7)); - atomic_inc(localhist + convert_int(value.s8)); - atomic_inc(localhist + convert_int(value.s9)); - atomic_inc(localhist + convert_int(value.sA)); - atomic_inc(localhist + convert_int(value.sB)); - atomic_inc(localhist + convert_int(value.sC)); - atomic_inc(localhist + convert_int(value.sD)); - atomic_inc(localhist + convert_int(value.sE)); - atomic_inc(localhist + convert_int(value.sF)); + atomic_inc(localhist + value.s0); + atomic_inc(localhist + value.s1); +#if kercn >= 4 + atomic_inc(localhist + value.s2); + atomic_inc(localhist + value.s3); +#if kercn >= 8 + atomic_inc(localhist + value.s4); + atomic_inc(localhist + value.s5); + atomic_inc(localhist + value.s6); + atomic_inc(localhist + value.s7); +#if kercn == 16 + atomic_inc(localhist + value.s8); + atomic_inc(localhist + value.s9); + atomic_inc(localhist + value.sA); + atomic_inc(localhist + value.sB); + atomic_inc(localhist + value.sC); + atomic_inc(localhist + value.sD); + atomic_inc(localhist + value.sE); + atomic_inc(localhist + value.sF); +#endif +#endif +#endif #endif } barrier(CLK_LOCAL_MEM_FENCE); + __global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int)); + #pragma unroll for (int i = lid; i < BINS; i += WGS) - *(__global int *)(hist + mad24(gid, BINS * (int)sizeof(int), i * (int)sizeof(int))) = localhist[i]; + hist[i] = localhist[i]; } __kernel void merge_histogram(__global const int * ghist, __global int * hist) @@ -97,15 +117,16 @@ __kernel void merge_histogram(__global const int * ghist, __global int * hist) #pragma unroll for (int i = lid; i < BINS; i += WGS) - hist[i] = 0; + hist[i] = ghist[i]; barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll - for (int i = 0; i < HISTS_COUNT; ++i) + for (int i = 1; i < HISTS_COUNT; ++i) { + ghist += BINS; #pragma unroll for (int j = lid; j < BINS; j += WGS) - hist[j] += ghist[mad24(i, BINS, j)]; + hist[j] += ghist[j]; barrier(CLK_LOCAL_MEM_FENCE); } } From eeaa4b36657838a8f79192341fc50d484d23d6e1 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 19:45:37 +0400 Subject: [PATCH 2/4] eliminated convertTo --- modules/imgproc/src/histogram.cpp | 19 +++++++------------ modules/imgproc/src/opencl/histogram.cl | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 92db679043..9be3f56979 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1494,7 +1494,7 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32 _hist.create(BINS, 1, ddepth); UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1), - hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1); + hist = _hist.getUMat(); k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); @@ -1503,23 +1503,18 @@ static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32 if (!k1.run(1, &globalsize, &wgs, false)) return false; + char cvt[40]; ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc, - format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", - BINS, compunits, (int)wgs)); + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D convertToHT=%s -D HT=%s", + BINS, compunits, (int)wgs, ocl::convertTypeStr(CV_32S, ddepth, 1, cvt), + ocl::typeToStr(ddepth))); if (k2.empty()) return false; k2.args(ocl::KernelArg::PtrReadOnly(ghist), - ocl::KernelArg::PtrWriteOnly(hist)); - if (!k2.run(1, &wgs, &wgs, false)) - return false; + ocl::KernelArg::WriteOnlyNoSize(hist)); - if (hist.depth() != ddepth) - hist.convertTo(_hist, ddepth); - else - _hist.getUMatRef() = hist; - - return true; + return k2.run(1, &wgs, &wgs, false); } static bool ocl_calcHist(InputArrayOfArrays images, OutputArray hist) diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl index 05cd42763f..2161d3b089 100644 --- a/modules/imgproc/src/opencl/histogram.cl +++ b/modules/imgproc/src/opencl/histogram.cl @@ -45,6 +45,8 @@ #define T uchar #endif +#define noconvert + __kernel void calculate_histogram(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * histptr, int total) { @@ -111,10 +113,20 @@ __kernel void calculate_histogram(__global const uchar * src, int src_step, int hist[i] = localhist[i]; } -__kernel void merge_histogram(__global const int * ghist, __global int * hist) +#ifndef HT +#define HT int +#endif + +#ifndef convertToHT +#define convertToHT noconvert +#endif + +__kernel void merge_histogram(__global const int * ghist, __global uchar * histptr, int hist_step, int hist_offset) { int lid = get_local_id(0); + __global HT * hist = (__global HT *)(histptr + hist_offset); + #pragma unroll for (int i = lid; i < BINS; i += WGS) hist[i] = ghist[i]; @@ -126,7 +138,7 @@ __kernel void merge_histogram(__global const int * ghist, __global int * hist) ghist += BINS; #pragma unroll for (int j = lid; j < BINS; j += WGS) - hist[j] += ghist[j]; + hist[j] += convertToHT(ghist[j]); barrier(CLK_LOCAL_MEM_FENCE); } } From c9528b3952777298fda3ac354436a2484b0fa91b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 9 Jun 2014 19:58:45 +0400 Subject: [PATCH 3/4] optimized histogram merging --- modules/imgproc/src/opencl/histogram.cl | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl index 2161d3b089..05341deabb 100644 --- a/modules/imgproc/src/opencl/histogram.cl +++ b/modules/imgproc/src/opencl/histogram.cl @@ -126,21 +126,31 @@ __kernel void merge_histogram(__global const int * ghist, __global uchar * histp int lid = get_local_id(0); __global HT * hist = (__global HT *)(histptr + hist_offset); - +#if WGS >= BINS + HT res = (HT)(0); +#else #pragma unroll for (int i = lid; i < BINS; i += WGS) - hist[i] = ghist[i]; - barrier(CLK_LOCAL_MEM_FENCE); + hist[i] = (HT)(0); +#endif #pragma unroll - for (int i = 1; i < HISTS_COUNT; ++i) + for (int i = 0; i < HISTS_COUNT; ++i) { - ghist += BINS; #pragma unroll for (int j = lid; j < BINS; j += WGS) +#if WGS >= BINS + res += convertToHT(ghist[j]); +#else hist[j] += convertToHT(ghist[j]); - barrier(CLK_LOCAL_MEM_FENCE); +#endif + ghist += BINS; } + +#if WGS >= BINS + if (lid < BINS) + *(__global HT *)(histptr + mad24(lid, hist_step, hist_offset)) = res; +#endif } __kernel void calcLUT(__global uchar * dst, __constant int * hist, int total) From 33239fca70c4bc5993cda79ed7f42528bacf505d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 10 Jun 2014 15:48:42 +0400 Subject: [PATCH 4/4] cv::equalizeHist --- modules/imgproc/src/histogram.cpp | 40 +++++++++++++++++-------- modules/imgproc/src/opencl/histogram.cl | 28 +++++++++++++++-- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 9be3f56979..441d2226b8 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -3430,24 +3430,40 @@ namespace cv { static bool ocl_equalizeHist(InputArray _src, OutputArray _dst) { - size_t wgs = std::min(ocl::Device::getDefault().maxWorkGroupSize(), BINS); + const ocl::Device & dev = ocl::Device::getDefault(); + int compunits = dev.maxComputeUnits(); + size_t wgs = dev.maxWorkGroupSize(); + Size size = _src.size(); + bool use16 = size.width % 16 == 0 && _src.offset() % 16 == 0 && _src.step() % 16 == 0; + int kercn = dev.isAMD() && use16 ? 16 : std::min(4, ocl::predictOptimalVectorWidth(_src)); - // calculation of histogram - UMat hist; - if (!ocl_calcHist1(_src, hist)) + ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc, + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d -D kercn=%d -D T=%s%s", + BINS, compunits, wgs, kercn, + kercn == 4 ? "int" : ocl::typeToStr(CV_8UC(kercn)), + _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); + if (k1.empty()) return false; + UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1); + + k1.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::PtrWriteOnly(ghist), (int)src.total()); + + size_t globalsize = compunits * wgs; + if (!k1.run(1, &globalsize, &wgs, false)) + return false; + + wgs = std::min(ocl::Device::getDefault().maxWorkGroupSize(), BINS); UMat lut(1, 256, CV_8UC1); - ocl::Kernel k("calcLUT", ocl::imgproc::histogram_oclsrc, - format("-D BINS=%d -D HISTS_COUNT=1 -D WGS=%d", BINS, (int)wgs)); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::PtrWriteOnly(lut), - ocl::KernelArg::PtrReadOnly(hist), (int)_src.total()); + ocl::Kernel k2("calcLUT", ocl::imgproc::histogram_oclsrc, + format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", + BINS, compunits, (int)wgs)); + k2.args(ocl::KernelArg::PtrWriteOnly(lut), + ocl::KernelArg::PtrReadOnly(ghist), (int)_src.total()); // calculation of LUT - if (!k.run(1, &wgs, &wgs, false)) + if (!k2.run(1, &wgs, &wgs, false)) return false; // execute LUT transparently diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl index 05341deabb..ff8023054f 100644 --- a/modules/imgproc/src/opencl/histogram.cl +++ b/modules/imgproc/src/opencl/histogram.cl @@ -153,13 +153,37 @@ __kernel void merge_histogram(__global const int * ghist, __global uchar * histp #endif } -__kernel void calcLUT(__global uchar * dst, __constant int * hist, int total) +__kernel void calcLUT(__global uchar * dst, __global const int * ghist, int total) { int lid = get_local_id(0); __local int sumhist[BINS]; __local float scale; - sumhist[lid] = hist[lid]; +#if WGS >= BINS + int res = 0; +#else + #pragma unroll + for (int i = lid; i < BINS; i += WGS) + sumhist[i] = 0; +#endif + + #pragma unroll + for (int i = 0; i < HISTS_COUNT; ++i) + { + #pragma unroll + for (int j = lid; j < BINS; j += WGS) +#if WGS >= BINS + res += ghist[j]; +#else + sumhist[j] += ghist[j]; +#endif + ghist += BINS; + } + +#if WGS >= BINS + if (lid < BINS) + sumhist[lid] = res; +#endif barrier(CLK_LOCAL_MEM_FENCE); if (lid == 0)