From 73663dcdd1f0f06a0567f266c4f9ebeb9b74a2b2 Mon Sep 17 00:00:00 2001 From: Erik Karlsson Date: Mon, 2 Mar 2015 03:29:17 +0100 Subject: [PATCH] Added support for 16-bit input --- .../src/fast_nlmeans_denoising_opencl.hpp | 57 ++++++++++++------- modules/photo/src/opencl/nlmeans.cl | 31 ++++++---- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 5e96533fb9..a88b5cfd71 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -28,12 +28,14 @@ static int divUp(int a, int b) return (a + b - 1) / b; } -template +template static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, int & almostTemplateWindowSizeSqBinShift, bool abs) { - const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255; - int fixedPointMult = std::numeric_limits::max() / maxEstimateSumValue; + const WT maxEstimateSumValue = searchWindowSize * searchWindowSize * + std::numeric_limits::max(); + int fixedPointMult = (int)std::min(std::numeric_limits::max() / maxEstimateSumValue, + std::numeric_limits::max()); int depth = DataType::depth; bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; @@ -48,7 +50,8 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; const FT WEIGHT_THRESHOLD = 1e-3f; - int maxDist = abs ? 255 * cn : 255 * 255 * cn; + int maxDist = abs ? std::numeric_limits::max() * cn : + std::numeric_limits::max() * std::numeric_limits::max() * cn; int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); FT den = 1.0f / (h * h * cn); @@ -74,7 +77,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT; Size size = _src.size(); - if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC3 ) + if (cn != 1 && cn != 2 && cn != 3 && depth != CV_8U && (!abs || depth != CV_16U)) return false; int templateWindowHalfWize = templateWindowSize / 2; @@ -84,45 +87,60 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS); int almostTemplateWindowSizeSqBinShift = -1; - char cvt[2][40]; + char buf[4][40]; String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" " -D sample_t=%s -D pixel_t=%s -D int_t=%s" + " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s" " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s", templateWindowSize, searchWindowSize, ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)), + depth == CV_8U ? ocl::typeToStr(CV_32S) : "long", + depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) : + (sprintf(buf[0], "long%d", cn), buf[0]), + depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) : + (sprintf(buf[1], "convert_long%d", cn), buf[1]), BLOCK_COLS, BLOCK_ROWS, ctaSize, templateWindowHalfWize, searchWindowHalfSize, - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn, - ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]), abs ? " -D ABS" : ""); + ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn == 3 ? 4 : cn, + ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : ""); ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); if (k.empty()) return false; UMat almostDist2Weight; - if (!ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, - h, cn, almostTemplateWindowSizeSqBinShift, abs)) + if ((depth == CV_8U && + !ocl_calcAlmostDist2Weight(almostDist2Weight, + searchWindowSize, templateWindowSize, + h, cn, + almostTemplateWindowSizeSqBinShift, + abs)) || + (depth == CV_16U && + !ocl_calcAlmostDist2Weight(almostDist2Weight, + searchWindowSize, templateWindowSize, + h, cn, + almostTemplateWindowSizeSqBinShift, + abs))) return false; CV_Assert(almostTemplateWindowSizeSqBinShift >= 0); UMat srcex; int borderSize = searchWindowHalfSize + templateWindowHalfWize; - if (type == CV_8UC3) { - Mat src_rgb = _src.getMat(), src_rgba(size, CV_8UC4); + if (cn == 3) { + UMat tmp(size, CV_MAKE_TYPE(depth, 4)); int from_to[] = { 0,0, 1,1, 2,2 }; - mixChannels(&src_rgb, 1, &src_rgba, 1, from_to, 3); - copyMakeBorder(src_rgba, srcex, - borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); + mixChannels(std::vector(1, _src.getUMat()), std::vector(1, tmp), from_to, 3); + copyMakeBorder(tmp, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); } else copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); _dst.create(size, type); UMat dst; - if (type == CV_8UC3) - dst.create(size, CV_8UC4); + if (cn == 3) + dst.create(size, CV_MAKE_TYPE(depth, 4)); else dst = _dst.getUMat(); @@ -139,10 +157,9 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 }; if (!k.run(2, globalsize, localsize, false)) return false; - if (type == CV_8UC3) { - Mat dst_rgba = dst.getMat(ACCESS_READ), dst_rgb = _dst.getMat(); + if (cn == 3) { int from_to[] = { 0,0, 1,1, 2,2 }; - mixChannels(&dst_rgba, 1, &dst_rgb, 1, from_to, 3); + mixChannels(std::vector(1, dst), std::vector(1, _dst.getUMat()), from_to, 3); } return true; diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index c48adda0b2..3a104c42a1 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -206,22 +206,23 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset, __local int * dists, __global const int * almostDist2Weight, __global sample_t * dst, int dst_step, int dst_offset, - int y, int x, int id, __local int * weights_local, - __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) + int y, int x, int id, __local weight_t * weights_local, + __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) { - int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; - int_t weighted_sum = (int_t)(0); + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + weight_t weights = 0; + sum_t weighted_sum = (sum_t)(0); for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE) { int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset)); - int_t src_value = convert_int_t(*(__global const pixel_t *)(src + src_index)); + sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index)); int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int weight = almostDist2Weight[almostAvgDist]; - weights += weight; - weighted_sum += (int_t)(weight) * src_value; + weights += (weight_t)weight; + weighted_sum += (sum_t)(weight) * src_value; } weights_local[id] = weights; @@ -242,11 +243,11 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_ if (id == 0) { int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + + sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; - int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; + weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; - *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (int_t)(weights_local_0)); + *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0)); } } @@ -259,8 +260,9 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step, int block_y = get_group_id(1); int id = get_local_id(0), first; - __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE]; - __local int_t weighted_sum[CTA_SIZE]; + __local int dists[SEARCH_SIZE_SQ]; + __local weight_t weights[CTA_SIZE]; + __local sum_t weighted_sum[CTA_SIZE]; int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); @@ -271,6 +273,11 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step, __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int)); __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; + src_step /= sizeof(sample_t); + src_offset /= sizeof(sample_t); + dst_step /= sizeof(sample_t); + dst_offset /= sizeof(sample_t); + for (int y = y0; y < y1; ++y) for (int x = x0; x < x1; ++x) {