From bc4e471847cb3209b7e14b0747ddc73af6ef221a Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Thu, 31 Jan 2019 16:10:59 +0300 Subject: [PATCH 1/9] Add a mutex for shared Inference Engine plugins --- modules/dnn/src/op_inf_engine.cpp | 11 ++++++++-- modules/dnn/test/test_layers.cpp | 34 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 9652e58581..0349d44710 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -622,7 +622,11 @@ void InfEngineBackendNet::init(int targetId) #endif // IE < R5 -static std::map sharedPlugins; +static std::map& getSharedPlugins() +{ + static std::map sharedPlugins; + return sharedPlugins; +} void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) { @@ -630,6 +634,8 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) try { + AutoLock lock(getInitializationMutex()); + auto& sharedPlugins = getSharedPlugins(); auto pluginIt = sharedPlugins.find(targetDevice); if (pluginIt != sharedPlugins.end()) { @@ -797,7 +803,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN void resetMyriadDevice() { #ifdef HAVE_INF_ENGINE - sharedPlugins.erase(InferenceEngine::TargetDevice::eMYRIAD); + AutoLock lock(getInitializationMutex()); + getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD); #endif // HAVE_INF_ENGINE } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 77651ba617..f2f131333a 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -46,6 +46,10 @@ #include #include // CV_DNN_REGISTER_LAYER_CLASS +#ifdef HAVE_INF_ENGINE +#include +#endif + namespace opencv_test { namespace { template @@ -970,6 +974,36 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) if (targetId != DNN_TARGET_MYRIAD) normAssert(outs[0], outs[1]); } + +TEST_P(Layer_Test_Convolution_DLDT, multithreading) +{ + Target targetId = GetParam(); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; + std::string xmlPath = _tf("layer_convolution" + suffix + ".xml"); + std::string binPath = _tf("layer_convolution" + suffix + ".bin"); + Net firstNet = readNet(xmlPath, binPath); + Net secondNet = readNet(xmlPath, binPath); + Mat inp = blobFromNPY(_tf("blob.npy")); + + firstNet.setInput(inp); + secondNet.setInput(inp); + firstNet.setPreferableTarget(targetId); + secondNet.setPreferableTarget(targetId); + + Mat out1, out2; + std::thread t1([&]{out1 = firstNet.forward();}); + std::thread t2([&]{out2 = secondNet.forward();}); + + t1.join(); + t2.join(); + + Mat ref = blobFromNPY(_tf("layer_convolution.npy")); + double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.5e-3 : 1e-5; + double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4; + normAssert(out1, ref, "first thread", l1, lInf); + normAssert(out2, ref, "second thread", l1, lInf); +} + INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT, testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))); From 6ad3bf31303378bdaa26a4829359ea14633a2c55 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 6 Feb 2019 13:05:11 +0300 Subject: [PATCH 2/9] Enable ResNet-based Mask-RCNN models from TensorFlow Object Detection API --- samples/dnn/tf_text_graph_mask_rcnn.py | 47 ++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/samples/dnn/tf_text_graph_mask_rcnn.py b/samples/dnn/tf_text_graph_mask_rcnn.py index c8803088f9..24d8790d32 100644 --- a/samples/dnn/tf_text_graph_mask_rcnn.py +++ b/samples/dnn/tf_text_graph_mask_rcnn.py @@ -25,7 +25,8 @@ scopesToIgnore = ('FirstStageFeatureExtractor/Assert', 'FirstStageFeatureExtractor/Shape', 'FirstStageFeatureExtractor/strided_slice', 'FirstStageFeatureExtractor/GreaterEqual', - 'FirstStageFeatureExtractor/LogicalAnd') + 'FirstStageFeatureExtractor/LogicalAnd', + 'Conv/required_space_to_batch_paddings') # Load a config file. config = readTextMessage(args.config) @@ -54,10 +55,30 @@ graph_def = parseTextGraph(args.output) removeIdentity(graph_def) +nodesToKeep = [] def to_remove(name, op): + if name in nodesToKeep: + return False return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \ (name.startswith('CropAndResize') and op != 'CropAndResize') +# Fuse atrous convolutions (with dilations). +nodesMap = {node.name: node for node in graph_def.node} +for node in reversed(graph_def.node): + if node.op == 'BatchToSpaceND': + del node.input[2] + conv = nodesMap[node.input[0]] + spaceToBatchND = nodesMap[conv.input[0]] + + paddingsNode = NodeDef() + paddingsNode.name = conv.name + '/paddings' + paddingsNode.op = 'Const' + paddingsNode.addAttr('value', [2, 2, 2, 2]) + graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode) + nodesToKeep.append(paddingsNode.name) + + spaceToBatchND.input[2] = paddingsNode.name + removeUnusedNodesAndAttrs(to_remove, graph_def) @@ -106,8 +127,8 @@ heights = [] for a in aspect_ratios: for s in scales: ar = np.sqrt(a) - heights.append((features_stride**2) * s / ar) - widths.append((features_stride**2) * s * ar) + heights.append((height_stride**2) * s / ar) + widths.append((width_stride**2) * s * ar) proposals.addAttr('width', widths) proposals.addAttr('height', heights) @@ -252,5 +273,25 @@ graph_def.node[-1].name = 'detection_masks' graph_def.node[-1].op = 'Sigmoid' graph_def.node[-1].input.pop() +def getUnconnectedNodes(): + unconnected = [node.name for node in graph_def.node] + for node in graph_def.node: + for inp in node.input: + if inp in unconnected: + unconnected.remove(inp) + return unconnected + +while True: + unconnectedNodes = getUnconnectedNodes() + unconnectedNodes.remove(graph_def.node[-1].name) + if not unconnectedNodes: + break + + for name in unconnectedNodes: + for i in range(len(graph_def.node)): + if graph_def.node[i].name == name: + del graph_def.node[i] + break + # Save as text. graph_def.save(args.output) From bbedebb57cf44bb447837918a02c48979411271a Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Wed, 6 Feb 2019 17:56:44 +0300 Subject: [PATCH 3/9] perf tests for cvtColor for 16U and 32f added --- modules/imgproc/perf/perf_cvt_color.cpp | 120 ++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index e0f32fdab6..e0b55b7fa0 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -100,6 +100,72 @@ CV_ENUM(CvtMode, COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA ) +CV_ENUM(CvtMode16U, + COLOR_BGR2BGRA, COLOR_BGR2GRAY, + COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ, + COLOR_BGR2YCrCb, COLOR_BGR2YUV, + + COLOR_BGRA2BGR, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA, + CX_BGRA2XYZ, + CX_BGRA2YCrCb, CX_BGRA2YUV, + + COLOR_GRAY2BGR, COLOR_GRAY2BGRA, + + COLOR_RGB2GRAY, + COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV, + + COLOR_RGBA2BGR, COLOR_RGBA2GRAY, + CX_RGBA2XYZ, + CX_RGBA2YCrCb, CX_RGBA2YUV, + + COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA, + + COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA, + COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA + ) + +CV_ENUM(CvtMode32F, + COLOR_BGR2BGRA, COLOR_BGR2GRAY, + COLOR_BGR2HLS, COLOR_BGR2HLS_FULL, COLOR_BGR2HSV, COLOR_BGR2HSV_FULL, + COLOR_BGR2Lab, COLOR_BGR2Luv, COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ, + COLOR_BGR2YCrCb, COLOR_BGR2YUV, + + COLOR_BGRA2BGR, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA, + CX_BGRA2HLS, CX_BGRA2HLS_FULL, CX_BGRA2HSV, CX_BGRA2HSV_FULL, + CX_BGRA2Lab, CX_BGRA2Luv, CX_BGRA2XYZ, + CX_BGRA2YCrCb, CX_BGRA2YUV, + + COLOR_GRAY2BGR, COLOR_GRAY2BGRA, + + COLOR_HLS2BGR, COLOR_HLS2BGR_FULL, COLOR_HLS2RGB, COLOR_HLS2RGB_FULL, + CX_HLS2BGRA, CX_HLS2BGRA_FULL, CX_HLS2RGBA, CX_HLS2RGBA_FULL, + + COLOR_HSV2BGR, COLOR_HSV2BGR_FULL, COLOR_HSV2RGB, COLOR_HSV2RGB_FULL, + CX_HSV2BGRA, CX_HSV2BGRA_FULL, CX_HSV2RGBA, CX_HSV2RGBA_FULL, + + COLOR_Lab2BGR, COLOR_Lab2LBGR, COLOR_Lab2LRGB, COLOR_Lab2RGB, + CX_Lab2BGRA, CX_Lab2LBGRA, CX_Lab2LRGBA, CX_Lab2RGBA, + + COLOR_LBGR2Lab, COLOR_LBGR2Luv, COLOR_LRGB2Lab, COLOR_LRGB2Luv, + CX_LBGRA2Lab, CX_LBGRA2Luv, CX_LRGBA2Lab, CX_LRGBA2Luv, + + COLOR_Luv2BGR, COLOR_Luv2LBGR, COLOR_Luv2LRGB, COLOR_Luv2RGB, + CX_Luv2BGRA, CX_Luv2LBGRA, CX_Luv2LRGBA, CX_Luv2RGBA, + + COLOR_RGB2GRAY, + COLOR_RGB2HLS, COLOR_RGB2HLS_FULL, COLOR_RGB2HSV, COLOR_RGB2HSV_FULL, + COLOR_RGB2Lab, COLOR_RGB2Luv, COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV, + + COLOR_RGBA2BGR, COLOR_RGBA2GRAY, + CX_RGBA2HLS, CX_RGBA2HLS_FULL, CX_RGBA2HSV, CX_RGBA2HSV_FULL, + CX_RGBA2Lab, CX_RGBA2Luv, CX_RGBA2XYZ, + CX_RGBA2YCrCb, CX_RGBA2YUV, + + COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA, + + COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA, + COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA + ) CV_ENUM(CvtModeBayer, COLOR_BayerBG2BGR, COLOR_BayerBG2BGRA, COLOR_BayerBG2BGR_VNG, COLOR_BayerBG2GRAY, @@ -274,6 +340,60 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, #endif } + +typedef tuple Size_CvtMode16U_t; +typedef perf::TestBaseWithParam Size_CvtMode16U; + +PERF_TEST_P(Size_CvtMode16U, cvtColor_16u, + testing::Combine( + testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), + CvtMode16U::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int _mode = get<1>(GetParam()), mode = _mode; + ChPair ch = getConversionInfo(mode); + mode %= COLOR_COLORCVT_MAX; + Mat src(sz, CV_16UC(ch.scn)); + Mat dst(sz, CV_16UC(ch.scn)); + + declare.time(100); + declare.in(src, WARMUP_RNG).out(dst); + + int runs = sz.width <= 320 ? 100 : 5; + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + + SANITY_CHECK(dst, 1); +} + + +typedef tuple Size_CvtMode32F_t; +typedef perf::TestBaseWithParam Size_CvtMode32F; + +PERF_TEST_P(Size_CvtMode32F, cvtColor_32f, + testing::Combine( + testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), + CvtMode32F::all() + ) + ) +{ + Size sz = get<0>(GetParam()); + int _mode = get<1>(GetParam()), mode = _mode; + ChPair ch = getConversionInfo(mode); + mode %= COLOR_COLORCVT_MAX; + Mat src(sz, CV_32FC(ch.scn)); + Mat dst(sz, CV_32FC(ch.scn)); + + declare.time(100); + declare.in(src, WARMUP_RNG).out(dst); + + int runs = sz.width <= 320 ? 100 : 5; + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + + SANITY_CHECK(dst, 1); +} + typedef tuple Size_CvtMode_Bayer_t; typedef perf::TestBaseWithParam Size_CvtMode_Bayer; From fb8e652c3f20d377e9f935faee370ed28fb60122 Mon Sep 17 00:00:00 2001 From: Namgoo Lee Date: Tue, 5 Feb 2019 16:37:33 +0000 Subject: [PATCH 4/9] Add CV_16UC1 support for cuda::CLAHE Due to size limit of shared memory, histogram is built on the global memory for CV_16UC1 case. The amount of memory needed for building histogram is: 65536 * 4byte = 256KB and shared memory limit is 48KB typically. Added test cases for CV_16UC1 and various clip limits. Added perf tests for CV_16UC1 on both CPU and CUDA code. There was also a bug in CV_8UC1 case when redistributing "residual" clipped pixels. Adding the test case where clip limit is 5.0 exposes this bug. --- .../core/include/opencv2/core/cuda_types.hpp | 2 + modules/cudaimgproc/perf/perf_histogram.cpp | 8 +- modules/cudaimgproc/src/cuda/clahe.cu | 256 +++++++++++++++++- modules/cudaimgproc/src/histogram.cpp | 29 +- modules/cudaimgproc/test/test_histogram.cpp | 13 +- modules/imgproc/perf/perf_histogram.cpp | 8 +- 6 files changed, 285 insertions(+), 31 deletions(-) diff --git a/modules/core/include/opencv2/core/cuda_types.hpp b/modules/core/include/opencv2/core/cuda_types.hpp index e2647c0455..45dc2cad1c 100644 --- a/modules/core/include/opencv2/core/cuda_types.hpp +++ b/modules/core/include/opencv2/core/cuda_types.hpp @@ -127,10 +127,12 @@ namespace cv }; typedef PtrStepSz PtrStepSzb; + typedef PtrStepSz PtrStepSzus; typedef PtrStepSz PtrStepSzf; typedef PtrStepSz PtrStepSzi; typedef PtrStep PtrStepb; + typedef PtrStep PtrStepus; typedef PtrStep PtrStepf; typedef PtrStep PtrStepi; diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp index 105411da9e..585f2b0006 100644 --- a/modules/cudaimgproc/perf/perf_histogram.cpp +++ b/modules/cudaimgproc/perf/perf_histogram.cpp @@ -183,16 +183,18 @@ PERF_TEST_P(Sz, EqualizeHist, ////////////////////////////////////////////////////////////////////// // CLAHE -DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double); +DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double, MatType); PERF_TEST_P(Sz_ClipLimit, CLAHE, Combine(CUDA_TYPICAL_MAT_SIZES, - Values(0.0, 40.0))) + Values(0.0, 40.0), + Values(MatType(CV_8UC1), MatType(CV_16UC1)))) { const cv::Size size = GET_PARAM(0); const double clipLimit = GET_PARAM(1); + const int type = GET_PARAM(2); - cv::Mat src(size, CV_8UC1); + cv::Mat src(size, type); declare.in(src, WARMUP_RNG); if (PERF_RUN_CUDA()) diff --git a/modules/cudaimgproc/src/cuda/clahe.cu b/modules/cudaimgproc/src/cuda/clahe.cu index b66a7d8a66..75663c51e2 100644 --- a/modules/cudaimgproc/src/cuda/clahe.cu +++ b/modules/cudaimgproc/src/cuda/clahe.cu @@ -48,11 +48,11 @@ using namespace cv::cudev; namespace clahe { - __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut, - const int2 tileSize, const int tilesX, - const int clipLimit, const float lutScale) + __global__ void calcLutKernel_8U(const PtrStepb src, PtrStepb lut, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale) { - __shared__ int smem[512]; + __shared__ int smem[256]; const int tx = blockIdx.x; const int ty = blockIdx.y; @@ -95,18 +95,28 @@ namespace clahe // broadcast evaluated value __shared__ int totalClipped; + __shared__ int redistBatch; + __shared__ int residual; + __shared__ int rStep; if (tid == 0) + { totalClipped = clipped; + redistBatch = totalClipped / 256; + residual = totalClipped - redistBatch * 256; + + rStep = 1; + if (residual != 0) + rStep = 256 / residual; + } + __syncthreads(); // redistribute clipped samples evenly - int redistBatch = totalClipped / 256; tHistVal += redistBatch; - int residual = totalClipped - redistBatch * 256; - if (tid < residual) + if (residual && tid % rStep == 0 && tid / rStep < residual) ++tHistVal; } @@ -115,12 +125,212 @@ namespace clahe lut(ty * tilesX + tx, tid) = saturate_cast(__float2int_rn(lutScale * lutVal)); } - void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) + __global__ void calcLutKernel_16U(const PtrStepus src, PtrStepus lut, + const int2 tileSize, const int tilesX, + const int clipLimit, const float lutScale, + PtrStepSzi hist) + { + #define histSize 65536 + #define blockSize 256 + + __shared__ int smem[blockSize]; + + const int tx = blockIdx.x; + const int ty = blockIdx.y; + const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + + const int histRow = ty * tilesX + tx; + + // build histogram + + for (int i = tid; i < histSize; i += blockSize) + hist(histRow, i) = 0; + + __syncthreads(); + + for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y) + { + const ushort* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x; + for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x) + { + const int data = srcPtr[j]; + ::atomicAdd(&hist(histRow, data), 1); + } + } + + __syncthreads(); + + if (clipLimit > 0) + { + // clip histogram bar && + // find number of overall clipped samples + + __shared__ int partialSum[blockSize]; + + for (int i = tid; i < histSize; i += blockSize) + { + int histVal = hist(histRow, i); + + int clipped = 0; + if (histVal > clipLimit) + { + clipped = histVal - clipLimit; + hist(histRow, i) = clipLimit; + } + + // Following code block is in effect equivalent to: + // + // blockReduce(smem, clipped, tid, plus()); + // + { + for (int j = 16; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0xFFFFFFFFU, clipped, j); + #else + int val = __shfl_down(clipped, j); + #endif + clipped += val; + } + + if (tid % 32 == 0) + smem[tid / 32] = clipped; + + __syncthreads(); + + if (tid < 8) + { + clipped = smem[tid]; + + for (int j = 4; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0x000000FFU, clipped, j); + #else + int val = __shfl_down(clipped, j); + #endif + clipped += val; + } + } + } + // end of code block + + if (tid == 0) + partialSum[i / blockSize] = clipped; + + __syncthreads(); + } + + int partialSum_ = partialSum[tid]; + + // Following code block is in effect equivalent to: + // + // blockReduce(smem, partialSum_, tid, plus()); + // + { + for (int j = 16; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0xFFFFFFFFU, partialSum_, j); + #else + int val = __shfl_down(partialSum_, j); + #endif + partialSum_ += val; + } + + if (tid % 32 == 0) + smem[tid / 32] = partialSum_; + + __syncthreads(); + + if (tid < 8) + { + partialSum_ = smem[tid]; + + for (int j = 4; j >= 1; j /= 2) + { + #if __CUDACC_VER_MAJOR__ >= 9 + int val = __shfl_down_sync(0x000000FFU, partialSum_, j); + #else + int val = __shfl_down(partialSum_, j); + #endif + partialSum_ += val; + } + } + } + // end of code block + + // broadcast evaluated value && + // redistribute clipped samples evenly + + __shared__ int totalClipped; + __shared__ int redistBatch; + __shared__ int residual; + __shared__ int rStep; + + if (tid == 0) + { + totalClipped = partialSum_; + redistBatch = totalClipped / histSize; + residual = totalClipped - redistBatch * histSize; + + rStep = 1; + if (residual != 0) + rStep = histSize / residual; + } + + __syncthreads(); + + for (int i = tid; i < histSize; i += blockSize) + { + int histVal = hist(histRow, i); + + int equalized = histVal + redistBatch; + + if (residual && i % rStep == 0 && i / rStep < residual) + ++equalized; + + hist(histRow, i) = equalized; + } + } + + __shared__ int partialScan[blockSize]; + + for (int i = tid; i < histSize; i += blockSize) + { + int equalized = hist(histRow, i); + equalized = blockScanInclusive(equalized, smem, tid); + + if (tid == blockSize - 1) + partialScan[i / blockSize] = equalized; + + hist(histRow, i) = equalized; + } + + __syncthreads(); + + int partialScan_ = partialScan[tid]; + partialScan[tid] = blockScanExclusive(partialScan_, smem, tid); + + __syncthreads(); + + for (int i = tid; i < histSize; i += blockSize) + { + const int lutVal = hist(histRow, i) + partialScan[i / blockSize]; + + lut(histRow, i) = saturate_cast(__float2int_rn(lutScale * lutVal)); + } + + #undef histSize + #undef blockSize + } + + void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) { const dim3 block(32, 8); const dim3 grid(tilesX, tilesY); - calcLutKernel<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale); + calcLutKernel_8U<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale); CV_CUDEV_SAFE_CALL( cudaGetLastError() ); @@ -128,7 +338,21 @@ namespace clahe CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } - __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY) + void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(tilesX, tilesY); + + calcLutKernel_16U<<>>(src, lut, tileSize, tilesX, clipLimit, lutScale, hist); + + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + + template + __global__ void transformKernel(const PtrStepSz src, PtrStep dst, const PtrStep lut, const int2 tileSize, const int tilesX, const int tilesY) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -159,22 +383,26 @@ namespace clahe res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya)); res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya)); - dst(y, x) = saturate_cast(res); + dst(y, x) = saturate_cast(res); } - void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) + template + void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) { const dim3 block(32, 8); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) ); + CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) ); - transformKernel<<>>(src, dst, lut, tileSize, tilesX, tilesY); + transformKernel<<>>(src, dst, lut, tileSize, tilesX, tilesY); CV_CUDEV_SAFE_CALL( cudaGetLastError() ); if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); } #endif // CUDA_DISABLER diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index 6e219b641b..e616c5a2e9 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -141,8 +141,9 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) namespace clahe { - void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream); - void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); + void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream); + void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream); + template void transform(PtrStepSz src, PtrStepSz dst, PtrStep lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream); } namespace @@ -170,6 +171,7 @@ namespace GpuMat srcExt_; GpuMat lut_; + GpuMat hist_; // histogram on global memory for CV_16UC1 case }; CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) : @@ -186,14 +188,16 @@ namespace { GpuMat src = _src.getGpuMat(); - CV_Assert( src.type() == CV_8UC1 ); + const int type = src.type(); - _dst.create( src.size(), src.type() ); + CV_Assert( type == CV_8UC1 || type == CV_16UC1 ); + + _dst.create( src.size(), type ); GpuMat dst = _dst.getGpuMat(); - const int histSize = 256; + const int histSize = type == CV_8UC1 ? 256 : 65536; - ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_); + ensureSizeIsEnough(tilesX_ * tilesY_, histSize, type, lut_); cudaStream_t stream = StreamAccessor::getStream(s); @@ -227,9 +231,18 @@ namespace clipLimit = std::max(clipLimit, 1); } - clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream); + if (type == CV_8UC1) + clahe::calcLut_8U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream); + else // type == CV_16UC1 + { + ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_32SC1, hist_); + clahe::calcLut_16U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, hist_, stream); + } - clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); + if (type == CV_8UC1) + clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); + else // type == CV_16UC1 + clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream); } void CLAHE_Impl::setClipLimit(double clipLimit) diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp index eb084609e9..6af8eb2135 100644 --- a/modules/cudaimgproc/test/test_histogram.cpp +++ b/modules/cudaimgproc/test/test_histogram.cpp @@ -236,17 +236,19 @@ namespace IMPLEMENT_PARAM_CLASS(ClipLimit, double) } -PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit) +PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit, MatType) { cv::cuda::DeviceInfo devInfo; cv::Size size; double clipLimit; + int type; virtual void SetUp() { devInfo = GET_PARAM(0); size = GET_PARAM(1); clipLimit = GET_PARAM(2); + type = GET_PARAM(3); cv::cuda::setDevice(devInfo.deviceID()); } @@ -254,7 +256,11 @@ PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit) CUDA_TEST_P(CLAHE, Accuracy) { - cv::Mat src = randomMat(size, CV_8UC1); + cv::Mat src; + if (type == CV_8UC1) + src = randomMat(size, type); + else if (type == CV_16UC1) + src = randomMat(size, type, 0, 65535); cv::Ptr clahe = cv::cuda::createCLAHE(clipLimit); cv::cuda::GpuMat dst; @@ -270,7 +276,8 @@ CUDA_TEST_P(CLAHE, Accuracy) INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CLAHE, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, - testing::Values(0.0, 40.0))); + testing::Values(0.0, 5.0, 10.0, 20.0, 40.0), + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1)))); }} // namespace diff --git a/modules/imgproc/perf/perf_histogram.cpp b/modules/imgproc/perf/perf_histogram.cpp index 4f54e948bb..d80d8a6d51 100644 --- a/modules/imgproc/perf/perf_histogram.cpp +++ b/modules/imgproc/perf/perf_histogram.cpp @@ -141,18 +141,20 @@ PERF_TEST_P(Dim_Cmpmethod, compareHist, SANITY_CHECK_NOTHING(); } -typedef tuple Sz_ClipLimit_t; +typedef tuple Sz_ClipLimit_t; typedef TestBaseWithParam Sz_ClipLimit; PERF_TEST_P(Sz_ClipLimit, CLAHE, testing::Combine(testing::Values(::perf::szVGA, ::perf::sz720p, ::perf::sz1080p), - testing::Values(0.0, 40.0)) + testing::Values(0.0, 40.0), + testing::Values(MatType(CV_8UC1), MatType(CV_16UC1))) ) { const Size size = get<0>(GetParam()); const double clipLimit = get<1>(GetParam()); + const int type = get<2>(GetParam()); - Mat src(size, CV_8UC1); + Mat src(size, type); declare.in(src, WARMUP_RNG); Ptr clahe = createCLAHE(clipLimit); From 9cbdb48d6d55d40f2ada59a32822d040d5e282b3 Mon Sep 17 00:00:00 2001 From: Alexander Nesterov Date: Fri, 1 Feb 2019 16:23:51 -0100 Subject: [PATCH 5/9] Fix change step --- modules/dnn/src/layers/prior_box_layer.cpp | 19 +++++++++++------ modules/dnn/test/test_backends.cpp | 24 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 458d667cbc..ac11fe7ada 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -487,9 +487,7 @@ public: if (_explicitSizes) { InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name); - - CV_Assert(_stepX == _stepY); - ieLayer.setStep(_stepX); + ieLayer.setSteps({_stepY, _stepX}); CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); ieLayer.setOffset(_offsetsX[0]); @@ -516,9 +514,6 @@ public: if (_maxSize > 0) ieLayer.setMaxSize(_maxSize); - CV_Assert(_stepX == _stepY); - ieLayer.setStep(_stepX); - CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], ""); ieLayer.setOffset(_offsetsX[0]); @@ -526,6 +521,18 @@ public: ieLayer.setFlip(false); // We already flipped aspect ratios. InferenceEngine::Builder::Layer l = ieLayer; + if (_stepX == _stepY) + { + l.getParameters()["step"] = _stepX; + l.getParameters()["step_h"] = 0.0; + l.getParameters()["step_w"] = 0.0; + } + else + { + l.getParameters()["step"] = 0.0; + l.getParameters()["step_h"] = _stepY; + l.getParameters()["step_w"] = _stepX; + } if (!_aspectRatios.empty()) { l.getParameters()["aspect_ratio"] = _aspectRatios; diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 1d97cfc088..fbf045d01d 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -162,6 +162,18 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) inp, "detection_out", "", diffScores); } +TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height) +{ + if (backend == DNN_BACKEND_HALIDE) + throw SkipTestException(""); + Mat sample = imread(findDataFile("dnn/street.png", false)); + Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false); + float diffScores = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.029 : 0.0; + float diffSquares = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.09 : 0.0; + processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", + inp, "detection_out", "", diffScores, diffSquares); +} + TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) { if (backend == DNN_BACKEND_HALIDE) @@ -174,6 +186,18 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) inp, "detection_out", "", l1, lInf); } +TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height) +{ + if (backend == DNN_BACKEND_HALIDE) + throw SkipTestException(""); + Mat sample = imread(findDataFile("dnn/street.png", false)); + Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false); + float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 0.0; + float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0; + processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt", + inp, "detection_out", "", l1, lInf); +} + TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) { if (backend == DNN_BACKEND_HALIDE) From 07c10d6fc3a5e697aba2e9b34586d2bafef2ce85 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 7 Feb 2019 15:58:34 +0300 Subject: [PATCH 6/9] Fixed out of bound reading issue in erode() and dilate() --- modules/imgproc/src/morph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp index cb25a50c7b..c18e5c8066 100644 --- a/modules/imgproc/src/morph.cpp +++ b/modules/imgproc/src/morph.cpp @@ -159,7 +159,7 @@ template struct MorphRowVec i += vtype::nlanes/2; } - return i; + return i - i % cn; } int ksize, anchor; From 87f651c119d31564e95e8dfb512d68a9f70ffab8 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 7 Feb 2019 18:20:29 +0300 Subject: [PATCH 7/9] disabled sanity check for 32f --- modules/imgproc/perf/perf_cvt_color.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index e0b55b7fa0..ac4f62e29f 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -391,7 +391,7 @@ PERF_TEST_P(Size_CvtMode32F, cvtColor_32f, int runs = sz.width <= 320 ? 100 : 5; TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - SANITY_CHECK(dst, 1); + SANITY_CHECK_NOTHING(); } typedef tuple Size_CvtMode_Bayer_t; From 4e679e1cc5b075ec006b29a58b4fe117523fba1d Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 7 Feb 2019 19:26:36 +0300 Subject: [PATCH 8/9] disabled 16u and 32f perf tests --- modules/imgproc/perf/perf_cvt_color.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index ac4f62e29f..e3af113bed 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -344,7 +344,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u, typedef tuple Size_CvtMode16U_t; typedef perf::TestBaseWithParam Size_CvtMode16U; -PERF_TEST_P(Size_CvtMode16U, cvtColor_16u, +PERF_TEST_P(Size_CvtMode16U, DISABLED_cvtColor_16u, testing::Combine( testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), CvtMode16U::all() @@ -371,7 +371,7 @@ PERF_TEST_P(Size_CvtMode16U, cvtColor_16u, typedef tuple Size_CvtMode32F_t; typedef perf::TestBaseWithParam Size_CvtMode32F; -PERF_TEST_P(Size_CvtMode32F, cvtColor_32f, +PERF_TEST_P(Size_CvtMode32F, DISABLED_cvtColor_32f, testing::Combine( testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), CvtMode32F::all() From d7b1f28a906b057e18972eed856f82cf976fa067 Mon Sep 17 00:00:00 2001 From: AnastasiaaSenina Date: Thu, 7 Feb 2019 17:50:55 +0300 Subject: [PATCH 9/9] fixed bug: added threshold for variables 'rotate_a', ' rotate_c' --- modules/video/src/camshift.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/video/src/camshift.cpp b/modules/video/src/camshift.cpp index ed5426ab98..dc40462762 100644 --- a/modules/video/src/camshift.cpp +++ b/modules/video/src/camshift.cpp @@ -167,6 +167,8 @@ cv::RotatedRect cv::CamShift( InputArray _probImage, Rect& window, double rotate_a = cs * cs * mu20 + 2 * cs * sn * mu11 + sn * sn * mu02; double rotate_c = sn * sn * mu20 - 2 * cs * sn * mu11 + cs * cs * mu02; + rotate_a = std::max(0.0, rotate_a); // avoid negative result due calculation numeric errors + rotate_c = std::max(0.0, rotate_c); // avoid negative result due calculation numeric errors double length = std::sqrt( rotate_a * inv_m00 ) * 4; double width = std::sqrt( rotate_c * inv_m00 ) * 4;