From bc4e471847cb3209b7e14b0747ddc73af6ef221a Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Thu, 31 Jan 2019 16:10:59 +0300
Subject: [PATCH 1/9] Add a mutex for shared Inference Engine plugins

---
 modules/dnn/src/op_inf_engine.cpp | 11 ++++++++--
 modules/dnn/test/test_layers.cpp  | 34 +++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 9652e58581..0349d44710 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -622,7 +622,11 @@ void InfEngineBackendNet::init(int targetId)
 
 #endif  // IE < R5
 
-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
+{
+    static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+    return sharedPlugins;
+}
 
 void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
 {
@@ -630,6 +634,8 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
 
     try
     {
+        AutoLock lock(getInitializationMutex());
+        auto& sharedPlugins = getSharedPlugins();
         auto pluginIt = sharedPlugins.find(targetDevice);
         if (pluginIt != sharedPlugins.end())
         {
@@ -797,7 +803,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
 void resetMyriadDevice()
 {
 #ifdef HAVE_INF_ENGINE
-    sharedPlugins.erase(InferenceEngine::TargetDevice::eMYRIAD);
+    AutoLock lock(getInitializationMutex());
+    getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD);
 #endif  // HAVE_INF_ENGINE
 }
 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 77651ba617..f2f131333a 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -46,6 +46,10 @@
 #include <opencv2/dnn/all_layers.hpp>
 #include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS
 
+#ifdef HAVE_INF_ENGINE
+#include <thread>
+#endif
+
 namespace opencv_test { namespace {
 
 template<typename TString>
@@ -970,6 +974,36 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
     if (targetId != DNN_TARGET_MYRIAD)
         normAssert(outs[0], outs[1]);
 }
+
+TEST_P(Layer_Test_Convolution_DLDT, multithreading)
+{
+    Target targetId = GetParam();
+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    std::string xmlPath = _tf("layer_convolution" + suffix + ".xml");
+    std::string binPath = _tf("layer_convolution" + suffix + ".bin");
+    Net firstNet = readNet(xmlPath, binPath);
+    Net secondNet = readNet(xmlPath, binPath);
+    Mat inp = blobFromNPY(_tf("blob.npy"));
+
+    firstNet.setInput(inp);
+    secondNet.setInput(inp);
+    firstNet.setPreferableTarget(targetId);
+    secondNet.setPreferableTarget(targetId);
+
+    Mat out1, out2;
+    std::thread t1([&]{out1 = firstNet.forward();});
+    std::thread t2([&]{out2 = secondNet.forward();});
+
+    t1.join();
+    t2.join();
+
+    Mat ref = blobFromNPY(_tf("layer_convolution.npy"));
+    double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.5e-3 : 1e-5;
+    double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4;
+    normAssert(out1, ref, "first thread", l1, lInf);
+    normAssert(out2, ref, "second thread", l1, lInf);
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT,
     testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)));
 

From 6ad3bf31303378bdaa26a4829359ea14633a2c55 Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev <dmitry.kurtaev+github@gmail.com>
Date: Wed, 6 Feb 2019 13:05:11 +0300
Subject: [PATCH 2/9] Enable ResNet-based Mask-RCNN models from TensorFlow
 Object Detection API

---
 samples/dnn/tf_text_graph_mask_rcnn.py | 47 ++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/samples/dnn/tf_text_graph_mask_rcnn.py b/samples/dnn/tf_text_graph_mask_rcnn.py
index c8803088f9..24d8790d32 100644
--- a/samples/dnn/tf_text_graph_mask_rcnn.py
+++ b/samples/dnn/tf_text_graph_mask_rcnn.py
@@ -25,7 +25,8 @@ scopesToIgnore = ('FirstStageFeatureExtractor/Assert',
                   'FirstStageFeatureExtractor/Shape',
                   'FirstStageFeatureExtractor/strided_slice',
                   'FirstStageFeatureExtractor/GreaterEqual',
-                  'FirstStageFeatureExtractor/LogicalAnd')
+                  'FirstStageFeatureExtractor/LogicalAnd',
+                  'Conv/required_space_to_batch_paddings')
 
 # Load a config file.
 config = readTextMessage(args.config)
@@ -54,10 +55,30 @@ graph_def = parseTextGraph(args.output)
 
 removeIdentity(graph_def)
 
+nodesToKeep = []
 def to_remove(name, op):
+    if name in nodesToKeep:
+        return False
     return op == 'Const' or name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
            (name.startswith('CropAndResize') and op != 'CropAndResize')
 
+# Fuse atrous convolutions (with dilations).
+nodesMap = {node.name: node for node in graph_def.node}
+for node in reversed(graph_def.node):
+    if node.op == 'BatchToSpaceND':
+        del node.input[2]
+        conv = nodesMap[node.input[0]]
+        spaceToBatchND = nodesMap[conv.input[0]]
+
+        paddingsNode = NodeDef()
+        paddingsNode.name = conv.name + '/paddings'
+        paddingsNode.op = 'Const'
+        paddingsNode.addAttr('value', [2, 2, 2, 2])
+        graph_def.node.insert(graph_def.node.index(spaceToBatchND), paddingsNode)
+        nodesToKeep.append(paddingsNode.name)
+
+        spaceToBatchND.input[2] = paddingsNode.name
+
 removeUnusedNodesAndAttrs(to_remove, graph_def)
 
 
@@ -106,8 +127,8 @@ heights = []
 for a in aspect_ratios:
     for s in scales:
         ar = np.sqrt(a)
-        heights.append((features_stride**2) * s / ar)
-        widths.append((features_stride**2) * s * ar)
+        heights.append((height_stride**2) * s / ar)
+        widths.append((width_stride**2) * s * ar)
 
 proposals.addAttr('width', widths)
 proposals.addAttr('height', heights)
@@ -252,5 +273,25 @@ graph_def.node[-1].name = 'detection_masks'
 graph_def.node[-1].op = 'Sigmoid'
 graph_def.node[-1].input.pop()
 
+def getUnconnectedNodes():
+    unconnected = [node.name for node in graph_def.node]
+    for node in graph_def.node:
+        for inp in node.input:
+            if inp in unconnected:
+                unconnected.remove(inp)
+    return unconnected
+
+while True:
+    unconnectedNodes = getUnconnectedNodes()
+    unconnectedNodes.remove(graph_def.node[-1].name)
+    if not unconnectedNodes:
+        break
+
+    for name in unconnectedNodes:
+        for i in range(len(graph_def.node)):
+            if graph_def.node[i].name == name:
+                del graph_def.node[i]
+                break
+
 # Save as text.
 graph_def.save(args.output)

From bbedebb57cf44bb447837918a02c48979411271a Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@intel.com>
Date: Wed, 6 Feb 2019 17:56:44 +0300
Subject: [PATCH 3/9] perf tests for cvtColor for 16U and 32f added

---
 modules/imgproc/perf/perf_cvt_color.cpp | 120 ++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index e0f32fdab6..e0b55b7fa0 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -100,6 +100,72 @@ CV_ENUM(CvtMode,
     COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA
     )
 
+CV_ENUM(CvtMode16U,
+    COLOR_BGR2BGRA, COLOR_BGR2GRAY,
+    COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ,
+    COLOR_BGR2YCrCb, COLOR_BGR2YUV,
+
+    COLOR_BGRA2BGR, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA,
+    CX_BGRA2XYZ,
+    CX_BGRA2YCrCb, CX_BGRA2YUV,
+
+    COLOR_GRAY2BGR, COLOR_GRAY2BGRA,
+
+    COLOR_RGB2GRAY,
+    COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV,
+
+    COLOR_RGBA2BGR, COLOR_RGBA2GRAY,
+    CX_RGBA2XYZ,
+    CX_RGBA2YCrCb, CX_RGBA2YUV,
+
+    COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA,
+
+    COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA,
+    COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA
+    )
+
+CV_ENUM(CvtMode32F,
+    COLOR_BGR2BGRA, COLOR_BGR2GRAY,
+    COLOR_BGR2HLS, COLOR_BGR2HLS_FULL, COLOR_BGR2HSV, COLOR_BGR2HSV_FULL,
+    COLOR_BGR2Lab, COLOR_BGR2Luv, COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ,
+    COLOR_BGR2YCrCb, COLOR_BGR2YUV,
+
+    COLOR_BGRA2BGR, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA,
+    CX_BGRA2HLS, CX_BGRA2HLS_FULL, CX_BGRA2HSV, CX_BGRA2HSV_FULL,
+    CX_BGRA2Lab, CX_BGRA2Luv, CX_BGRA2XYZ,
+    CX_BGRA2YCrCb, CX_BGRA2YUV,
+
+    COLOR_GRAY2BGR, COLOR_GRAY2BGRA,
+
+    COLOR_HLS2BGR, COLOR_HLS2BGR_FULL, COLOR_HLS2RGB, COLOR_HLS2RGB_FULL,
+    CX_HLS2BGRA, CX_HLS2BGRA_FULL, CX_HLS2RGBA, CX_HLS2RGBA_FULL,
+
+    COLOR_HSV2BGR, COLOR_HSV2BGR_FULL, COLOR_HSV2RGB, COLOR_HSV2RGB_FULL,
+    CX_HSV2BGRA, CX_HSV2BGRA_FULL, CX_HSV2RGBA,    CX_HSV2RGBA_FULL,
+
+    COLOR_Lab2BGR, COLOR_Lab2LBGR, COLOR_Lab2LRGB, COLOR_Lab2RGB,
+    CX_Lab2BGRA, CX_Lab2LBGRA, CX_Lab2LRGBA, CX_Lab2RGBA,
+
+    COLOR_LBGR2Lab, COLOR_LBGR2Luv, COLOR_LRGB2Lab, COLOR_LRGB2Luv,
+    CX_LBGRA2Lab, CX_LBGRA2Luv, CX_LRGBA2Lab, CX_LRGBA2Luv,
+
+    COLOR_Luv2BGR, COLOR_Luv2LBGR, COLOR_Luv2LRGB, COLOR_Luv2RGB,
+    CX_Luv2BGRA, CX_Luv2LBGRA, CX_Luv2LRGBA, CX_Luv2RGBA,
+
+    COLOR_RGB2GRAY,
+    COLOR_RGB2HLS, COLOR_RGB2HLS_FULL, COLOR_RGB2HSV, COLOR_RGB2HSV_FULL,
+    COLOR_RGB2Lab, COLOR_RGB2Luv, COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV,
+
+    COLOR_RGBA2BGR, COLOR_RGBA2GRAY,
+    CX_RGBA2HLS, CX_RGBA2HLS_FULL, CX_RGBA2HSV, CX_RGBA2HSV_FULL,
+    CX_RGBA2Lab, CX_RGBA2Luv, CX_RGBA2XYZ,
+    CX_RGBA2YCrCb, CX_RGBA2YUV,
+
+    COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA,
+
+    COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA,
+    COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA
+    )
 
 CV_ENUM(CvtModeBayer,
     COLOR_BayerBG2BGR, COLOR_BayerBG2BGRA, COLOR_BayerBG2BGR_VNG, COLOR_BayerBG2GRAY,
@@ -274,6 +340,60 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u,
 #endif
 }
 
+
+typedef tuple<Size, CvtMode16U> Size_CvtMode16U_t;
+typedef perf::TestBaseWithParam<Size_CvtMode16U_t> Size_CvtMode16U;
+
+PERF_TEST_P(Size_CvtMode16U, cvtColor_16u,
+            testing::Combine(
+                testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p),
+                CvtMode16U::all()
+                )
+            )
+{
+    Size sz = get<0>(GetParam());
+    int _mode = get<1>(GetParam()), mode = _mode;
+    ChPair ch = getConversionInfo(mode);
+    mode %= COLOR_COLORCVT_MAX;
+    Mat src(sz, CV_16UC(ch.scn));
+    Mat dst(sz, CV_16UC(ch.scn));
+
+    declare.time(100);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    int runs = sz.width <= 320 ? 100 : 5;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
+
+    SANITY_CHECK(dst, 1);
+}
+
+
+typedef tuple<Size, CvtMode32F> Size_CvtMode32F_t;
+typedef perf::TestBaseWithParam<Size_CvtMode32F_t> Size_CvtMode32F;
+
+PERF_TEST_P(Size_CvtMode32F, cvtColor_32f,
+            testing::Combine(
+                testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p),
+                CvtMode32F::all()
+                )
+            )
+{
+    Size sz = get<0>(GetParam());
+    int _mode = get<1>(GetParam()), mode = _mode;
+    ChPair ch = getConversionInfo(mode);
+    mode %= COLOR_COLORCVT_MAX;
+    Mat src(sz, CV_32FC(ch.scn));
+    Mat dst(sz, CV_32FC(ch.scn));
+
+    declare.time(100);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    int runs = sz.width <= 320 ? 100 : 5;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
+
+    SANITY_CHECK(dst, 1);
+}
+
 typedef tuple<Size, CvtModeBayer> Size_CvtMode_Bayer_t;
 typedef perf::TestBaseWithParam<Size_CvtMode_Bayer_t> Size_CvtMode_Bayer;
 

From fb8e652c3f20d377e9f935faee370ed28fb60122 Mon Sep 17 00:00:00 2001
From: Namgoo Lee <devnglee@gmail.com>
Date: Tue, 5 Feb 2019 16:37:33 +0000
Subject: [PATCH 4/9] Add CV_16UC1 support for cuda::CLAHE

Due to size limit of shared memory, histogram is built on
the global memory for CV_16UC1 case.

The amount of memory needed for building histogram is:

    65536 * 4byte = 256KB

and shared memory limit is 48KB typically.

Added test cases for CV_16UC1 and various clip limits.
Added perf tests for CV_16UC1 on both CPU and CUDA code.

There was also a bug in CV_8UC1 case when redistributing
"residual" clipped pixels. Adding the test case where clip
limit is 5.0 exposes this bug.
---
 .../core/include/opencv2/core/cuda_types.hpp  |   2 +
 modules/cudaimgproc/perf/perf_histogram.cpp   |   8 +-
 modules/cudaimgproc/src/cuda/clahe.cu         | 256 +++++++++++++++++-
 modules/cudaimgproc/src/histogram.cpp         |  29 +-
 modules/cudaimgproc/test/test_histogram.cpp   |  13 +-
 modules/imgproc/perf/perf_histogram.cpp       |   8 +-
 6 files changed, 285 insertions(+), 31 deletions(-)

diff --git a/modules/core/include/opencv2/core/cuda_types.hpp b/modules/core/include/opencv2/core/cuda_types.hpp
index e2647c0455..45dc2cad1c 100644
--- a/modules/core/include/opencv2/core/cuda_types.hpp
+++ b/modules/core/include/opencv2/core/cuda_types.hpp
@@ -127,10 +127,12 @@ namespace cv
         };
 
         typedef PtrStepSz<unsigned char> PtrStepSzb;
+        typedef PtrStepSz<unsigned short> PtrStepSzus;
         typedef PtrStepSz<float> PtrStepSzf;
         typedef PtrStepSz<int> PtrStepSzi;
 
         typedef PtrStep<unsigned char> PtrStepb;
+        typedef PtrStep<unsigned short> PtrStepus;
         typedef PtrStep<float> PtrStepf;
         typedef PtrStep<int> PtrStepi;
 
diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp
index 105411da9e..585f2b0006 100644
--- a/modules/cudaimgproc/perf/perf_histogram.cpp
+++ b/modules/cudaimgproc/perf/perf_histogram.cpp
@@ -183,16 +183,18 @@ PERF_TEST_P(Sz, EqualizeHist,
 //////////////////////////////////////////////////////////////////////
 // CLAHE
 
-DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double);
+DEF_PARAM_TEST(Sz_ClipLimit, cv::Size, double, MatType);
 
 PERF_TEST_P(Sz_ClipLimit, CLAHE,
             Combine(CUDA_TYPICAL_MAT_SIZES,
-                    Values(0.0, 40.0)))
+                    Values(0.0, 40.0),
+                    Values(MatType(CV_8UC1), MatType(CV_16UC1))))
 {
     const cv::Size size = GET_PARAM(0);
     const double clipLimit = GET_PARAM(1);
+    const int type = GET_PARAM(2);
 
-    cv::Mat src(size, CV_8UC1);
+    cv::Mat src(size, type);
     declare.in(src, WARMUP_RNG);
 
     if (PERF_RUN_CUDA())
diff --git a/modules/cudaimgproc/src/cuda/clahe.cu b/modules/cudaimgproc/src/cuda/clahe.cu
index b66a7d8a66..75663c51e2 100644
--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
@@ -48,11 +48,11 @@ using namespace cv::cudev;
 
 namespace clahe
 {
-    __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
-                                  const int2 tileSize, const int tilesX,
-                                  const int clipLimit, const float lutScale)
+    __global__ void calcLutKernel_8U(const PtrStepb src, PtrStepb lut,
+                                     const int2 tileSize, const int tilesX,
+                                     const int clipLimit, const float lutScale)
     {
-        __shared__ int smem[512];
+        __shared__ int smem[256];
 
         const int tx = blockIdx.x;
         const int ty = blockIdx.y;
@@ -95,18 +95,28 @@ namespace clahe
             // broadcast evaluated value
 
             __shared__ int totalClipped;
+            __shared__ int redistBatch;
+            __shared__ int residual;
+            __shared__ int rStep;
 
             if (tid == 0)
+            {
                 totalClipped = clipped;
+                redistBatch = totalClipped / 256;
+                residual = totalClipped - redistBatch * 256;
+
+                rStep = 1;
+                if (residual != 0)
+                    rStep = 256 / residual;
+            }
+
             __syncthreads();
 
             // redistribute clipped samples evenly
 
-            int redistBatch = totalClipped / 256;
             tHistVal += redistBatch;
 
-            int residual = totalClipped - redistBatch * 256;
-            if (tid < residual)
+            if (residual && tid % rStep == 0 && tid / rStep < residual)
                 ++tHistVal;
         }
 
@@ -115,12 +125,212 @@ namespace clahe
         lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
     }
 
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
+    __global__ void calcLutKernel_16U(const PtrStepus src, PtrStepus lut,
+                                      const int2 tileSize, const int tilesX,
+                                      const int clipLimit, const float lutScale,
+                                      PtrStepSzi hist)
+    {
+        #define histSize 65536
+        #define blockSize 256
+
+        __shared__ int smem[blockSize];
+
+        const int tx = blockIdx.x;
+        const int ty = blockIdx.y;
+        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        const int histRow = ty * tilesX + tx;
+
+        // build histogram
+
+        for (int i = tid; i < histSize; i += blockSize)
+            hist(histRow, i) = 0;
+
+        __syncthreads();
+
+        for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
+        {
+            const ushort* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
+            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
+            {
+                const int data = srcPtr[j];
+                ::atomicAdd(&hist(histRow, data), 1);
+            }
+        }
+
+        __syncthreads();
+
+        if (clipLimit > 0)
+        {
+            // clip histogram bar &&
+            // find number of overall clipped samples
+
+            __shared__ int partialSum[blockSize];
+
+            for (int i = tid; i < histSize; i += blockSize)
+            {
+                int histVal = hist(histRow, i);
+
+                int clipped = 0;
+                if (histVal > clipLimit)
+                {
+                    clipped = histVal - clipLimit;
+                    hist(histRow, i) = clipLimit;
+                }
+
+                // Following code block is in effect equivalent to:
+                //
+                //      blockReduce<blockSize>(smem, clipped, tid, plus<int>());
+                //
+                {
+                    for (int j = 16; j >= 1; j /= 2)
+                    {
+                    #if __CUDACC_VER_MAJOR__ >= 9
+                        int val = __shfl_down_sync(0xFFFFFFFFU, clipped, j);
+                    #else
+                        int val = __shfl_down(clipped, j);
+                    #endif
+                        clipped += val;
+                    }
+
+                    if (tid % 32 == 0)
+                        smem[tid / 32] = clipped;
+
+                    __syncthreads();
+
+                    if (tid < 8)
+                    {
+                        clipped = smem[tid];
+
+                        for (int j = 4; j >= 1; j /= 2)
+                        {
+                        #if __CUDACC_VER_MAJOR__ >= 9
+                            int val = __shfl_down_sync(0x000000FFU, clipped, j);
+                        #else
+                            int val = __shfl_down(clipped, j);
+                        #endif
+                            clipped += val;
+                        }
+                    }
+                }
+                // end of code block
+
+                if (tid == 0)
+                    partialSum[i / blockSize] = clipped;
+
+                __syncthreads();
+            }
+
+            int partialSum_ = partialSum[tid];
+
+            // Following code block is in effect equivalent to:
+            //
+            //      blockReduce<blockSize>(smem, partialSum_, tid, plus<int>());
+            //
+            {
+                for (int j = 16; j >= 1; j /= 2)
+                {
+                #if __CUDACC_VER_MAJOR__ >= 9
+                    int val = __shfl_down_sync(0xFFFFFFFFU, partialSum_, j);
+                #else
+                    int val = __shfl_down(partialSum_, j);
+                #endif
+                    partialSum_ += val;
+                }
+
+                if (tid % 32 == 0)
+                    smem[tid / 32] = partialSum_;
+
+                __syncthreads();
+
+                if (tid < 8)
+                {
+                    partialSum_ = smem[tid];
+
+                    for (int j = 4; j >= 1; j /= 2)
+                    {
+                    #if __CUDACC_VER_MAJOR__ >= 9
+                        int val = __shfl_down_sync(0x000000FFU, partialSum_, j);
+                    #else
+                        int val = __shfl_down(partialSum_, j);
+                    #endif
+                        partialSum_ += val;
+                    }
+                }
+            }
+            // end of code block
+
+            // broadcast evaluated value &&
+            // redistribute clipped samples evenly
+
+            __shared__ int totalClipped;
+            __shared__ int redistBatch;
+            __shared__ int residual;
+            __shared__ int rStep;
+
+            if (tid == 0)
+            {
+                totalClipped = partialSum_;
+                redistBatch = totalClipped / histSize;
+                residual = totalClipped - redistBatch * histSize;
+
+                rStep = 1;
+                if (residual != 0)
+                    rStep = histSize / residual;
+            }
+
+            __syncthreads();
+
+            for (int i = tid; i < histSize; i += blockSize)
+            {
+                int histVal = hist(histRow, i);
+
+                int equalized = histVal + redistBatch;
+
+                if (residual && i % rStep == 0 && i / rStep < residual)
+                    ++equalized;
+
+                hist(histRow, i) = equalized;
+            }
+        }
+
+        __shared__ int partialScan[blockSize];
+
+        for (int i = tid; i < histSize; i += blockSize)
+        {
+            int equalized = hist(histRow, i);
+            equalized = blockScanInclusive<blockSize>(equalized, smem, tid);
+
+            if (tid == blockSize - 1)
+                partialScan[i / blockSize] = equalized;
+
+            hist(histRow, i) = equalized;
+        }
+
+        __syncthreads();
+
+        int partialScan_ = partialScan[tid];
+        partialScan[tid] = blockScanExclusive<blockSize>(partialScan_, smem, tid);
+
+        __syncthreads();
+
+        for (int i = tid; i < histSize; i += blockSize)
+        {
+            const int lutVal = hist(histRow, i) + partialScan[i / blockSize];
+
+            lut(histRow, i) = saturate_cast<ushort>(__float2int_rn(lutScale * lutVal));
+        }
+
+        #undef histSize
+        #undef blockSize
+    }
+
+    void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(tilesX, tilesY);
 
-        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
+        calcLutKernel_8U<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
 
         CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
@@ -128,7 +338,21 @@ namespace clahe
             CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
 
-    __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
+    void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(tilesX, tilesY);
+
+        calcLutKernel_16U<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale, hist);
+
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+
+        if (stream == 0)
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    __global__ void transformKernel(const PtrStepSz<T> src, PtrStep<T> dst, const PtrStep<T> lut, const int2 tileSize, const int tilesX, const int tilesY)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -159,22 +383,26 @@ namespace clahe
         res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
         res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
 
-        dst(y, x) = saturate_cast<uchar>(res);
+        dst(y, x) = saturate_cast<T>(res);
     }
 
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
+    template <typename T>
+    void transform(PtrStepSz<T> src, PtrStepSz<T> dst, PtrStep<T> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
 
-        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel<T>, cudaFuncCachePreferL1) );
 
-        transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
+        transformKernel<T><<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
         CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
         if (stream == 0)
             CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
     }
+
+    template void transform<uchar>(PtrStepSz<uchar> src, PtrStepSz<uchar> dst, PtrStep<uchar> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
+    template void transform<ushort>(PtrStepSz<ushort> src, PtrStepSz<ushort> dst, PtrStep<ushort> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
 }
 
 #endif // CUDA_DISABLER
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index 6e219b641b..e616c5a2e9 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -141,8 +141,9 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
 
 namespace clahe
 {
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
+    void calcLut_8U(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
+    void calcLut_16U(PtrStepSzus src, PtrStepus lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, PtrStepSzi hist, cudaStream_t stream);
+    template <typename T> void transform(PtrStepSz<T> src, PtrStepSz<T> dst, PtrStep<T> lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
 }
 
 namespace
@@ -170,6 +171,7 @@ namespace
 
         GpuMat srcExt_;
         GpuMat lut_;
+        GpuMat hist_; // histogram on global memory for CV_16UC1 case
     };
 
     CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@@ -186,14 +188,16 @@ namespace
     {
         GpuMat src = _src.getGpuMat();
 
-        CV_Assert( src.type() == CV_8UC1 );
+        const int type = src.type();
 
-        _dst.create( src.size(), src.type() );
+        CV_Assert( type == CV_8UC1 || type == CV_16UC1 );
+
+        _dst.create( src.size(), type );
         GpuMat dst = _dst.getGpuMat();
 
-        const int histSize = 256;
+        const int histSize = type == CV_8UC1 ? 256 : 65536;
 
-        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
+        ensureSizeIsEnough(tilesX_ * tilesY_, histSize, type, lut_);
 
         cudaStream_t stream = StreamAccessor::getStream(s);
 
@@ -227,9 +231,18 @@ namespace
             clipLimit = std::max(clipLimit, 1);
         }
 
-        clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
+        if (type == CV_8UC1)
+            clahe::calcLut_8U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
+        else // type == CV_16UC1
+        {
+            ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_32SC1, hist_);
+            clahe::calcLut_16U(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, hist_, stream);
+        }
 
-        clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
+        if (type == CV_8UC1)
+            clahe::transform<uchar>(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
+        else // type == CV_16UC1
+            clahe::transform<ushort>(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
     }
 
     void CLAHE_Impl::setClipLimit(double clipLimit)
diff --git a/modules/cudaimgproc/test/test_histogram.cpp b/modules/cudaimgproc/test/test_histogram.cpp
index eb084609e9..6af8eb2135 100644
--- a/modules/cudaimgproc/test/test_histogram.cpp
+++ b/modules/cudaimgproc/test/test_histogram.cpp
@@ -236,17 +236,19 @@ namespace
     IMPLEMENT_PARAM_CLASS(ClipLimit, double)
 }
 
-PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit)
+PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit, MatType)
 {
     cv::cuda::DeviceInfo devInfo;
     cv::Size size;
     double clipLimit;
+    int type;
 
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
         size = GET_PARAM(1);
         clipLimit = GET_PARAM(2);
+        type = GET_PARAM(3);
 
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -254,7 +256,11 @@ PARAM_TEST_CASE(CLAHE, cv::cuda::DeviceInfo, cv::Size, ClipLimit)
 
 CUDA_TEST_P(CLAHE, Accuracy)
 {
-    cv::Mat src = randomMat(size, CV_8UC1);
+    cv::Mat src;
+    if (type == CV_8UC1)
+        src = randomMat(size, type);
+    else if (type == CV_16UC1)
+        src = randomMat(size, type, 0, 65535);
 
     cv::Ptr<cv::cuda::CLAHE> clahe = cv::cuda::createCLAHE(clipLimit);
     cv::cuda::GpuMat dst;
@@ -270,7 +276,8 @@ CUDA_TEST_P(CLAHE, Accuracy)
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CLAHE, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    testing::Values(0.0, 40.0)));
+    testing::Values(0.0, 5.0, 10.0, 20.0, 40.0),
+    testing::Values(MatType(CV_8UC1), MatType(CV_16UC1))));
 
 
 }} // namespace
diff --git a/modules/imgproc/perf/perf_histogram.cpp b/modules/imgproc/perf/perf_histogram.cpp
index 4f54e948bb..d80d8a6d51 100644
--- a/modules/imgproc/perf/perf_histogram.cpp
+++ b/modules/imgproc/perf/perf_histogram.cpp
@@ -141,18 +141,20 @@ PERF_TEST_P(Dim_Cmpmethod, compareHist,
     SANITY_CHECK_NOTHING();
 }
 
-typedef tuple<Size, double> Sz_ClipLimit_t;
+typedef tuple<Size, double, MatType> Sz_ClipLimit_t;
 typedef TestBaseWithParam<Sz_ClipLimit_t> Sz_ClipLimit;
 
 PERF_TEST_P(Sz_ClipLimit, CLAHE,
             testing::Combine(testing::Values(::perf::szVGA, ::perf::sz720p, ::perf::sz1080p),
-                             testing::Values(0.0, 40.0))
+                             testing::Values(0.0, 40.0),
+                             testing::Values(MatType(CV_8UC1), MatType(CV_16UC1)))
             )
 {
     const Size size = get<0>(GetParam());
     const double clipLimit = get<1>(GetParam());
+    const int type = get<2>(GetParam());
 
-    Mat src(size, CV_8UC1);
+    Mat src(size, type);
     declare.in(src, WARMUP_RNG);
 
     Ptr<CLAHE> clahe = createCLAHE(clipLimit);

From 9cbdb48d6d55d40f2ada59a32822d040d5e282b3 Mon Sep 17 00:00:00 2001
From: Alexander Nesterov <nesoldr@mail.ru>
Date: Fri, 1 Feb 2019 16:23:51 -0100
Subject: [PATCH 5/9] Fix change step

---
 modules/dnn/src/layers/prior_box_layer.cpp | 19 +++++++++++------
 modules/dnn/test/test_backends.cpp         | 24 ++++++++++++++++++++++
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
index 458d667cbc..ac11fe7ada 100644
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -487,9 +487,7 @@ public:
         if (_explicitSizes)
         {
             InferenceEngine::Builder::PriorBoxClusteredLayer ieLayer(name);
-
-            CV_Assert(_stepX == _stepY);
-            ieLayer.setStep(_stepX);
+            ieLayer.setSteps({_stepY, _stepX});
 
             CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
             ieLayer.setOffset(_offsetsX[0]);
@@ -516,9 +514,6 @@ public:
             if (_maxSize > 0)
                 ieLayer.setMaxSize(_maxSize);
 
-            CV_Assert(_stepX == _stepY);
-            ieLayer.setStep(_stepX);
-
             CV_CheckEQ(_offsetsX.size(), (size_t)1, ""); CV_CheckEQ(_offsetsY.size(), (size_t)1, ""); CV_CheckEQ(_offsetsX[0], _offsetsY[0], "");
             ieLayer.setOffset(_offsetsX[0]);
 
@@ -526,6 +521,18 @@ public:
             ieLayer.setFlip(false);  // We already flipped aspect ratios.
 
             InferenceEngine::Builder::Layer l = ieLayer;
+            if (_stepX == _stepY)
+            {
+                l.getParameters()["step"] = _stepX;
+                l.getParameters()["step_h"] = 0.0;
+                l.getParameters()["step_w"] = 0.0;
+            }
+            else
+            {
+                l.getParameters()["step"] = 0.0;
+                l.getParameters()["step_h"] = _stepY;
+                l.getParameters()["step_w"] = _stepX;
+            }
             if (!_aspectRatios.empty())
             {
                 l.getParameters()["aspect_ratio"] = _aspectRatios;
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index 1d97cfc088..fbf045d01d 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -162,6 +162,18 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
                inp, "detection_out", "", diffScores);
 }
 
+TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
+{
+    if (backend == DNN_BACKEND_HALIDE)
+        throw SkipTestException("");
+    Mat sample = imread(findDataFile("dnn/street.png", false));
+    Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false);
+    float diffScores  = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.029 : 0.0;
+    float diffSquares = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.09  : 0.0;
+    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
+               inp, "detection_out", "", diffScores, diffSquares);
+}
+
 TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
 {
     if (backend == DNN_BACKEND_HALIDE)
@@ -174,6 +186,18 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow)
                inp, "detection_out", "", l1, lInf);
 }
 
+TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height)
+{
+    if (backend == DNN_BACKEND_HALIDE)
+        throw SkipTestException("");
+    Mat sample = imread(findDataFile("dnn/street.png", false));
+    Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false);
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.012 : 0.0;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.06 : 0.0;
+    processNet("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", "dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt",
+               inp, "detection_out", "", l1, lInf);
+}
+
 TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow)
 {
     if (backend == DNN_BACKEND_HALIDE)

From 07c10d6fc3a5e697aba2e9b34586d2bafef2ce85 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <terfendail@mediana.jetos.com>
Date: Thu, 7 Feb 2019 15:58:34 +0300
Subject: [PATCH 6/9] Fixed out of bound reading issue in erode() and dilate()

---
 modules/imgproc/src/morph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index cb25a50c7b..c18e5c8066 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -159,7 +159,7 @@ template<class VecUpdate> struct MorphRowVec
             i += vtype::nlanes/2;
         }
 
-        return i;
+        return i - i % cn;
     }
 
     int ksize, anchor;

From 87f651c119d31564e95e8dfb512d68a9f70ffab8 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@intel.com>
Date: Thu, 7 Feb 2019 18:20:29 +0300
Subject: [PATCH 7/9] disabled sanity check for 32f

---
 modules/imgproc/perf/perf_cvt_color.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index e0b55b7fa0..ac4f62e29f 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -391,7 +391,7 @@ PERF_TEST_P(Size_CvtMode32F, cvtColor_32f,
     int runs = sz.width <= 320 ? 100 : 5;
     TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
-    SANITY_CHECK(dst, 1);
+    SANITY_CHECK_NOTHING();
 }
 
 typedef tuple<Size, CvtModeBayer> Size_CvtMode_Bayer_t;

From 4e679e1cc5b075ec006b29a58b4fe117523fba1d Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@intel.com>
Date: Thu, 7 Feb 2019 19:26:36 +0300
Subject: [PATCH 8/9] disabled 16u and 32f perf tests

---
 modules/imgproc/perf/perf_cvt_color.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index ac4f62e29f..e3af113bed 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -344,7 +344,7 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u,
 typedef tuple<Size, CvtMode16U> Size_CvtMode16U_t;
 typedef perf::TestBaseWithParam<Size_CvtMode16U_t> Size_CvtMode16U;
 
-PERF_TEST_P(Size_CvtMode16U, cvtColor_16u,
+PERF_TEST_P(Size_CvtMode16U, DISABLED_cvtColor_16u,
             testing::Combine(
                 testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p),
                 CvtMode16U::all()
@@ -371,7 +371,7 @@ PERF_TEST_P(Size_CvtMode16U, cvtColor_16u,
 typedef tuple<Size, CvtMode32F> Size_CvtMode32F_t;
 typedef perf::TestBaseWithParam<Size_CvtMode32F_t> Size_CvtMode32F;
 
-PERF_TEST_P(Size_CvtMode32F, cvtColor_32f,
+PERF_TEST_P(Size_CvtMode32F, DISABLED_cvtColor_32f,
             testing::Combine(
                 testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p),
                 CvtMode32F::all()

From d7b1f28a906b057e18972eed856f82cf976fa067 Mon Sep 17 00:00:00 2001
From: AnastasiaaSenina <miss.s-anas.itmm@yandex.ru>
Date: Thu, 7 Feb 2019 17:50:55 +0300
Subject: [PATCH 9/9]  fixed bug: added threshold for variables 'rotate_a', '
 rotate_c'

---
 modules/video/src/camshift.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/video/src/camshift.cpp b/modules/video/src/camshift.cpp
index ed5426ab98..dc40462762 100644
--- a/modules/video/src/camshift.cpp
+++ b/modules/video/src/camshift.cpp
@@ -167,6 +167,8 @@ cv::RotatedRect cv::CamShift( InputArray _probImage, Rect& window,
 
     double rotate_a = cs * cs * mu20 + 2 * cs * sn * mu11 + sn * sn * mu02;
     double rotate_c = sn * sn * mu20 - 2 * cs * sn * mu11 + cs * cs * mu02;
+    rotate_a = std::max(0.0, rotate_a);  // avoid negative result due calculation numeric errors
+    rotate_c = std::max(0.0, rotate_c);  // avoid negative result due calculation numeric errors
     double length = std::sqrt( rotate_a * inv_m00 ) * 4;
     double width = std::sqrt( rotate_c * inv_m00 ) * 4;