From d4af89781b81cdc2ffa93af2a04a26370575d230 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Thu, 11 Jun 2020 23:06:18 +0900
Subject: [PATCH 01/15] fix corner case of libnvcuvid   * detect header
 automatically and not based on version number

---
 cmake/OpenCVDetectCUDA.cmake                 | 21 +++++++++++++++++++-
 cmake/templates/cvconfig.h.in                |  2 ++
 modules/cudacodec/src/cuvid_video_source.hpp |  4 ++--
 modules/cudacodec/src/frame_queue.hpp        |  4 ++--
 modules/cudacodec/src/precomp.hpp            |  4 ++--
 modules/cudacodec/src/video_decoder.hpp      |  4 ++--
 modules/cudacodec/src/video_parser.hpp       |  4 ++--
 7 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 98a00fdd87..37a77cd660 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -38,11 +38,30 @@ if(CUDA_FOUND)
   endif()
 
   if(WITH_NVCUVID)
+    # Set ${_result} to 1 if ${_filename} is found in an "include" directory below
+    # CUDA_TOOLKIT_TARGET_DIR, CUDA_TOOLKIT_ROOT_DIR, $CUDA_PATH or $CUDA_INC_PATH, else to 0.
+    macro(SEARCH_NVCUVID_HEADER _filename _result)
+      find_path(_header_result "${_filename}"
+        PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}" ENV CUDA_PATH ENV CUDA_INC_PATH
+        PATH_SUFFIXES include
+        NO_DEFAULT_PATH
+        )
+      if(_header_result)
+        set(${_result} 1)
+      else()
+        set(${_result} 0)
+      endif()
+      # find_path() skips the search once its cache entry holds a path; clear it per lookup.
+      unset(_header_result CACHE)
+    endmacro()
+    SEARCH_NVCUVID_HEADER("nvcuvid.h" HAVE_NVCUVID_HEADER)
+    SEARCH_NVCUVID_HEADER("dynlink_nvcuvid.h" HAVE_DYNLINK_NVCUVID_HEADER)
     find_cuda_helper_libs(nvcuvid)
     if(WIN32)
       find_cuda_helper_libs(nvcuvenc)
     endif()
-    if(CUDA_nvcuvid_LIBRARY)
+    if(CUDA_nvcuvid_LIBRARY AND (${HAVE_NVCUVID_HEADER} OR ${HAVE_DYNLINK_NVCUVID_HEADER}))
+      # make sure to have both header and library before enabling
       set(HAVE_NVCUVID 1)
     endif()
     if(CUDA_nvcuvenc_LIBRARY)
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index f7656e9d71..9a42bfa3d1 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -127,6 +127,8 @@
 
 /* NVIDIA Video Decoding API*/
 #cmakedefine HAVE_NVCUVID
+#cmakedefine HAVE_NVCUVID_HEADER
+#cmakedefine HAVE_DYNLINK_NVCUVID_HEADER
 
 /* NVIDIA Video Encoding API*/
 #cmakedefine HAVE_NVCUVENC
diff --git a/modules/cudacodec/src/cuvid_video_source.hpp b/modules/cudacodec/src/cuvid_video_source.hpp
index 802e65a92c..4dd7761620 100644
--- a/modules/cudacodec/src/cuvid_video_source.hpp
+++ b/modules/cudacodec/src/cuvid_video_source.hpp
@@ -44,9 +44,9 @@
 #ifndef __CUVID_VIDEO_SOURCE_HPP__
 #define __CUVID_VIDEO_SOURCE_HPP__
 
-#if CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
+#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
     #include <dynlink_nvcuvid.h>
-#else
+#elif defined(HAVE_NVCUVID_HEADER)
     #include <nvcuvid.h>
 #endif
 #include "opencv2/core/private.cuda.hpp"
diff --git a/modules/cudacodec/src/frame_queue.hpp b/modules/cudacodec/src/frame_queue.hpp
index 3ff06a67ed..f7fe7ad57d 100644
--- a/modules/cudacodec/src/frame_queue.hpp
+++ b/modules/cudacodec/src/frame_queue.hpp
@@ -47,9 +47,9 @@
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.cuda.hpp"
 
-#if CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
+#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
     #include <dynlink_nvcuvid.h>
-#else
+#elif defined(HAVE_NVCUVID_HEADER)
     #include <nvcuvid.h>
 #endif
 
diff --git a/modules/cudacodec/src/precomp.hpp b/modules/cudacodec/src/precomp.hpp
index 728924fa61..e453d90197 100644
--- a/modules/cudacodec/src/precomp.hpp
+++ b/modules/cudacodec/src/precomp.hpp
@@ -56,9 +56,9 @@
 #include "opencv2/core/private.cuda.hpp"
 
 #ifdef HAVE_NVCUVID
-    #if CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
+    #if defined(HAVE_DYNLINK_NVCUVID_HEADER)
         #include <dynlink_nvcuvid.h>
-    #else
+    #elif defined(HAVE_NVCUVID_HEADER)
         #include <nvcuvid.h>
     #endif
 
diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp
index 30878caa7b..46caccaf83 100644
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -44,9 +44,9 @@
 #ifndef __VIDEO_DECODER_HPP__
 #define __VIDEO_DECODER_HPP__
 
-#if CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
+#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
     #include <dynlink_nvcuvid.h>
-#else
+#elif defined(HAVE_NVCUVID_HEADER)
     #include <nvcuvid.h>
 #endif
 
diff --git a/modules/cudacodec/src/video_parser.hpp b/modules/cudacodec/src/video_parser.hpp
index 5bd0f96562..03fff8e96a 100644
--- a/modules/cudacodec/src/video_parser.hpp
+++ b/modules/cudacodec/src/video_parser.hpp
@@ -44,9 +44,9 @@
 #ifndef __VIDEO_PARSER_HPP__
 #define __VIDEO_PARSER_HPP__
 
-#if CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
+#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
     #include <dynlink_nvcuvid.h>
-#else
+#elif defined(HAVE_NVCUVID_HEADER)
     #include <nvcuvid.h>
 #endif
 

From 442999dcdb0e51fe48661fe92803a568a52eeec9 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 12 Jun 2020 10:07:58 +0000
Subject: [PATCH 02/15] core: fix handling of ND-arrays in dumpInputArray()
 helpers

---
 modules/core/src/bindings_utils.cpp | 94 ++++++++++++++++++++++++-----
 modules/python/test/test_misc.py    | 21 +++++--
 2 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/modules/core/src/bindings_utils.cpp b/modules/core/src/bindings_utils.cpp
index 432f280e52..050b7247f8 100644
--- a/modules/core/src/bindings_utils.cpp
+++ b/modules/core/src/bindings_utils.cpp
@@ -25,9 +25,26 @@ String dumpInputArray(InputArray argument)
                 break; // done
             }
             ss << cv::format(" total(-1)=%lld", (long long int)argument.total(-1));
-            ss << cv::format(" dims(-1)=%d", argument.dims(-1));
-            Size size = argument.size(-1);
-            ss << cv::format(" size(-1)=%dx%d", size.width, size.height);
+            int dims = argument.dims(-1);
+            ss << cv::format(" dims(-1)=%d", dims);
+            if (dims <= 2)
+            {
+                Size size = argument.size(-1);
+                ss << cv::format(" size(-1)=%dx%d", size.width, size.height);
+            }
+            else
+            {
+                int sz[CV_MAX_DIM] = {0};
+                argument.sizend(sz, -1);
+                ss << " size(-1)=[";
+                for (int i = 0; i < dims; i++)
+                {
+                    if (i > 0)
+                        ss << ' ';
+                    ss << sz[i];
+                }
+                ss << "]";
+            }
             ss << " type(-1)=" << cv::typeToString(argument.type(-1));
         } while (0);
     }
@@ -61,10 +78,26 @@ CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument)
             if (argument.total(-1) > 0)
             {
                 ss << " type(0)=" << cv::typeToString(argument.type(0));
-                ss << cv::format(" dims(0)=%d", argument.dims(0));
-                size = argument.size(0);
-                ss << cv::format(" size(0)=%dx%d", size.width, size.height);
-                ss << " type(0)=" << cv::typeToString(argument.type(0));
+                int dims = argument.dims(0);
+                ss << cv::format(" dims(0)=%d", dims);
+                if (dims <= 2)
+                {
+                    Size size0 = argument.size(0);
+                    ss << cv::format(" size(0)=%dx%d", size0.width, size0.height);
+                }
+                else
+                {
+                    int sz[CV_MAX_DIM] = {0};
+                    argument.sizend(sz, 0);
+                    ss << " size(0)=[";
+                    for (int i = 0; i < dims; i++)
+                    {
+                        if (i > 0)
+                            ss << ' ';
+                        ss << sz[i];
+                    }
+                    ss << "]";
+                }
             }
         } while (0);
     }
@@ -92,9 +125,26 @@ CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument)
                 break; // done
             }
             ss << cv::format(" total(-1)=%lld", (long long int)argument.total(-1));
-            ss << cv::format(" dims(-1)=%d", argument.dims(-1));
-            Size size = argument.size(-1);
-            ss << cv::format(" size(-1)=%dx%d", size.width, size.height);
+            int dims = argument.dims(-1);
+            ss << cv::format(" dims(-1)=%d", dims);
+            if (dims <= 2)
+            {
+                Size size = argument.size(-1);
+                ss << cv::format(" size(-1)=%dx%d", size.width, size.height);
+            }
+            else
+            {
+                int sz[CV_MAX_DIM] = {0};
+                argument.sizend(sz, -1);
+                ss << " size(-1)=[";
+                for (int i = 0; i < dims; i++)
+                {
+                    if (i > 0)
+                        ss << ' ';
+                    ss << sz[i];
+                }
+                ss << "]";
+            }
             ss << " type(-1)=" << cv::typeToString(argument.type(-1));
         } while (0);
     }
@@ -128,10 +178,26 @@ CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argume
             if (argument.total(-1) > 0)
             {
                 ss << " type(0)=" << cv::typeToString(argument.type(0));
-                ss << cv::format(" dims(0)=%d", argument.dims(0));
-                size = argument.size(0);
-                ss << cv::format(" size(0)=%dx%d", size.width, size.height);
-                ss << " type(0)=" << cv::typeToString(argument.type(0));
+                int dims = argument.dims(0);
+                ss << cv::format(" dims(0)=%d", dims);
+                if (dims <= 2)
+                {
+                    Size size0 = argument.size(0);
+                    ss << cv::format(" size(0)=%dx%d", size0.width, size0.height);
+                }
+                else
+                {
+                    int sz[CV_MAX_DIM] = {0};
+                    argument.sizend(sz, 0);
+                    ss << " size(0)=[";
+                    for (int i = 0; i < dims; i++)
+                    {
+                        if (i > 0)
+                            ss << ' ';
+                        ss << sz[i];
+                    }
+                    ss << "]";
+                }
             }
         } while (0);
     }
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py
index b25ef7efbb..f9a350d160 100644
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@@ -105,22 +105,35 @@ class Arguments(NewOpenCVTests):
         a = np.array([[[1, 2]], [[3, 4]], [[5, 6]]], dtype=float)
         res5 = cv.utils.dumpInputArray(a)  # 64FC2
         self.assertEqual(res5, "InputArray: empty()=false kind=0x00010000 flags=0x01010000 total(-1)=3 dims(-1)=2 size(-1)=1x3 type(-1)=CV_64FC2")
+        a = np.zeros((2,3,4), dtype='f')
+        res6 = cv.utils.dumpInputArray(a)
+        self.assertEqual(res6, "InputArray: empty()=false kind=0x00010000 flags=0x01010000 total(-1)=6 dims(-1)=2 size(-1)=3x2 type(-1)=CV_32FC4")
+        a = np.zeros((2,3,4,5), dtype='f')
+        res7 = cv.utils.dumpInputArray(a)
+        self.assertEqual(res7, "InputArray: empty()=false kind=0x00010000 flags=0x01010000 total(-1)=120 dims(-1)=4 size(-1)=[2 3 4 5] type(-1)=CV_32FC1")
 
     def test_InputArrayOfArrays(self):
         res1 = cv.utils.dumpInputArrayOfArrays(None)
         # self.assertEqual(res1, "InputArray: noArray()")  # not supported
         self.assertEqual(res1, "InputArrayOfArrays: empty()=true kind=0x00050000 flags=0x01050000 total(-1)=0 dims(-1)=1 size(-1)=0x0")
         res2_1 = cv.utils.dumpInputArrayOfArrays((1, 2))  # { Scalar:all(1), Scalar::all(2) }
-        self.assertEqual(res2_1, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_64FC1 dims(0)=2 size(0)=1x4 type(0)=CV_64FC1")
+        self.assertEqual(res2_1, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_64FC1 dims(0)=2 size(0)=1x4")
         res2_2 = cv.utils.dumpInputArrayOfArrays([1.5])
-        self.assertEqual(res2_2, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=1 dims(-1)=1 size(-1)=1x1 type(0)=CV_64FC1 dims(0)=2 size(0)=1x4 type(0)=CV_64FC1")
+        self.assertEqual(res2_2, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=1 dims(-1)=1 size(-1)=1x1 type(0)=CV_64FC1 dims(0)=2 size(0)=1x4")
         a = np.array([[1, 2], [3, 4], [5, 6]])
         b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
         res3 = cv.utils.dumpInputArrayOfArrays([a, b])
-        self.assertEqual(res3, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_32SC1 dims(0)=2 size(0)=2x3 type(0)=CV_32SC1")
+        self.assertEqual(res3, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_32SC1 dims(0)=2 size(0)=2x3")
         c = np.array([[[1, 2], [3, 4], [5, 6]]], dtype='f')
         res4 = cv.utils.dumpInputArrayOfArrays([c, a, b])
-        self.assertEqual(res4, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=3 dims(-1)=1 size(-1)=3x1 type(0)=CV_32FC2 dims(0)=2 size(0)=3x1 type(0)=CV_32FC2")
+        self.assertEqual(res4, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=3 dims(-1)=1 size(-1)=3x1 type(0)=CV_32FC2 dims(0)=2 size(0)=3x1")
+        a = np.zeros((2,3,4), dtype='f')
+        res5 = cv.utils.dumpInputArrayOfArrays([a, b])
+        self.assertEqual(res5, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_32FC4 dims(0)=2 size(0)=3x2")
+        # TODO: fix conversion error
+        #a = np.zeros((2,3,4,5), dtype='f')
+        #res6 = cv.utils.dumpInputArrayOfArrays([a, b])
+        #self.assertEqual(res6, "InputArrayOfArrays: empty()=false kind=0x00050000 flags=0x01050000 total(-1)=2 dims(-1)=1 size(-1)=2x1 type(0)=CV_32FC1 dims(0)=4 size(0)=[2 3 4 5]")
 
     def test_parse_to_bool_convertible(self):
         try_to_convert = partial(self._try_to_convert, cv.utils.dumpBool)

From b0def9617fed66a2c3b9e56a4ef7b435d611921c Mon Sep 17 00:00:00 2001
From: Mehdi Zakaria Benadel <mehdi.benadel@gmail.com>
Date: Fri, 12 Jun 2020 20:59:43 +0200
Subject: [PATCH 03/15] Fix typo

This typo made me lose my mind during the Conan package update. Please merge.
---
 cmake/OpenCVFindOpenEXR.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/OpenCVFindOpenEXR.cmake b/cmake/OpenCVFindOpenEXR.cmake
index 9103f1f5a7..ef633e853a 100644
--- a/cmake/OpenCVFindOpenEXR.cmake
+++ b/cmake/OpenCVFindOpenEXR.cmake
@@ -88,7 +88,7 @@ FOREACH(SEARCH_PATH ${SEARCH_PATHS})
     ocv_find_openexr("-${OPENEXR_VERSION}")
     ocv_find_openexr("-${OPENEXR_VERSION}_s")
     ocv_find_openexr("-${OPENEXR_VERSION}_d")
-    ocv_find_openexr("-${OPEXEXR_VERSION}_s_d")
+    ocv_find_openexr("-${OPENEXR_VERSION}_s_d")
     ocv_find_openexr("")
     ocv_find_openexr("_s")
     ocv_find_openexr("_d")

From 411ce04f543de50f626313457902345874fee423 Mon Sep 17 00:00:00 2001
From: Namgoo Lee <namgoo.lee@cognex.com>
Date: Tue, 16 Jun 2020 01:43:51 +0900
Subject: [PATCH 04/15] CUDA_OptFlow/OpticalFlowDual_TVL1 Asynchronous test

---
 modules/cudaoptflow/test/test_optflow.cpp | 63 ++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp
index 37ffe9e5c4..92f9466af6 100644
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -405,10 +405,71 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
     EXPECT_MAT_SIMILAR(flow, d_flow, 4e-3);
 }
 
+class TVL1AsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    TVL1AsyncParallelLoopBody(const cv::cuda::GpuMat& d_img1_, const cv::cuda::GpuMat& d_img2_, cv::cuda::GpuMat* d_flow_, int iterations_, double gamma_)
+        : d_img1(d_img1_), d_img2(d_img2_), d_flow(d_flow_), iterations(iterations_), gamma(gamma_) {}
+    ~TVL1AsyncParallelLoopBody() {}
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg = cv::cuda::OpticalFlowDual_TVL1::create();
+            d_alg->setNumIterations(iterations);
+            d_alg->setGamma(gamma);
+            d_alg->calc(d_img1, d_img2, d_flow[i], stream);
+            stream.waitForCompletion();
+        }
+    }
+protected:
+    const cv::cuda::GpuMat& d_img1;
+    const cv::cuda::GpuMat& d_img2;
+    cv::cuda::GpuMat* d_flow;
+    int iterations;
+    double gamma;
+};
+
+#define NUM_STREAMS 16
+
+CUDA_TEST_P(OpticalFlowDual_TVL1, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
+    {
+        throw SkipTestException("CUDA device doesn't support texture objects");
+    }
+    else
+    {
+        cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame0.empty());
+
+        cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame1.empty());
+
+        const int iterations = 10;
+
+        // Synchronous call
+        cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
+                cv::cuda::OpticalFlowDual_TVL1::create();
+        d_alg->setNumIterations(iterations);
+        d_alg->setGamma(gamma);
+
+        cv::cuda::GpuMat d_flow_gold;
+        d_alg->calc(loadMat(frame0), loadMat(frame1), d_flow_gold);
+
+        // Asynchronous call
+        cv::cuda::GpuMat d_flow[NUM_STREAMS];
+        cv::parallel_for_(cv::Range(0, NUM_STREAMS), TVL1AsyncParallelLoopBody(loadMat(frame0), loadMat(frame1), d_flow, iterations, gamma));
+
+        // Compare the results of synchronous call and asynchronous call
+        for (int i = 0; i < NUM_STREAMS; i++)
+            EXPECT_MAT_NEAR(d_flow_gold, d_flow[i], 0.0);
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
     ALL_DEVICES,
     testing::Values(Gamma(0.0), Gamma(1.0))));
 
-
 }} // namespace
 #endif // HAVE_CUDA

From 676b818d6aefeb445e552a33081e34221cf5f8bc Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Mon, 15 Jun 2020 16:02:52 +0300
Subject: [PATCH 05/15] Removed plugin dispatcher

backport of commit 74113737f3710069e5ae2daff09e77c824dd5a2b
---
 modules/dnn/src/ie_ngraph.cpp     | 1 -
 modules/dnn/src/op_inf_engine.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index e1bf738313..7cac0c3593 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -15,7 +15,6 @@
 
 #ifdef HAVE_DNN_NGRAPH
 #include <ie_extension.h>
-#include <ie_plugin_dispatcher.hpp>
 #endif  // HAVE_DNN_NGRAPH
 
 #include <opencv2/core/utils/configuration.private.hpp>
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 5f3af4658e..7020e3b0a9 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -11,7 +11,6 @@
 
 #ifdef HAVE_INF_ENGINE
 #include <ie_extension.h>
-#include <ie_plugin_dispatcher.hpp>
 #endif  // HAVE_INF_ENGINE
 
 #include <opencv2/core/utils/configuration.private.hpp>

From 2043e06102fadd5df4e52853d8f08f0510763aff Mon Sep 17 00:00:00 2001
From: Namgoo Lee <namgoo.lee@cognex.com>
Date: Wed, 17 Jun 2020 01:04:22 +0900
Subject: [PATCH 06/15] cuda optflow tvl1 : async safety

also modify cuda canny to use createTextureObjectPitch2D, etc.
---
 .../core/include/opencv2/core/cuda/common.hpp |  14 +++
 modules/cudaimgproc/src/cuda/canny.cu         |  69 +++++------
 modules/cudaimgproc/test/test_canny.cpp       |   2 +-
 modules/cudaoptflow/src/cuda/tvl1flow.cu      | 107 ++++++++++++++++--
 4 files changed, 138 insertions(+), 54 deletions(-)

diff --git a/modules/core/include/opencv2/core/cuda/common.hpp b/modules/core/include/opencv2/core/cuda/common.hpp
index 14b1f3f5ae..80b2ff08b1 100644
--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@@ -101,6 +101,20 @@ namespace cv { namespace cuda
             cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
             cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
         }
+
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
+        {
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
+            resDesc.res.pitch2D.height = img.rows;
+            resDesc.res.pitch2D.width = img.cols;
+            resDesc.res.pitch2D.pitchInBytes = img.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+
+            cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
+        }
     }
 }}
 
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 4418b8e5eb..253287ca31 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -90,53 +90,47 @@ namespace cv { namespace cuda { namespace device
 
 namespace canny
 {
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
     struct SrcTex
     {
-        int xoff;
-        int yoff;
+        // No virtual members on purpose: CUDA forbids passing an object of a class with virtual functions to a __global__ function.
+
         __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
 
-        __device__ __forceinline__ int operator ()(int y, int x) const
+        // Derived types define "int operator ()(int y, int x) const"; the kernel template on the concrete type binds it statically.
+
+        int xoff;
+        int yoff;
+    };
+
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
+    struct SrcTexRef : SrcTex
+    {
+        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
+
+        __device__ __forceinline__ int operator ()(int y, int x) const
         {
             return tex2D(tex_src, x + xoff, y + yoff);
         }
     };
 
-    struct SrcTexObject
+    struct SrcTexObj : SrcTex
     {
-        int xoff;
-        int yoff;
-        cudaTextureObject_t tex_src_object;
-        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
+        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
 
-        __device__ __forceinline__ int operator ()(int y, int x) const
+        __device__ __forceinline__ int operator ()(int y, int x) const
         {
             return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
         }
 
+        cudaTextureObject_t tex_src_object;
     };
 
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (y >= mag.rows || x >= mag.cols)
-            return;
-
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
-
-        dx(y, x) = dxVal;
-        dy(y, x) = dyVal;
-
-        mag(y, x) = norm(dxVal, dyVal);
-    }
-
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    template <
+        class T,
+        class Norm,
+        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+    >
+    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,15 +156,6 @@ namespace canny
 
         if (cc30)
         {
-            cudaResourceDesc resDesc;
-            memset(&resDesc, 0, sizeof(resDesc));
-            resDesc.resType = cudaResourceTypePitch2D;
-            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
-            resDesc.res.pitch2D.height = srcWhole.rows;
-            resDesc.res.pitch2D.width = srcWhole.cols;
-            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
-            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
-
             cudaTextureDesc texDesc;
             memset(&texDesc, 0, sizeof(texDesc));
             texDesc.addressMode[0] = cudaAddressModeClamp;
@@ -178,9 +163,9 @@ namespace canny
             texDesc.addressMode[2] = cudaAddressModeClamp;
 
             cudaTextureObject_t tex = 0;
-            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+            createTextureObjectPitch2D(&tex, srcWhole, texDesc);
 
-            SrcTexObject src(xoff, yoff, tex);
+            SrcTexObj src(xoff, yoff, tex);
 
             if (L2Grad)
             {
@@ -205,7 +190,7 @@ namespace canny
         else
         {
             bindTexture(&tex_src, srcWhole);
-            SrcTex src(xoff, yoff);
+            SrcTexRef src(xoff, yoff);
 
             if (L2Grad)
             {
diff --git a/modules/cudaimgproc/test/test_canny.cpp b/modules/cudaimgproc/test/test_canny.cpp
index a782a87b3b..1b48e7d321 100644
--- a/modules/cudaimgproc/test/test_canny.cpp
+++ b/modules/cudaimgproc/test/test_canny.cpp
@@ -116,7 +116,7 @@ protected:
     bool useL2gradient;
 };
 
-#define NUM_STREAMS 64
+#define NUM_STREAMS 128
 
 CUDA_TEST_P(Canny, Async)
 {
diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu
index 66f0d664a0..2688e05c61 100644
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -45,6 +45,7 @@
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
 #include "opencv2/core/cuda/limits.hpp"
+#include "opencv2/core/cuda.hpp"
 
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -101,11 +102,64 @@ namespace tvl1flow
         }
     }
 
+    struct SrcTex
+    {
+        // Tag base for the warpBackwardKernel source types. It deliberately has no
+        // virtual members: an object of a class with virtual functions must not be
+        // passed as an argument to a __global__ function. Derived types provide
+        // "float I1(float x, float y) const" plus "I1x" and "I1y" with the same
+        // signature, and the kernel template resolves them at compile time.
+    };
+
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
+    struct SrcTexRef : SrcTex
+    {
+        __device__ __forceinline__ float I1(float x, float y) const
+        {
+            return tex2D(tex_I1, x, y);
+        }
+        __device__ __forceinline__ float I1x(float x, float y) const
+        {
+            return tex2D(tex_I1x, x, y);
+        }
+        __device__ __forceinline__ float I1y(float x, float y) const
+        {
+            return tex2D(tex_I1y, x, y);
+        }
+    };
 
-    __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
+    struct SrcTexObj : SrcTex
+    {
+        __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
+            : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
+
+        __device__ __forceinline__ float I1(float x, float y) const
+        {
+            return tex2D<float>(tex_obj_I1, x, y);
+        }
+        __device__ __forceinline__ float I1x(float x, float y) const
+        {
+            return tex2D<float>(tex_obj_I1x, x, y);
+        }
+        __device__ __forceinline__ float I1y(float x, float y) const
+        {
+            return tex2D<float>(tex_obj_I1y, x, y);
+        }
+
+        cudaTextureObject_t tex_obj_I1;
+        cudaTextureObject_t tex_obj_I1x;
+        cudaTextureObject_t tex_obj_I1y;
+    };
+
+    template <
+        typename T,
+        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+    >
+    __global__ void warpBackwardKernel(
+        const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
+        PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -136,9 +190,9 @@ namespace tvl1flow
             {
                 const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
 
-                sum  += w * tex2D(tex_I1 , cx, cy);
-                sumx += w * tex2D(tex_I1x, cx, cy);
-                sumy += w * tex2D(tex_I1y, cx, cy);
+                sum  += w * src.I1(cx, cy);
+                sumx += w * src.I1x(cx, cy);
+                sumy += w * src.I1y(cx, cy);
 
                 wsum += w;
             }
@@ -173,15 +227,46 @@ namespace tvl1flow
         const dim3 block(32, 8);
         const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
 
-        bindTexture(&tex_I1 , I1);
-        bindTexture(&tex_I1x, I1x);
-        bindTexture(&tex_I1y, I1y);
+        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
 
-        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
-        cudaSafeCall( cudaGetLastError() );
+        if (cc30)
+        {
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
 
-        if (!stream)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
+
+            createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
+            createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
+            createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
+
+            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
+            cudaSafeCall(cudaGetLastError());
+
+            if (!stream)
+                cudaSafeCall(cudaDeviceSynchronize());
+            else
+                cudaSafeCall(cudaStreamSynchronize(stream));
+
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
+            cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
+        }
+        else
+        {
+            bindTexture(&tex_I1, I1);
+            bindTexture(&tex_I1x, I1x);
+            bindTexture(&tex_I1y, I1y);
+
+            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
+            cudaSafeCall(cudaGetLastError());
+
+            if (!stream)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
     }
 }
 

From d93b6be3cc13d20644b694e0d1efb46c139af70b Mon Sep 17 00:00:00 2001
From: Liubov Batanina <piccione-mail@yandex.ru>
Date: Tue, 16 Jun 2020 20:09:25 +0300
Subject: [PATCH 07/15] Changed StridedSlice to VariadicSplit in Region layer

---
 modules/dnn/src/layers/region_layer.cpp | 48 +++++++++----------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp
index d229369abb..c0ba4b2ccf 100644
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -412,12 +412,12 @@ public:
         auto scale_x_y_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &scale_x_y);
         auto shift_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{0.5});
 
+        auto axis = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{}, {0});
+        auto splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{5}, {1, 1, 1, 1, rows - 4});
+        auto split = std::make_shared<ngraph::op::v1::VariadicSplit>(input2d, axis, splits);
         std::shared_ptr<ngraph::Node> box_x;
         {
-            auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{0, 0});
-            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, cols});
-            box_x = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-            box_x = std::make_shared<ngraph::op::Sigmoid>(box_x);
+            box_x = std::make_shared<ngraph::op::Sigmoid>(split->output(0));
             box_x = std::make_shared<ngraph::op::v1::Subtract>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
             box_x = std::make_shared<ngraph::op::v1::Multiply>(box_x, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
             box_x = std::make_shared<ngraph::op::v1::Add>(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
@@ -443,10 +443,7 @@ public:
 
         std::shared_ptr<ngraph::Node> box_y;
         {
-            auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 0});
-            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{2, cols});
-            box_y = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-            box_y = std::make_shared<ngraph::op::Sigmoid>(box_y);
+            box_y = std::make_shared<ngraph::op::Sigmoid>(split->output(1));
             box_y = std::make_shared<ngraph::op::v1::Subtract>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
             box_y = std::make_shared<ngraph::op::v1::Multiply>(box_y, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY);
             box_y = std::make_shared<ngraph::op::v1::Add>(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY);
@@ -499,45 +496,32 @@ public:
                 std::copy(bias_h.begin(), bias_h.begin() + h * anchors, bias_h.begin() + i * h * anchors);
             }
 
-            auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{2, 0});
-            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{3, cols});
-            box_w = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-            box_w = std::make_shared<ngraph::op::v0::Exp>(box_w);
+            box_w = std::make_shared<ngraph::op::v0::Exp>(split->output(2));
             box_w = std::make_shared<ngraph::op::v1::Reshape>(box_w, shape_3d, true);
             auto anchor_w_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_w.data());
             box_w = std::make_shared<ngraph::op::v1::Multiply>(box_w, anchor_w_node, ngraph::op::AutoBroadcastType::NUMPY);
 
-            lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{3, 0});
-            upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{4, cols});
-            box_h = std::make_shared<ngraph::op::v1::StridedSlice>(input2d, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-            box_h = std::make_shared<ngraph::op::v0::Exp>(box_h);
+            box_h = std::make_shared<ngraph::op::v0::Exp>(split->output(3));
             box_h = std::make_shared<ngraph::op::v1::Reshape>(box_h, shape_3d, true);
             auto anchor_h_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, box_broad_shape, bias_h.data());
             box_h = std::make_shared<ngraph::op::v1::Multiply>(box_h, anchor_h_node, ngraph::op::AutoBroadcastType::NUMPY);
         }
 
+        auto region_splits = ngraph::op::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{3}, {4, 1, rows - 5});
+        auto region_split = std::make_shared<ngraph::op::v1::VariadicSplit>(region, axis, region_splits);
+
         std::shared_ptr<ngraph::Node> scale;
         {
-            auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{4, 0});
-            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{5, cols});
-            scale = std::make_shared<ngraph::op::v1::StridedSlice>(region, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-
-            if (classfix == -1)
-            {
-                auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{0.5});
-                auto mask = std::make_shared<ngraph::op::v1::Less>(scale, thresh_node);
-                auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>(b * cols, 0));
-                scale = std::make_shared<ngraph::op::v1::Select>(mask, scale, zero_node);
-            }
+            float thr = classfix == -1 ? 0.5 : 0;
+            auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, std::vector<float>{thr});
+            auto mask = std::make_shared<ngraph::op::v1::Less>(region_split->output(1), thresh_node);
+            auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>(cols, 0));
+            scale = std::make_shared<ngraph::op::v1::Select>(mask, zero_node, region_split->output(1));
         }
 
         std::shared_ptr<ngraph::Node> probs;
         {
-            auto lower_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{5, 0});
-            auto upper_bounds = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{rows, cols});
-            auto classes = std::make_shared<ngraph::op::v1::StridedSlice>(region, lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
-            probs = std::make_shared<ngraph::op::v1::Multiply>(classes, scale, ngraph::op::AutoBroadcastType::NUMPY);
-
+            probs = std::make_shared<ngraph::op::v1::Multiply>(region_split->output(2), scale, ngraph::op::AutoBroadcastType::NUMPY);
             auto thresh_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &thresh);
             auto mask = std::make_shared<ngraph::op::v1::Greater>(probs, thresh_node);
             auto zero_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, mask->get_shape(), std::vector<float>((rows - 5) * cols, 0));

From b10ab7974322f991e8ff49ec59ca9612db7d0b08 Mon Sep 17 00:00:00 2001
From: NesQl <32612899+liqi-c@users.noreply.github.com>
Date: Wed, 17 Jun 2020 17:05:04 +0800
Subject: [PATCH 08/15] Merge pull request #17468 from
 liqi-c:sharedlib_build_problem

TEngine installation rules fix for static build

* Fix cmake config error with -DBUILD_SHARED_LIBS=OFF

* Do not install the tengine include directory

* Fix compile error.

* Move the install command to tengine/CMakeLists.txt

* Do not install the include dir on make install; libtengine.a is installed only when building the static lib
---
 3rdparty/libtengine/tengine.cmake | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/3rdparty/libtengine/tengine.cmake b/3rdparty/libtengine/tengine.cmake
index 4085afcf31..10da845a86 100644
--- a/3rdparty/libtengine/tengine.cmake
+++ b/3rdparty/libtengine/tengine.cmake
@@ -22,7 +22,7 @@
 #         sqfu@openailab.com
 #
 
-SET(TENGINE_COMMIT_VERSION "2f3cd86217f3530c8e4a82f3ed5af14c7a4e3943")
+SET(TENGINE_COMMIT_VERSION "8a4c58e0e05cd850f4bb0936a330edc86dc0e28c")
 SET(OCV_TENGINE_DIR "${OpenCV_BINARY_DIR}/3rdparty/libtengine")
 SET(OCV_TENGINE_SOURCE_PATH "${OCV_TENGINE_DIR}/Tengine-${TENGINE_COMMIT_VERSION}")
 
@@ -34,7 +34,7 @@ IF(EXISTS "${OCV_TENGINE_SOURCE_PATH}")
 ELSE()
 	SET(OCV_TENGINE_FILENAME "${TENGINE_COMMIT_VERSION}.zip")#name2
 	SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url2
-	SET(tengine_md5sum 9124324b6e2b350012e46ae1db4bad7d) #md5sum2
+	SET(tengine_md5sum f51ca8f3963faeeff3f019a6f6edc206) #md5sum2
 
 	#MESSAGE(STATUS "**** TENGINE DOWNLOAD BEGIN ****")
 	ocv_download(FILENAME ${OCV_TENGINE_FILENAME}
@@ -69,7 +69,6 @@ if(BUILD_TENGINE)
 	   elseif(${ANDROID_ABI} STREQUAL "arm64-v8a")
 			   SET(CONFIG_ARCH_ARM64 ON)
 	   endif()
-	   SET(Tengine_LIB "tengine" CACHE INTERNAL "")
 	else()
 		# linux system
 		if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
@@ -77,7 +76,6 @@ if(BUILD_TENGINE)
 		elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64
 			   SET(CONFIG_ARCH_ARM64 ON)
 		endif()
-		SET(Tengine_LIB "tengine" CACHE INTERNAL "")
 	endif()
 
 	SET(BUILT_IN_OPENCV ON) ## set for tengine compile discern .
@@ -86,6 +84,6 @@ if(BUILD_TENGINE)
 		add_subdirectory("${OCV_TENGINE_SOURCE_PATH}" "${OCV_TENGINE_DIR}/build")
 	else()
 		message(WARNING "TENGINE: Missing 'CMakeLists.txt' in source code package: ${OCV_TENGINE_SOURCE_PATH}")
-		SET(HAVE_TENGINE 1)
 	endif()
+	SET(Tengine_LIB "tengine" CACHE INTERNAL "")
 endif()

From d01cbe93208335204cfe314e713013853a4dc641 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 16 Jun 2020 02:14:04 +0000
Subject: [PATCH 09/15] cudacodec(build): fix detection in CMake, cleanup
 duplicate includes

---
 cmake/OpenCVDetectCUDA.cmake                 |  1 +
 modules/cudacodec/src/cuvid_video_source.hpp | 10 +---------
 modules/cudacodec/src/frame_queue.hpp        | 10 +---------
 modules/cudacodec/src/video_decoder.hpp      | 12 +-----------
 modules/cudacodec/src/video_parser.hpp       | 11 +----------
 modules/cudacodec/src/video_source.hpp       |  5 +----
 6 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 37a77cd660..22879f3152 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -53,6 +53,7 @@ if(CUDA_FOUND)
       else()
         set(${_result} 1)
       endif()
+      unset(_header_result CACHE)
     endmacro()
     SEARCH_NVCUVID_HEADER("nvcuvid.h" HAVE_NVCUVID_HEADER)
     SEARCH_NVCUVID_HEADER("dynlink_nvcuvid.h" HAVE_DYNLINK_NVCUVID_HEADER)
diff --git a/modules/cudacodec/src/cuvid_video_source.hpp b/modules/cudacodec/src/cuvid_video_source.hpp
index 4dd7761620..d36d856611 100644
--- a/modules/cudacodec/src/cuvid_video_source.hpp
+++ b/modules/cudacodec/src/cuvid_video_source.hpp
@@ -44,17 +44,9 @@
 #ifndef __CUVID_VIDEO_SOURCE_HPP__
 #define __CUVID_VIDEO_SOURCE_HPP__
 
-#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
-    #include <dynlink_nvcuvid.h>
-#elif defined(HAVE_NVCUVID_HEADER)
-    #include <nvcuvid.h>
-#endif
-#include "opencv2/core/private.cuda.hpp"
-#include "opencv2/cudacodec.hpp"
 #include "video_source.hpp"
 
-namespace cv { namespace cudacodec { namespace detail
-{
+namespace cv { namespace cudacodec { namespace detail {
 
 class CuvidVideoSource : public VideoSource
 {
diff --git a/modules/cudacodec/src/frame_queue.hpp b/modules/cudacodec/src/frame_queue.hpp
index f7fe7ad57d..3051a1e450 100644
--- a/modules/cudacodec/src/frame_queue.hpp
+++ b/modules/cudacodec/src/frame_queue.hpp
@@ -45,16 +45,8 @@
 #define __FRAME_QUEUE_HPP__
 
 #include "opencv2/core/utility.hpp"
-#include "opencv2/core/private.cuda.hpp"
 
-#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
-    #include <dynlink_nvcuvid.h>
-#elif defined(HAVE_NVCUVID_HEADER)
-    #include <nvcuvid.h>
-#endif
-
-namespace cv { namespace cudacodec { namespace detail
-{
+namespace cv { namespace cudacodec { namespace detail {
 
 class FrameQueue
 {
diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp
index 46caccaf83..e41b1a655f 100644
--- a/modules/cudacodec/src/video_decoder.hpp
+++ b/modules/cudacodec/src/video_decoder.hpp
@@ -44,17 +44,7 @@
 #ifndef __VIDEO_DECODER_HPP__
 #define __VIDEO_DECODER_HPP__
 
-#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
-    #include <dynlink_nvcuvid.h>
-#elif defined(HAVE_NVCUVID_HEADER)
-    #include <nvcuvid.h>
-#endif
-
-#include "opencv2/core/private.cuda.hpp"
-#include "opencv2/cudacodec.hpp"
-
-namespace cv { namespace cudacodec { namespace detail
-{
+namespace cv { namespace cudacodec { namespace detail {
 
 class VideoDecoder
 {
diff --git a/modules/cudacodec/src/video_parser.hpp b/modules/cudacodec/src/video_parser.hpp
index 03fff8e96a..91e50b3e2b 100644
--- a/modules/cudacodec/src/video_parser.hpp
+++ b/modules/cudacodec/src/video_parser.hpp
@@ -44,19 +44,10 @@
 #ifndef __VIDEO_PARSER_HPP__
 #define __VIDEO_PARSER_HPP__
 
-#if defined(HAVE_DYNLINK_NVCUVID_HEADER)
-    #include <dynlink_nvcuvid.h>
-#elif defined(HAVE_NVCUVID_HEADER)
-    #include <nvcuvid.h>
-#endif
-
-#include "opencv2/core/private.cuda.hpp"
-#include "opencv2/cudacodec.hpp"
 #include "frame_queue.hpp"
 #include "video_decoder.hpp"
 
-namespace cv { namespace cudacodec { namespace detail
-{
+namespace cv { namespace cudacodec { namespace detail {
 
 class VideoParser
 {
diff --git a/modules/cudacodec/src/video_source.hpp b/modules/cudacodec/src/video_source.hpp
index 9f2ed29d58..887ac04679 100644
--- a/modules/cudacodec/src/video_source.hpp
+++ b/modules/cudacodec/src/video_source.hpp
@@ -44,12 +44,9 @@
 #ifndef __CUDACODEC_VIDEO_SOURCE_H__
 #define __CUDACODEC_VIDEO_SOURCE_H__
 
-#include "opencv2/core/private.cuda.hpp"
-#include "opencv2/cudacodec.hpp"
 #include "thread.hpp"
 
-namespace cv { namespace cudacodec { namespace detail
-{
+namespace cv { namespace cudacodec { namespace detail {
 
 class VideoParser;
 

From 1cba763189a365a6bfd3f613c506797e4e199c21 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Wed, 17 Jun 2020 22:02:51 +0900
Subject: [PATCH 10/15] fix build error of automatic CC detection

---
 cmake/OpenCVDetectCUDA.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 37a77cd660..ecaec3f190 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -101,7 +101,7 @@ if(CUDA_FOUND)
     else()
       set(CC_LIST ${ARGN})
       foreach(target_arch ${CC_LIST})
-        string(REPLACE "." "" target_arch_short ${target_arch})
+        string(REPLACE "." "" target_arch_short "${target_arch}")
         set(NVCC_OPTION "-gencode;arch=compute_${target_arch_short},code=sm_${target_arch_short}")
         execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" ${NVCC_OPTION} "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu"
                          WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
@@ -111,7 +111,7 @@ if(CUDA_FOUND)
           set(${result_list} "${${result_list}} ${target_arch}")
         endif()
       endforeach()
-      string(STRIP ${${result_list}} ${result_list})
+      string(STRIP "${${result_list}}" ${result_list})
       set(CUDA_SUPPORTED_CC ${${result_list}} CACHE INTERNAL "List of supported compute capability")
     endif()
   endmacro()
@@ -124,7 +124,7 @@ if(CUDA_FOUND)
   endmacro()
 
   macro(ocv_wipeout_deprecated _arch_bin_list)
-    string(REPLACE "2.1" "2.1(2.0)" ${_arch_bin_list} ${${_arch_bin_list}})
+    string(REPLACE "2.1" "2.1(2.0)" ${_arch_bin_list} "${${_arch_bin_list}}")
   endmacro()
 
   set(__cuda_arch_ptx "")

From a7cc1159cdb9334041b0093393b3e49144dd0b26 Mon Sep 17 00:00:00 2001
From: Alex Cohn <sasha.cohn@gmail.com>
Date: Thu, 18 Jun 2020 10:40:43 +0300
Subject: [PATCH 11/15] Merge pull request #17573 from
 alexcohn:fix/android_windows_build

* fixing #17572

https://github.com/opencv/opencv/issues/17572 Build for Android failed: "can't concat str to bytes"

on Windows 10 64bit with python 3.6.6

* similar to changes in platforms/winpack_dldt/build_package.py
---
 platforms/android/build_sdk.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py
index 7cd945d60e..14fd8f664f 100755
--- a/platforms/android/build_sdk.py
+++ b/platforms/android/build_sdk.py
@@ -59,6 +59,8 @@ def check_executable(cmd):
     try:
         log.debug("Executing: %s" % cmd)
         result = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+        if not isinstance(result, str):
+            result = result.decode("utf-8")
         log.debug("Result: %s" % (result+'\n').split('\n')[0])
         return True
     except Exception as e:

From 456e88a8a424d66e839158babcda2b8380d1cc79 Mon Sep 17 00:00:00 2001
From: Yuriy Obukh <Bleach665@gmail.com>
Date: Thu, 18 Jun 2020 14:31:11 +0300
Subject: [PATCH 12/15] fix VS Windows build with eigen.
 https://github.com/opencv/opencv/issues/17548

---
 modules/core/include/opencv2/core/eigen.hpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/modules/core/include/opencv2/core/eigen.hpp b/modules/core/include/opencv2/core/eigen.hpp
index 3d7ba8fa14..3f4be931e6 100644
--- a/modules/core/include/opencv2/core/eigen.hpp
+++ b/modules/core/include/opencv2/core/eigen.hpp
@@ -51,18 +51,19 @@
 
 #include "opencv2/core.hpp"
 
+#if defined _MSC_VER && _MSC_VER >= 1200
+#define NOMINMAX // fix https://github.com/opencv/opencv/issues/17548
+#pragma warning( disable: 4714 ) //__forceinline is not inlined
+#pragma warning( disable: 4127 ) //conditional expression is constant
+#pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data
+#endif
+
 #if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \
     && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY)
 #include <unsupported/Eigen/CXX11/Tensor>
 #define OPENCV_EIGEN_TENSOR_SUPPORT
 #endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
 
-#if defined _MSC_VER && _MSC_VER >= 1200
-#pragma warning( disable: 4714 ) //__forceinline is not inlined
-#pragma warning( disable: 4127 ) //conditional expression is constant
-#pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data
-#endif
-
 namespace cv
 {
 

From c07af090f517a81f7bc823709c7e759d993e9062 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Thu, 18 Jun 2020 21:25:15 +0900
Subject: [PATCH 13/15] fix build error on Jetson TX1 and TX2  * enable_if_t
 is a c++14 feature  * is_base_of and override are c++11 features

---
 modules/cudaimgproc/src/cuda/canny.cu | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 253287ca31..0bc3e2a671 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -107,7 +107,7 @@ namespace canny
     {
         __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
 
-        __device__ __forceinline__ int operator ()(int y, int x) const override
+        __device__ __forceinline__ int operator ()(int y, int x) const
         {
             return tex2D(tex_src, x + xoff, y + yoff);
         }
@@ -117,7 +117,7 @@ namespace canny
     {
         __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
 
-        __device__ __forceinline__ int operator ()(int y, int x) const override
+        __device__ __forceinline__ int operator ()(int y, int x) const
         {
             return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
         }
@@ -127,8 +127,7 @@ namespace canny
 
     template <
         class T,
-        class Norm,
-        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+        class Norm
     >
     __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {

From d25293b721f4e12988f439ffe5cf4fd46072b9f8 Mon Sep 17 00:00:00 2001
From: Philippe FOUBERT <philippe.foubert@free.fr>
Date: Thu, 18 Jun 2020 11:48:43 +0200
Subject: [PATCH 14/15] Fix the build of imgproc using MinGW (variables with
 the same name as symbols defined in MinGW headers)

---
 modules/imgproc/src/color_yuv.simd.hpp | 40 +++++++++++++-------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp
index 8bbd78b244..076d1a4bd5 100644
--- a/modules/imgproc/src/color_yuv.simd.hpp
+++ b/modules/imgproc/src/color_yuv.simd.hpp
@@ -347,16 +347,16 @@ struct RGB2YCrCb_i<ushort>
             sr0 = sr0 - sy0; sr1 = sr1 - sy1;
             sb0 = sb0 - sy0; sb1 = sb1 - sy1;
 
-            v_int32 scr0, scr1, scb0, scb1;
+            v_int32 v_scr0, v_scr1, v_scb0, v_scb1;
 
-            scr0 = (sr0*vc3 + vdd) >> shift;
-            scr1 = (sr1*vc3 + vdd) >> shift;
-            scb0 = (sb0*vc4 + vdd) >> shift;
-            scb1 = (sb1*vc4 + vdd) >> shift;
+            v_scr0 = (sr0*vc3 + vdd) >> shift;
+            v_scr1 = (sr1*vc3 + vdd) >> shift;
+            v_scb0 = (sb0*vc4 + vdd) >> shift;
+            v_scb1 = (sb1*vc4 + vdd) >> shift;
 
             // saturate and pack
-            cr = v_pack_u(scr0, scr1);
-            cb = v_pack_u(scb0, scb1);
+            cr = v_pack_u(v_scr0, v_scr1);
+            cb = v_pack_u(v_scb0, v_scb1);
 
             if(yuvOrder)
             {
@@ -781,36 +781,36 @@ struct YCrCb2RGB_i<uchar>
             v_int8 scr = v_reinterpret_as_s8(cr);
             v_int8 scb = v_reinterpret_as_s8(cb);
 
-            v_int16 scr0, scr1, scb0, scb1;
-            v_expand(scr, scr0, scr1);
-            v_expand(scb, scb0, scb1);
+            v_int16 v_scr0, v_scr1, v_scb0, v_scb1;
+            v_expand(scr, v_scr0, v_scr1);
+            v_expand(scb, v_scb0, v_scb1);
 
             v_int32 b00, b01, b10, b11;
             v_int32 g00, g01, g10, g11;
             v_int32 r00, r01, r10, r11;
 
-            v_mul_expand(scb0, vc3, b00, b01);
-            v_mul_expand(scb1, vc3, b10, b11);
+            v_mul_expand(v_scb0, vc3, b00, b01);
+            v_mul_expand(v_scb1, vc3, b10, b11);
             if(yuvOrder)
             {
                 // if YUV then C3 > 2^15
                 // so we fix the multiplication
                 v_int32 cb00, cb01, cb10, cb11;
-                v_expand(scb0, cb00, cb01);
-                v_expand(scb1, cb10, cb11);
+                v_expand(v_scb0, cb00, cb01);
+                v_expand(v_scb1, cb10, cb11);
                 b00 += cb00 << 15; b01 += cb01 << 15;
                 b10 += cb10 << 15; b11 += cb11 << 15;
             }
 
             v_int32 t00, t01, t10, t11;
-            v_mul_expand(scb0, vc2, t00, t01);
-            v_mul_expand(scb1, vc2, t10, t11);
-            v_mul_expand(scr0, vc1, g00, g01);
-            v_mul_expand(scr1, vc1, g10, g11);
+            v_mul_expand(v_scb0, vc2, t00, t01);
+            v_mul_expand(v_scb1, vc2, t10, t11);
+            v_mul_expand(v_scr0, vc1, g00, g01);
+            v_mul_expand(v_scr1, vc1, g10, g11);
             g00 += t00; g01 += t01;
             g10 += t10; g11 += t11;
-            v_mul_expand(scr0, vc0, r00, r01);
-            v_mul_expand(scr1, vc0, r10, r11);
+            v_mul_expand(v_scr0, vc0, r00, r01);
+            v_mul_expand(v_scr1, vc0, r10, r11);
 
             b00 = (b00 + vdescale) >> shift; b01 = (b01 + vdescale) >> shift;
             b10 = (b10 + vdescale) >> shift; b11 = (b11 + vdescale) >> shift;

From 77fa1a20bf2fc305a7959f78c66a70b0fbc15a02 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <tomoaki.teshima@gmail.com>
Date: Fri, 19 Jun 2020 07:56:37 +0900
Subject: [PATCH 15/15] fix build on Jetson TX1 and TX2   * enable_if_t is a
 c++14 feature

---
 modules/cudaoptflow/src/cuda/tvl1flow.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/cudaoptflow/src/cuda/tvl1flow.cu b/modules/cudaoptflow/src/cuda/tvl1flow.cu
index 2688e05c61..a84b9a3520 100644
--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -154,8 +154,7 @@ namespace tvl1flow
     };
 
     template <
-        typename T,
-        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
+        typename T
     >
     __global__ void warpBackwardKernel(
         const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,