From 9699fde8d2bcace8ba25394406c7168736e7b63e Mon Sep 17 00:00:00 2001
From: Yan Wang <yan.wang@linux.intel.com>
Date: Mon, 22 Dec 2014 18:41:50 +0800
Subject: [PATCH 1/7] Use hypot to do vector calculation.

Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
---
 modules/objdetect/src/opencl/objdetect_hog.cl | 35 +++++++++----------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl
index 704dec4447..4fae320d02 100644
--- a/modules/objdetect/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
@@ -48,7 +48,7 @@
 #define CELLS_PER_BLOCK_X 2
 #define CELLS_PER_BLOCK_Y 2
 #define NTHREADS 256
-#define CV_PI_F 3.1415926535897932384626433832795f
+#define CV_PI_F M_PI_F
 
 #ifdef INTEL_DEVICE
 #define QANGLE_TYPE     int
@@ -606,23 +606,23 @@ __kernel void compute_gradients_8UC4_kernel(
     barrier(CLK_LOCAL_MEM_FENCE);
     if (x < width)
     {
-        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
-            sh_row[tid + 2 * (NTHREADS + 2)]);
-        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
-            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+        float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
+            sh_row[tid + 2 * (NTHREADS + 2)], 0);
+        float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0);
 
-        float3 dx;
+        float4 dx;
         if (correct_gamma == 1)
             dx = sqrt(b) - sqrt(a);
         else
             dx = b - a;
 
-        float3 dy = (float3) 0.f;
+        float4 dy = (float4) 0.f;
 
         if (gidY > 0 && gidY < height - 1)
         {
-            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
-            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
+            a = convert_float4(img[(gidY - 1) * img_step + x].xyzw);
+            b = convert_float4(img[(gidY + 1) * img_step + x].xyzw);
 
             if (correct_gamma == 1)
                 dy = sqrt(b) - sqrt(a);
@@ -630,28 +630,25 @@ __kernel void compute_gradients_8UC4_kernel(
                 dy = b - a;
         }
 
+        float4 mag = hypot(dx, dy);
         float best_dx = dx.x;
         float best_dy = dy.x;
 
-        float mag0 = dx.x * dx.x + dy.x * dy.x;
-        float mag1 = dx.y * dx.y + dy.y * dy.y;
-        if (mag0 < mag1)
+        float mag0 = mag.x;
+        if (mag0 < mag.y)
         {
             best_dx = dx.y;
             best_dy = dy.y;
-            mag0 = mag1;
+            mag0 = mag.y;
         }
 
-        mag1 = dx.z * dx.z + dy.z * dy.z;
-        if (mag0 < mag1)
+        if (mag0 < mag.z)
         {
             best_dx = dx.z;
             best_dy = dy.z;
-            mag0 = mag1;
+            mag0 = mag.z;
         }
 
-        mag0 = sqrt(mag0);
-
         float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
         int hidx = (int)floor(ang);
         ang -= hidx;
@@ -710,7 +707,7 @@ __kernel void compute_gradients_8UC1_kernel(
             else
                 dy = a - b;
         }
-        float mag = sqrt(dx * dx + dy * dy);
+        float mag = hypot(dx, dy);
 
         float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
         int hidx = (int)floor(ang);

From 220d937d9a27951a3d66e2c8daaa1399b12d83fc Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 30 Dec 2014 15:36:58 +0300
Subject: [PATCH 2/7] removed buffered versions of histogram functions

used BufferPool mechanism instead
---
 .../include/opencv2/core/private.cuda.hpp     |  6 ++
 .../include/opencv2/cudaimgproc.hpp           | 50 ++--------------
 modules/cudaimgproc/perf/perf_histogram.cpp   |  9 +--
 modules/cudaimgproc/src/histogram.cpp         | 60 +++++++++----------
 samples/gpu/performance/tests.cpp             |  5 +-
 5 files changed, 46 insertions(+), 84 deletions(-)

diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index a97388bd05..9fff4ee281 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -130,6 +130,12 @@ namespace cv { namespace cuda
     class NppStreamHandler
     {
     public:
+        inline explicit NppStreamHandler(Stream& newStream)
+        {
+            oldStream = nppGetStream();
+            nppSetStream(StreamAccessor::getStream(newStream));
+        }
+
         inline explicit NppStreamHandler(cudaStream_t newStream)
         {
             oldStream = nppGetStream();
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index 1ec288fa9c..7aa74aa385 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -205,19 +205,11 @@ CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stre
 
 @param src Source image with CV_8UC1 type.
 @param dst Destination image.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
 
 @sa equalizeHist
  */
-CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::equalizeHist(src, dst, buf, stream);
-}
+CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
 /** @brief Base class for Contrast Limited Adaptive Histogram Equalization. :
  */
@@ -259,27 +251,11 @@ a four-channel image, all channels are processed separately.
 @param histSize Size of the histogram.
 @param lowerLevel Lower boundary of lowest-level bin.
 @param upperLevel Upper boundary of highest-level bin.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
+CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
 
 /** @brief Calculates a histogram with bins determined by the levels array.
 
@@ -287,27 +263,11 @@ static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int
 For a four-channel image, all channels are processed separately.
 @param hist Destination histogram with one row, (levels.cols-1) columns, and the CV_32SC1 type.
 @param levels Number of levels in the histogram.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
+CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
 
 //! @} cudaimgproc_hist
 
diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp
index 0e020394a1..c638ce0ce5 100644
--- a/modules/cudaimgproc/perf/perf_histogram.cpp
+++ b/modules/cudaimgproc/perf/perf_histogram.cpp
@@ -63,9 +63,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC1,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::histEven(d_src, dst, d_buf, 30, 0, 180);
+        TEST_CYCLE() cv::cuda::histEven(d_src, dst, 30, 0, 180);
 
         CUDA_SANITY_CHECK(dst);
     }
@@ -106,9 +105,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC4,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat d_hist[4];
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel);
+        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, histSize, lowerLevel, upperLevel);
 
         cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3;
         d_hist[0].download(cpu_hist0);
@@ -167,9 +165,8 @@ PERF_TEST_P(Sz, EqualizeHist,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst, d_buf);
+        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst);
 
         CUDA_SANITY_CHECK(dst);
     }
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index d63e57de31..a965242f8b 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -49,7 +49,7 @@ using namespace cv::cuda;
 
 void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::cuda::CLAHE>(); }
 
@@ -93,7 +93,7 @@ namespace hist
     void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
 }
 
-void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream)
+void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
 {
     GpuMat src = _src.getGpuMat();
 
@@ -107,8 +107,8 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray
 
     size_t bufSize = intBufSize + 2 * 256 * sizeof(int);
 
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, _buf);
-    GpuMat buf = _buf.getGpuMat();
+    BufferPool pool(_stream);
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
 
     GpuMat hist(1, 256, CV_32SC1, buf.data);
     GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
@@ -288,7 +288,7 @@ namespace
     {
         typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
 
-        static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
         {
             const int levels = histSize + 1;
 
@@ -302,15 +302,15 @@ namespace
             int buf_size;
             get_buf_size(sz, levels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
                 lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
 
-            if (stream == 0)
+            if (!stream)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
@@ -319,7 +319,7 @@ namespace
     {
         typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
 
-        static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
         {
             int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
             hist[0].create(1, histSize[0], CV_32S);
@@ -336,14 +336,14 @@ namespace
             int buf_size;
             get_buf_size(sz, levels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
 
-            if (stream == 0)
+            if (!stream)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
@@ -392,7 +392,7 @@ namespace
         typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
         enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
 
-        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, Stream& stream)
         {
             CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 );
 
@@ -406,8 +406,8 @@ namespace
             int buf_size;
             get_buf_size(sz, levels.cols, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
@@ -424,7 +424,7 @@ namespace
         typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
         enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
 
-        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
         {
             CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 );
             CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 );
@@ -447,8 +447,8 @@ namespace
             int buf_size;
             get_buf_size(sz, nLevels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
@@ -493,9 +493,9 @@ namespace
     }
 }
 
-void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
+void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
@@ -514,12 +514,12 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf,
 
     CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
 
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }
 
-void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
+void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
@@ -532,12 +532,12 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, i
 
     CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
 
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }
 
-void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
@@ -553,12 +553,12 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels,
 
     CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 );
 
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }
 
-void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
@@ -573,7 +573,7 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4]
 
     CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 );
 
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index af3f874e1f..2e7faa3341 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1053,12 +1053,11 @@ TEST(equalizeHist)
 
         cuda::GpuMat d_src(src);
         cuda::GpuMat d_dst;
-        cuda::GpuMat d_buf;
 
-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);
 
         CUDA_ON;
-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);
         CUDA_OFF;
     }
 }

From f50a0612254be1116140886ba733fa4861737038 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Tue, 30 Dec 2014 15:37:14 +0300
Subject: [PATCH 3/7] added stream parameter to all cudaimgproc routines

---
 .../include/opencv2/cudaimgproc.hpp           | 36 ++++++++-----
 modules/cudaimgproc/src/canny.cpp             | 50 +++++++++----------
 modules/cudaimgproc/src/cuda/canny.cu         | 50 +++++++++++--------
 modules/cudaimgproc/src/gftt.cpp              |  7 ++-
 modules/cudaimgproc/src/histogram.cpp         |  6 +--
 modules/cudaimgproc/src/hough_circles.cpp     |  7 ++-
 modules/cudaimgproc/src/hough_lines.cpp       | 21 +++++---
 modules/cudaimgproc/src/hough_segments.cpp    |  7 ++-
 modules/cudaimgproc/src/mssegmentation.cpp    |  9 ++--
 9 files changed, 115 insertions(+), 78 deletions(-)

diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index 7aa74aa385..52bfcef7a9 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -240,8 +240,9 @@ CV_EXPORTS Ptr<cuda::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSi
 @param nLevels Number of computed levels. nLevels must be at least 2.
 @param lowerLevel Lower boundary value of the lowest level.
 @param upperLevel Upper boundary value of the greatest level.
+@param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel);
+CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 
 /** @brief Calculates a histogram with evenly distributed bins.
 
@@ -281,15 +282,17 @@ public:
     /** @brief Finds edges in an image using the @cite Canny86 algorithm.
 
     @param image Single-channel 8-bit input image.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void detect(InputArray image, OutputArray edges) = 0;
+    virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0;
     /** @overload
     @param dx First derivative of image in the vertical direction. Support only CV_32S type.
     @param dy First derivative of image in the horizontal direction. Support only CV_32S type.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
     */
-    virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0;
+    virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0;
 
     virtual void setLowThreshold(double low_thresh) = 0;
     virtual double getLowThreshold() const = 0;
@@ -336,18 +339,20 @@ public:
     \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
     the image). \f$\theta\f$ is the line rotation angle in radians (
     \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
+    @param stream Stream for the asynchronous version.
 
     @sa HoughLines
      */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;
 
     /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory.
 
     @param d_lines Result of cuda::HoughLinesDetector::detect .
     @param h_lines Output host array.
     @param h_votes Optional output array for line's votes.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0;
+    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0;
 
     virtual void setRho(float rho) = 0;
     virtual float getRho() const = 0;
@@ -391,10 +396,11 @@ public:
     @param lines Output vector of lines. Each line is represented by a 4-element vector
     \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
     line segment.
+    @param stream Stream for the asynchronous version.
 
     @sa HoughLinesP
      */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;
 
     virtual void setRho(float rho) = 0;
     virtual float getRho() const = 0;
@@ -435,10 +441,11 @@ public:
     @param src 8-bit, single-channel grayscale input image.
     @param circles Output vector of found circles. Each vector is encoded as a 3-element
     floating-point vector \f$(x, y, radius)\f$ .
+    @param stream Stream for the asynchronous version.
 
     @sa HoughCircles
      */
-    virtual void detect(InputArray src, OutputArray circles) = 0;
+    virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0;
 
     virtual void setDp(float dp) = 0;
     virtual float getDp() const = 0;
@@ -553,8 +560,9 @@ public:
     positions).
     @param mask Optional region of interest. If the image is not empty (it needs to have the type
     CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
+    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0;
 };
 
 /** @brief Creates implementation for cuda::CornersDetector .
@@ -590,7 +598,7 @@ as src .
 @param sp Spatial window radius.
 @param sr Color window radius.
 @param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.
 
 It maps each point of the source image into another point. As a result, you have a new color and new
 position of each point.
@@ -610,7 +618,7 @@ src size. The type is CV_16SC2 .
 @param sp Spatial window radius.
 @param sr Color window radius.
 @param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.
 
 @sa cuda::meanShiftFiltering
  */
@@ -626,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts
 @param sr Color window radius.
 @param minsize Minimum segment size. Smaller segments are merged.
 @param criteria Termination criteria. See TermCriteria.
+@param stream Stream for the asynchronous version.
  */
 CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize,
-                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
+                                      Stream& stream = Stream::Null());
 
 /////////////////////////// Match Template ////////////////////////////
 
diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp
index eed4a284e5..1e52bd295a 100644
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -53,16 +53,16 @@ Ptr<CannyEdgeDetector> cv::cuda::createCannyEdgeDetector(double, double, int, bo
 
 namespace canny
 {
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);
 
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
 
-    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
 
 namespace
@@ -76,8 +76,8 @@ namespace
             old_apperture_size_ = -1;
         }
 
-        void detect(InputArray image, OutputArray edges);
-        void detect(InputArray dx, InputArray dy, OutputArray edges);
+        void detect(InputArray image, OutputArray edges, Stream& stream);
+        void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream);
 
         void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; }
         double getLowThreshold() const { return low_thresh_; }
@@ -111,7 +111,7 @@ namespace
 
     private:
         void createBuf(Size image_size);
-        void CannyCaller(GpuMat& edges);
+        void CannyCaller(GpuMat& edges, Stream& stream);
 
         double low_thresh_;
         double high_thresh_;
@@ -128,7 +128,7 @@ namespace
         int old_apperture_size_;
     };
 
-    void CannyImpl::detect(InputArray _image, OutputArray _edges)
+    void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
     {
         GpuMat image = _image.getGpuMat();
 
@@ -150,24 +150,24 @@ namespace
             image.locateROI(wholeSize, ofs);
             GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step);
 
-            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
         }
         else
         {
 #ifndef HAVE_OPENCV_CUDAFILTERS
             throw_no_cuda();
 #else
-            filterDX_->apply(image, dx_);
-            filterDY_->apply(image, dy_);
+            filterDX_->apply(image, dx_, stream);
+            filterDY_->apply(image, dy_, stream);
 
-            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
 #endif
         }
 
-        CannyCaller(edges);
+        CannyCaller(edges, stream);
     }
 
-    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges)
+    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream)
     {
         GpuMat dx = _dx.getGpuMat();
         GpuMat dy = _dy.getGpuMat();
@@ -176,8 +176,8 @@ namespace
         CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() );
         CV_Assert( deviceSupports(SHARED_ATOMICS) );
 
-        dx.copyTo(dx_);
-        dy.copyTo(dy_);
+        dx.copyTo(dx_, stream);
+        dy.copyTo(dy_, stream);
 
         if (low_thresh_ > high_thresh_)
             std::swap(low_thresh_, high_thresh_);
@@ -187,9 +187,9 @@ namespace
         _edges.create(dx.size(), CV_8UC1);
         GpuMat edges = _edges.getGpuMat();
 
-        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
 
-        CannyCaller(edges);
+        CannyCaller(edges, stream);
     }
 
     void CannyImpl::createBuf(Size image_size)
@@ -215,16 +215,16 @@ namespace
         ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_);
     }
 
-    void CannyImpl::CannyCaller(GpuMat& edges)
+    void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
     {
         map_.setTo(Scalar::all(0));
-        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
+        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>());
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>());
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
 
-        canny::getEdges(map_, edges);
+        canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
     }
 }
 
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 3d770e179d..e0ba515693 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -120,7 +120,7 @@ namespace canny
         mag(y, x) = norm(dxVal, dyVal);
     }
 
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
@@ -131,30 +131,31 @@ namespace canny
         if (L2Grad)
         {
             L2 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
         }
         else
         {
             L1 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
         }
 
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall(cudaThreadSynchronize());
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         if (L2Grad)
         {
             L2 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
         }
         else
         {
             L1 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
         }
     }
 }
@@ -217,17 +218,18 @@ namespace canny
         map(y, x) = edge_type;
     }
 
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 
         bindTexture(&tex_mag, mag);
 
-        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -328,20 +330,21 @@ namespace canny
         }
     }
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
 
-        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
 
         const dim3 block(16, 16);
         const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
 
-        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -441,27 +444,30 @@ namespace canny
         }
     }
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
 
         int count;
-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaStreamSynchronize(stream) );
 
         while (count > 0)
         {
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
 
             const dim3 block(128);
             const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 
-            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
 
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             count = min(count, map.cols * map.rows);
 
@@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device
 
 namespace canny
 {
-    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
     {
-        transform(map, dst, GetEdges(), WithOutMask(), 0);
+        transform(map, dst, GetEdges(), WithOutMask(), stream);
     }
 }
 
diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp
index 243665083a..162ee469ce 100644
--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -68,7 +68,7 @@ namespace
         GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                     int blockSize, bool useHarrisDetector, double harrisK);
 
-        void detect(InputArray image, OutputArray corners, InputArray mask = noArray());
+        void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream);
 
     private:
         int maxCorners_;
@@ -96,8 +96,11 @@ namespace
                     cuda::createMinEigenValCorner(srcType, blockSize, 3);
     }
 
-    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask)
+    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::gfft;
 
         GpuMat image = _image.getGpuMat();
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index a965242f8b..e942e9eb86 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -53,7 +53,7 @@ void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda();
 
 cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::cuda::CLAHE>(); }
 
-void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); }
+void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 
 void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); }
@@ -460,7 +460,7 @@ namespace
     };
 }
 
-void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel)
+void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream)
 {
     const int kind = _levels.kind();
 
@@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
     nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
 
     if (kind == _InputArray::CUDA_GPU_MAT)
-        _levels.getGpuMatRef().upload(host_levels);
+        _levels.getGpuMatRef().upload(host_levels, stream);
 }
 
 namespace hist
diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp
index 3f9b9334c5..6bdaf16a2d 100644
--- a/modules/cudaimgproc/src/hough_circles.cpp
+++ b/modules/cudaimgproc/src/hough_circles.cpp
@@ -74,7 +74,7 @@ namespace
     public:
         HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
 
-        void detect(InputArray src, OutputArray circles);
+        void detect(InputArray src, OutputArray circles, Stream& stream);
 
         void setDp(float dp) { dp_ = dp; }
         float getDp() const { return dp_; }
@@ -154,8 +154,11 @@ namespace
         filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
     }
 
-    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles)
+    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_circles;
 
diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp
index b9f159a9cb..7b9c082942 100644
--- a/modules/cudaimgproc/src/hough_lines.cpp
+++ b/modules/cudaimgproc/src/hough_lines.cpp
@@ -75,8 +75,8 @@ namespace
         {
         }
 
-        void detect(InputArray src, OutputArray lines);
-        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
+        void detect(InputArray src, OutputArray lines, Stream& stream);
+        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);
 
         void setRho(float rho) { rho_ = rho; }
         float getRho() const { return rho_; }
@@ -125,8 +125,11 @@ namespace
         GpuMat result_;
     };
 
-    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_lines;
 
@@ -170,7 +173,7 @@ namespace
         result_.copyTo(lines);
     }
 
-    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes)
+    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)
     {
         GpuMat d_lines = _d_lines.getGpuMat();
 
@@ -184,12 +187,18 @@ namespace
 
         CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 );
 
-        d_lines.row(0).download(h_lines);
+        if (stream)
+            d_lines.row(0).download(h_lines, stream);
+        else
+            d_lines.row(0).download(h_lines);
 
         if (h_votes.needed())
         {
             GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr<int>(1));
-            d_votes.download(h_votes);
+            if (stream)
+                d_votes.download(h_votes, stream);
+            else
+                d_votes.download(h_votes);
         }
     }
 }
diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp
index 2434f6d26d..e3e34ec3d0 100644
--- a/modules/cudaimgproc/src/hough_segments.cpp
+++ b/modules/cudaimgproc/src/hough_segments.cpp
@@ -79,7 +79,7 @@ namespace
         {
         }
 
-        void detect(InputArray src, OutputArray lines);
+        void detect(InputArray src, OutputArray lines, Stream& stream);
 
         void setRho(float rho) { rho_ = rho; }
         float getRho() const { return rho_; }
@@ -128,8 +128,11 @@ namespace
         GpuMat result_;
     };
 
-    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_lines;
         using namespace cv::cuda::device::hough_segments;
diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp
index ad5819800e..54926f3777 100644
--- a/modules/cudaimgproc/src/mssegmentation.cpp
+++ b/modules/cudaimgproc/src/mssegmentation.cpp
@@ -43,7 +43,7 @@
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); }
+void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
 } // anonymous namespace
 
 
-void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria)
+void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream)
 {
     GpuMat src = _src.getGpuMat();
 
@@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp,
 
     // Perform mean shift procedure and obtain region and spatial maps
     GpuMat d_rmap, d_spmap;
-    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
+    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream);
+
+    stream.waitForCompletion();
+
     Mat rmap(d_rmap);
     Mat spmap(d_spmap);
 

From d004ee58c5685b86f5c6127895cfcb830ac5c2de Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Tue, 16 Dec 2014 18:15:50 +0300
Subject: [PATCH 4/7] Support loading old models in ML module

- added test for loading legacy files
- added version to new written models
- fixed loading of several fields in some models
- added generation of new fields from old data
---
 modules/ml/src/ann_mlp.cpp         |   2 +-
 modules/ml/src/boost.cpp           |   8 ++-
 modules/ml/src/data.cpp            |   2 +-
 modules/ml/src/inner_functions.cpp |   1 +
 modules/ml/src/precomp.hpp         |  16 +++++
 modules/ml/src/rtrees.cpp          |   2 +-
 modules/ml/src/svm.cpp             |   3 +-
 modules/ml/src/tree.cpp            |  87 ++++++++++++++++++++++--
 modules/ml/test/test_save_load.cpp | 103 +++++++++++++++++++++++++++++
 9 files changed, 211 insertions(+), 13 deletions(-)

diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 3e7d44e87c..ef52801317 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -1241,7 +1241,7 @@ public:
         clear();
 
         vector<int> _layer_sizes;
-        fn["layer_sizes"] >> _layer_sizes;
+        readVectorOrMat(fn["layer_sizes"], _layer_sizes);
         create( _layer_sizes );
 
         int i, l_count = layer_count();
diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp
index 5e0b307338..236cd97a2d 100644
--- a/modules/ml/src/boost.cpp
+++ b/modules/ml/src/boost.cpp
@@ -434,13 +434,17 @@ public:
         bparams.priors = params0.priors;
 
         FileNode tparams_node = fn["training_params"];
-        String bts = (String)tparams_node["boosting_type"];
+        // check for old layout
+        String bts = (String)(fn["boosting_type"].empty() ?
+                         tparams_node["boosting_type"] : fn["boosting_type"]);
         bparams.boostType = (bts == "DiscreteAdaboost" ? Boost::DISCRETE :
                              bts == "RealAdaboost" ? Boost::REAL :
                              bts == "LogitBoost" ? Boost::LOGIT :
                              bts == "GentleAdaboost" ? Boost::GENTLE : -1);
         _isClassifier = bparams.boostType == Boost::DISCRETE;
-        bparams.weightTrimRate = (double)tparams_node["weight_trimming_rate"];
+        // check for old layout
+        bparams.weightTrimRate = (double)(fn["weight_trimming_rate"].empty() ?
+                                    tparams_node["weight_trimming_rate"] : fn["weight_trimming_rate"]);
     }
 
     void read( const FileNode& fn )
diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp
index 6b5ceb4881..d2ac18ff01 100644
--- a/modules/ml/src/data.cpp
+++ b/modules/ml/src/data.cpp
@@ -898,7 +898,7 @@ public:
 
         CV_Assert( m > 0 ); // if m==0, vi is an ordered variable
         const int* cmap = &catMap.at<int>(ofs[0]);
-        bool fastMap = (m == cmap[m] - cmap[0]);
+        bool fastMap = (m == cmap[m - 1] - cmap[0] + 1);
 
         if( fastMap )
         {
diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp
index dbc21ff092..561abbaeb8 100644
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@@ -115,6 +115,7 @@ void StatModel::save(const String& filename) const
 {
     FileStorage fs(filename, FileStorage::WRITE);
     fs << getDefaultModelName() << "{";
+    fs << "format" << (int)3;
     write(fs);
     fs << "}";
 }
diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp
index d308ae98ec..69ff03047e 100644
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -263,11 +263,27 @@ namespace ml
         vector<int> subsets;
         vector<int> classLabels;
         vector<float> missingSubst;
+        vector<int> varMapping;
         bool _isClassifier;
 
         Ptr<WorkData> w;
     };
 
+    template <typename T>
+    static inline void readVectorOrMat(const FileNode & node, std::vector<T> & v)
+    {
+        if (node.type() == FileNode::MAP)
+        {
+            Mat m;
+            node >> m;
+            m.copyTo(v);
+        }
+        else if (node.type() == FileNode::SEQ)
+        {
+            node >> v;
+        }
+    }
+
 }}
 
 #endif /* __OPENCV_ML_PRECOMP_HPP__ */
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index 7c9cbaf268..7441faac17 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -346,7 +346,7 @@ public:
         oobError = (double)fn["oob_error"];
         int ntrees = (int)fn["ntrees"];
 
-        fn["var_importance"] >> varImportance;
+        readVectorOrMat(fn["var_importance"], varImportance);
 
         readParams(fn);
 
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index c7c32f0be5..a0df44f78b 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2038,7 +2038,8 @@ public:
     {
         Params _params;
 
-        String svm_type_str = (String)fn["svmType"];
+        // check for old naming
+        String svm_type_str = (String)(fn["svm_type"].empty() ? fn["svmType"] : fn["svm_type"]);
         int svmType =
             svm_type_str == "C_SVC" ? C_SVC :
             svm_type_str == "NU_SVC" ? NU_SVC :
diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp
index 416abd9364..64f66169b0 100644
--- a/modules/ml/src/tree.cpp
+++ b/modules/ml/src/tree.cpp
@@ -1597,7 +1597,10 @@ void DTreesImpl::writeParams(FileStorage& fs) const
     fs << "}";
 
     if( !varIdx.empty() )
+    {
+        fs << "global_var_idx" << 1;
         fs << "var_idx" << varIdx;
+    }
 
     fs << "var_type" << varType;
 
@@ -1726,9 +1729,8 @@ void DTreesImpl::readParams( const FileNode& fn )
     if( !tparams_node.empty() ) // training parameters are not necessary
     {
         params0.useSurrogates = (int)tparams_node["use_surrogates"] != 0;
-        params0.maxCategories = (int)tparams_node["max_categories"];
+        params0.maxCategories = (int)(tparams_node["max_categories"].empty() ? 16 : tparams_node["max_categories"]);
         params0.regressionAccuracy = (float)tparams_node["regression_accuracy"];
-
         params0.maxDepth = (int)tparams_node["max_depth"];
         params0.minSampleCount = (int)tparams_node["min_sample_count"];
         params0.CVFolds = (int)tparams_node["cross_validation_folds"];
@@ -1741,13 +1743,83 @@ void DTreesImpl::readParams( const FileNode& fn )
         tparams_node["priors"] >> params0.priors;
     }
 
-    fn["var_idx"] >> varIdx;
+    readVectorOrMat(fn["var_idx"], varIdx);
     fn["var_type"] >> varType;
 
-    fn["cat_ofs"] >> catOfs;
-    fn["cat_map"] >> catMap;
-    fn["missing_subst"] >> missingSubst;
-    fn["class_labels"] >> classLabels;
+    int format = 0;
+    fn["format"] >> format;
+    bool isLegacy = format < 3;
+
+    int varAll = (int)fn["var_all"];
+    if (isLegacy && (int)varType.size() <= varAll)
+    {
+        std::vector<uchar> extendedTypes(varAll + 1, 0);
+
+        int i = 0, n;
+        if (!varIdx.empty())
+        {
+            n = (int)varIdx.size();
+            for (; i < n; ++i)
+            {
+                int var = varIdx[i];
+                extendedTypes[var] = varType[i];
+            }
+        }
+        else
+        {
+            n = (int)varType.size();
+            for (; i < n; ++i)
+            {
+                extendedTypes[i] = varType[i];
+            }
+        }
+        extendedTypes[varAll] = (uchar)(_isClassifier ? VAR_CATEGORICAL : VAR_ORDERED);
+        extendedTypes.swap(varType);
+    }
+
+    readVectorOrMat(fn["cat_map"], catMap);
+
+    if (isLegacy)
+    {
+        // generating "catOfs" from "cat_count"
+        catOfs.clear();
+        classLabels.clear();
+        std::vector<int> counts;
+        readVectorOrMat(fn["cat_count"], counts);
+        unsigned int i = 0, j = 0, curShift = 0, size = (int)varType.size() - 1;
+        for (; i < size; ++i)
+        {
+            Vec2i newOffsets(0, 0);
+            if (varType[i] == VAR_CATEGORICAL) // only categorical vars are represented in catMap
+            {
+                newOffsets[0] = curShift;
+                curShift += counts[j];
+                newOffsets[1] = curShift;
+                ++j;
+            }
+            catOfs.push_back(newOffsets);
+        }
+        // other elements in "catMap" are "classLabels"
+        if (curShift < catMap.size())
+        {
+            classLabels.insert(classLabels.end(), catMap.begin() + curShift, catMap.end());
+            catMap.erase(catMap.begin() + curShift, catMap.end());
+        }
+    }
+    else
+    {
+        fn["cat_ofs"] >> catOfs;
+        fn["missing_subst"] >> missingSubst;
+        fn["class_labels"] >> classLabels;
+    }
+
+    // init var mapping for node reading (var indexes or varIdx indexes)
+    bool globalVarIdx = false;
+    fn["global_var_idx"] >> globalVarIdx;
+    if (globalVarIdx || varIdx.empty())
+        setRangeVector(varMapping, (int)varType.size());
+    else
+        varMapping = varIdx;
 
     initCompVarIdx();
     setDParams(params0);
@@ -1759,6 +1831,7 @@ int DTreesImpl::readSplit( const FileNode& fn )
 
     int vi = (int)fn["var"];
     CV_Assert( 0 <= vi && vi <= (int)varType.size() );
+    vi = varMapping[vi]; // convert to varIdx if needed
     split.varIdx = vi;
 
     if( varType[vi] == VAR_CATEGORICAL ) // split on categorical var
diff --git a/modules/ml/test/test_save_load.cpp b/modules/ml/test/test_save_load.cpp
index bef2fd0e1c..74e8eef0df 100644
--- a/modules/ml/test/test_save_load.cpp
+++ b/modules/ml/test/test_save_load.cpp
@@ -158,6 +158,109 @@ TEST(ML_Boost, save_load) { CV_SLMLTest test( CV_BOOST ); test.safe_run(); }
 TEST(ML_RTrees, save_load) { CV_SLMLTest test( CV_RTREES ); test.safe_run(); }
 TEST(DISABLED_ML_ERTrees, save_load) { CV_SLMLTest test( CV_ERTREES ); test.safe_run(); }
 
+class CV_LegacyTest : public cvtest::BaseTest
+{
+public:
+    CV_LegacyTest(const std::string &_modelName, const std::string &_suffixes = std::string())
+        : cvtest::BaseTest(), modelName(_modelName), suffixes(_suffixes)
+    {
+    }
+    virtual ~CV_LegacyTest() {}
+protected:
+    void run(int)
+    {
+        unsigned int idx = 0;
+        for (;;)
+        {
+            if (idx >= suffixes.size())
+                break;
+            int found = (int)suffixes.find(';', idx);
+            string piece = suffixes.substr(idx, found - idx);
+            if (piece.empty())
+                break;
+            oneTest(piece);
+            idx += (unsigned int)piece.size() + 1;
+        }
+    }
+    void oneTest(const string & suffix)
+    {
+        using namespace cv::ml;
+
+        int code = cvtest::TS::OK;
+        string filename = ts->get_data_path() + "legacy/" + modelName + suffix;
+        bool isTree = modelName == CV_BOOST || modelName == CV_DTREE || modelName == CV_RTREES;
+        Ptr<StatModel> model;
+        if (modelName == CV_BOOST)
+            model = StatModel::load<Boost>(filename);
+        else if (modelName == CV_ANN)
+            model = StatModel::load<ANN_MLP>(filename);
+        else if (modelName == CV_DTREE)
+            model = StatModel::load<DTrees>(filename);
+        else if (modelName == CV_NBAYES)
+            model = StatModel::load<NormalBayesClassifier>(filename);
+        else if (modelName == CV_SVM)
+            model = StatModel::load<SVM>(filename);
+        else if (modelName == CV_RTREES)
+            model = StatModel::load<RTrees>(filename);
+        if (!model)
+        {
+            code = cvtest::TS::FAIL_INVALID_TEST_DATA;
+        }
+        else
+        {
+            Mat input = Mat(isTree ? 10 : 1, model->getVarCount(), CV_32F);
+            ts->get_rng().fill(input, RNG::UNIFORM, 0, 40);
+
+            if (isTree)
+                randomFillCategories(filename, input);
+
+            Mat output;
+            model->predict(input, output, StatModel::RAW_OUTPUT | (isTree ? DTrees::PREDICT_SUM : 0));
+            // just check if no internal assertions or errors thrown
+        }
+        ts->set_failed_test_info(code);
+    }
+    void randomFillCategories(const string & filename, Mat & input)
+    {
+        Mat catMap;
+        Mat catCount;
+        std::vector<uchar> varTypes;
+
+        FileStorage fs(filename, FileStorage::READ);
+        FileNode root = fs.getFirstTopLevelNode();
+        root["cat_map"] >> catMap;
+        root["cat_count"] >> catCount;
+        root["var_type"] >> varTypes;
+
+        int offset = 0;
+        int countOffset = 0;
+        uint var = 0, varCount = (uint)varTypes.size();
+        for (; var < varCount; ++var)
+        {
+            if (varTypes[var] == ml::VAR_CATEGORICAL)
+            {
+                int size = catCount.at<int>(0, countOffset);
+                for (int row = 0; row < input.rows; ++row)
+                {
+                    int randomChosenIndex = offset + ((uint)ts->get_rng()) % size;
+                    int value = catMap.at<int>(0, randomChosenIndex);
+                    input.at<float>(row, var) = (float)value;
+                }
+                offset += size;
+                ++countOffset;
+            }
+        }
+    }
+    string modelName;
+    string suffixes;
+};
+
+TEST(ML_ANN, legacy_load) { CV_LegacyTest test(CV_ANN, "_waveform.xml"); test.safe_run(); }
+TEST(ML_Boost, legacy_load) { CV_LegacyTest test(CV_BOOST, "_adult.xml;_1.xml;_2.xml;_3.xml"); test.safe_run(); }
+TEST(ML_DTree, legacy_load) { CV_LegacyTest test(CV_DTREE, "_abalone.xml;_mushroom.xml"); test.safe_run(); }
+TEST(ML_NBayes, legacy_load) { CV_LegacyTest test(CV_NBAYES, "_waveform.xml"); test.safe_run(); }
+TEST(ML_SVM, legacy_load) { CV_LegacyTest test(CV_SVM, "_poletelecomm.xml;_waveform.xml"); test.safe_run(); }
+TEST(ML_RTrees, legacy_load) { CV_LegacyTest test(CV_RTREES, "_waveform.xml"); test.safe_run(); }
 
 /*TEST(ML_SVM, throw_exception_when_save_untrained_model)
 {

From 38d37d2b007a60bc74c02526753f478ca831f523 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Wed, 31 Dec 2014 11:59:53 +0300
Subject: [PATCH 5/7] Fixed uninitialized memory usage in ANN_MLP

---
 modules/ml/src/ann_mlp.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index ef52801317..08b5c5f959 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -103,6 +103,7 @@ public:
 
     ANN_MLPImpl( const Params& p )
     {
+        clear();
         setParams(p);
     }
 
@@ -126,6 +127,7 @@ public:
         rng = RNG((uint64)-1);
         weights.clear();
         trained = false;
+        max_buf_sz = 1 << 12;
     }
 
     int layer_count() const { return (int)layer_sizes.size(); }

From d8272b9395828fac0dd406d88844b7d70cb330b5 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Mon, 12 Jan 2015 10:58:43 +0300
Subject: [PATCH 6/7] remove "-target-os-variant" NVCC option for CUDA 7.0

it is marked as obsolete
---
 cmake/FindCUDA.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index eac12ff67b..ccfc4b93b3 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -1098,7 +1098,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
     set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
   endif()
 
-  if(CUDA_TARGET_OS_VARIANT)
+  if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0")
     set(nvcc_flags ${nvcc_flags} "-target-os-variant=${CUDA_TARGET_OS_VARIANT}")
   endif()
 

From 7b0d1c932e502a7f157222cb683531e2849de393 Mon Sep 17 00:00:00 2001
From: Adam Borowski <kilobyte@angband.pl>
Date: Tue, 13 Jan 2015 01:52:56 +0100
Subject: [PATCH 7/7] Get rid of sysctl includes on Linux.

The 'sysctl' syscall has been strongly deprecated on Linux for ages.
Currently, on old architectures it will spam syslog whenever used, and on
newer ones it's missing from the headers altogether.  Opencv has migrated
away on Linux already, but #includes were left lingering.  This commit
removes them on non-__APPLE__, unbreaking x32 (and probably others).
---
 modules/core/src/parallel.cpp           | 2 +-
 modules/core/src/system.cpp             | 2 --
 modules/videoio/src/cap_ffmpeg_impl.hpp | 2 ++
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 5a51230cef..1fb980d922 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -56,7 +56,7 @@
     #include <sys/types.h>
     #if defined ANDROID
         #include <sys/sysconf.h>
-    #else
+    #elif defined __APPLE__
         #include <sys/sysctl.h>
     #endif
 #endif
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d9a20873f6..1f9d686723 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -163,8 +163,6 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
 #include <sys/types.h>
 #if defined ANDROID
 #include <sys/sysconf.h>
-#else
-#include <sys/sysctl.h>
 #endif
 #endif
 
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index bd967f963d..5870f4bc71 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -139,8 +139,10 @@ extern "C" {
     #include <unistd.h>
     #include <stdio.h>
     #include <sys/types.h>
+#if defined __APPLE__
     #include <sys/sysctl.h>
 #endif
+#endif
 
 #ifndef MIN
 #define MIN(a, b) ((a) < (b) ? (a) : (b))