diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index eac12ff67b..ccfc4b93b3 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -1098,7 +1098,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
     set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
   endif()
 
-  if(CUDA_TARGET_OS_VARIANT)
+  if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0")
     set(nvcc_flags ${nvcc_flags} "-target-os-variant=${CUDA_TARGET_OS_VARIANT}")
   endif()
 
diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index a97388bd05..9fff4ee281 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -130,6 +130,12 @@ namespace cv { namespace cuda
     class NppStreamHandler
     {
     public:
+        inline explicit NppStreamHandler(Stream& newStream)
+        {
+            oldStream = nppGetStream();
+            nppSetStream(StreamAccessor::getStream(newStream));
+        }
+
         inline explicit NppStreamHandler(cudaStream_t newStream)
         {
             oldStream = nppGetStream();
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 5a51230cef..1fb980d922 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -56,7 +56,7 @@
     #include <sys/types.h>
     #if defined ANDROID
         #include <sys/sysconf.h>
-    #else
+    #elif defined __APPLE__
         #include <sys/sysctl.h>
     #endif
 #endif
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d9a20873f6..1f9d686723 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -163,8 +163,6 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
 #include <sys/types.h>
 #if defined ANDROID
 #include <sys/sysconf.h>
-#else
-#include <sys/sysctl.h>
 #endif
 #endif
 
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index a5fbfb81fa..d84ae24534 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -205,19 +205,11 @@ CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stre
 
 @param src Source image with CV_8UC1 type.
 @param dst Destination image.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
 
 @sa equalizeHist
  */
-CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::equalizeHist(src, dst, buf, stream);
-}
+CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
 /** @brief Base class for Contrast Limited Adaptive Histogram Equalization. :
  */
@@ -248,8 +240,9 @@ CV_EXPORTS Ptr<cuda::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSi
 @param nLevels Number of computed levels. nLevels must be at least 2.
 @param lowerLevel Lower boundary value of the lowest level.
 @param upperLevel Upper boundary value of the greatest level.
+@param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel);
+CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 
 /** @brief Calculates a histogram with evenly distributed bins.
 
@@ -259,27 +252,11 @@ a four-channel image, all channels are processed separately.
 @param histSize Size of the histogram.
 @param lowerLevel Lower boundary of lowest-level bin.
 @param upperLevel Upper boundary of highest-level bin.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
+CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
 
 /** @brief Calculates a histogram with bins determined by the levels array.
 
@@ -287,27 +264,11 @@ static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int
 For a four-channel image, all channels are processed separately.
 @param hist Destination histogram with one row, (levels.cols-1) columns, and the CV_32SC1 type.
 @param levels Number of levels in the histogram.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
 @param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
+CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
 
 //! @} cudaimgproc_hist
 
@@ -321,15 +282,17 @@ public:
     /** @brief Finds edges in an image using the @cite Canny86 algorithm.
 
     @param image Single-channel 8-bit input image.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void detect(InputArray image, OutputArray edges) = 0;
+    virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0;
     /** @overload
     @param dx First derivative of image in the vertical direction. Support only CV_32S type.
     @param dy First derivative of image in the horizontal direction. Support only CV_32S type.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
     */
-    virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0;
+    virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0;
 
     virtual void setLowThreshold(double low_thresh) = 0;
     virtual double getLowThreshold() const = 0;
@@ -376,18 +339,20 @@ public:
     \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
     the image). \f$\theta\f$ is the line rotation angle in radians (
     \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
+    @param stream Stream for the asynchronous version.
 
     @sa HoughLines
      */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;
 
     /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory.
 
     @param d_lines Result of cuda::HoughLinesDetector::detect .
     @param h_lines Output host array.
     @param h_votes Optional output array for line's votes.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0;
+    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0;
 
     virtual void setRho(float rho) = 0;
     virtual float getRho() const = 0;
@@ -431,10 +396,11 @@ public:
     @param lines Output vector of lines. Each line is represented by a 4-element vector
     \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
     line segment.
+    @param stream Stream for the asynchronous version.
 
     @sa HoughLinesP
      */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;
 
     virtual void setRho(float rho) = 0;
     virtual float getRho() const = 0;
@@ -475,10 +441,11 @@ public:
     @param src 8-bit, single-channel grayscale input image.
     @param circles Output vector of found circles. Each vector is encoded as a 3-element
     floating-point vector \f$(x, y, radius)\f$ .
+    @param stream Stream for the asynchronous version.
 
     @sa HoughCircles
      */
-    virtual void detect(InputArray src, OutputArray circles) = 0;
+    virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0;
 
     virtual void setDp(float dp) = 0;
     virtual float getDp() const = 0;
@@ -593,8 +560,9 @@ public:
     positions).
     @param mask Optional region of interest. If the image is not empty (it needs to have the type
     CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+    @param stream Stream for the asynchronous version.
      */
-    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
+    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0;
 };
 
 /** @brief Creates implementation for cuda::CornersDetector .
@@ -630,7 +598,7 @@ as src .
 @param sp Spatial window radius.
 @param sr Color window radius.
 @param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.
 
 It maps each point of the source image into another point. As a result, you have a new color and new
 position of each point.
@@ -650,7 +618,7 @@ src size. The type is CV_16SC2 .
 @param sp Spatial window radius.
 @param sr Color window radius.
 @param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.
 
 @sa cuda::meanShiftFiltering
  */
@@ -666,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts
 @param sr Color window radius.
 @param minsize Minimum segment size. Smaller segments are merged.
 @param criteria Termination criteria. See TermCriteria.
+@param stream Stream for the asynchronous version.
  */
 CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize,
-                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
+                                      Stream& stream = Stream::Null());
 
 /////////////////////////// Match Template ////////////////////////////
 
diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp
index 0e020394a1..c638ce0ce5 100644
--- a/modules/cudaimgproc/perf/perf_histogram.cpp
+++ b/modules/cudaimgproc/perf/perf_histogram.cpp
@@ -63,9 +63,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC1,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::histEven(d_src, dst, d_buf, 30, 0, 180);
+        TEST_CYCLE() cv::cuda::histEven(d_src, dst, 30, 0, 180);
 
         CUDA_SANITY_CHECK(dst);
     }
@@ -106,9 +105,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC4,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat d_hist[4];
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel);
+        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, histSize, lowerLevel, upperLevel);
 
         cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3;
         d_hist[0].download(cpu_hist0);
@@ -167,9 +165,8 @@ PERF_TEST_P(Sz, EqualizeHist,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst, d_buf);
+        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst);
 
         CUDA_SANITY_CHECK(dst);
     }
diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp
index eed4a284e5..1e52bd295a 100644
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -53,16 +53,16 @@ Ptr<CannyEdgeDetector> cv::cuda::createCannyEdgeDetector(double, double, int, bo
 
 namespace canny
 {
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);
 
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
 
-    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
 
 namespace
@@ -76,8 +76,8 @@ namespace
             old_apperture_size_ = -1;
         }
 
-        void detect(InputArray image, OutputArray edges);
-        void detect(InputArray dx, InputArray dy, OutputArray edges);
+        void detect(InputArray image, OutputArray edges, Stream& stream);
+        void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream);
 
         void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; }
         double getLowThreshold() const { return low_thresh_; }
@@ -111,7 +111,7 @@ namespace
 
     private:
         void createBuf(Size image_size);
-        void CannyCaller(GpuMat& edges);
+        void CannyCaller(GpuMat& edges, Stream& stream);
 
         double low_thresh_;
         double high_thresh_;
@@ -128,7 +128,7 @@ namespace
         int old_apperture_size_;
     };
 
-    void CannyImpl::detect(InputArray _image, OutputArray _edges)
+    void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
     {
         GpuMat image = _image.getGpuMat();
 
@@ -150,24 +150,24 @@ namespace
             image.locateROI(wholeSize, ofs);
             GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step);
 
-            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
         }
         else
         {
 #ifndef HAVE_OPENCV_CUDAFILTERS
             throw_no_cuda();
 #else
-            filterDX_->apply(image, dx_);
-            filterDY_->apply(image, dy_);
+            filterDX_->apply(image, dx_, stream);
+            filterDY_->apply(image, dy_, stream);
 
-            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
 #endif
         }
 
-        CannyCaller(edges);
+        CannyCaller(edges, stream);
     }
 
-    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges)
+    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream)
     {
         GpuMat dx = _dx.getGpuMat();
         GpuMat dy = _dy.getGpuMat();
@@ -176,8 +176,8 @@ namespace
         CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() );
         CV_Assert( deviceSupports(SHARED_ATOMICS) );
 
-        dx.copyTo(dx_);
-        dy.copyTo(dy_);
+        dx.copyTo(dx_, stream);
+        dy.copyTo(dy_, stream);
 
         if (low_thresh_ > high_thresh_)
             std::swap(low_thresh_, high_thresh_);
@@ -187,9 +187,9 @@ namespace
         _edges.create(dx.size(), CV_8UC1);
         GpuMat edges = _edges.getGpuMat();
 
-        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
 
-        CannyCaller(edges);
+        CannyCaller(edges, stream);
     }
 
     void CannyImpl::createBuf(Size image_size)
@@ -215,16 +215,16 @@ namespace
         ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_);
     }
 
-    void CannyImpl::CannyCaller(GpuMat& edges)
+    void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
     {
         map_.setTo(Scalar::all(0));
-        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
+        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>());
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
 
-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>());
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
 
-        canny::getEdges(map_, edges);
+        canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
     }
 }
 
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 3d770e179d..e0ba515693 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -120,7 +120,7 @@ namespace canny
         mag(y, x) = norm(dxVal, dyVal);
     }
 
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
@@ -131,30 +131,31 @@ namespace canny
         if (L2Grad)
         {
             L2 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
         }
         else
         {
             L1 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
         }
 
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall(cudaThreadSynchronize());
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
     {
         if (L2Grad)
         {
             L2 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
         }
         else
         {
             L1 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
         }
     }
 }
@@ -217,17 +218,18 @@ namespace canny
         map(y, x) = edge_type;
     }
 
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
 
         bindTexture(&tex_mag, mag);
 
-        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -328,20 +330,21 @@ namespace canny
         }
     }
 
-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
 
-        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
 
         const dim3 block(16, 16);
         const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
 
-        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
         cudaSafeCall( cudaGetLastError() );
 
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
@@ -441,27 +444,30 @@ namespace canny
         }
     }
 
-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
 
         int count;
-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaStreamSynchronize(stream) );
 
         while (count > 0)
         {
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
 
             const dim3 block(128);
             const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
 
-            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
 
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             count = min(count, map.cols * map.rows);
 
@@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device
 
 namespace canny
 {
-    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
     {
-        transform(map, dst, GetEdges(), WithOutMask(), 0);
+        transform(map, dst, GetEdges(), WithOutMask(), stream);
     }
 }
 
diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp
index 243665083a..162ee469ce 100644
--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -68,7 +68,7 @@ namespace
         GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                     int blockSize, bool useHarrisDetector, double harrisK);
 
-        void detect(InputArray image, OutputArray corners, InputArray mask = noArray());
+        void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream);
 
     private:
         int maxCorners_;
@@ -96,8 +96,11 @@ namespace
                     cuda::createMinEigenValCorner(srcType, blockSize, 3);
     }
 
-    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask)
+    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::gfft;
 
         GpuMat image = _image.getGpuMat();
diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp
index d63e57de31..e942e9eb86 100644
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -49,11 +49,11 @@ using namespace cv::cuda;
 
 void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::cuda::CLAHE>(); }
 
-void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); }
+void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 
 void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); }
@@ -93,7 +93,7 @@ namespace hist
     void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
 }
 
-void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream)
+void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
 {
     GpuMat src = _src.getGpuMat();
 
@@ -107,8 +107,8 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray
 
     size_t bufSize = intBufSize + 2 * 256 * sizeof(int);
 
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, _buf);
-    GpuMat buf = _buf.getGpuMat();
+    BufferPool pool(_stream);
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
 
     GpuMat hist(1, 256, CV_32SC1, buf.data);
     GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
@@ -288,7 +288,7 @@ namespace
     {
         typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
 
-        static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
         {
             const int levels = histSize + 1;
 
@@ -302,15 +302,15 @@ namespace
             int buf_size;
             get_buf_size(sz, levels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
                 lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
 
-            if (stream == 0)
+            if (!stream)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
@@ -319,7 +319,7 @@ namespace
     {
         typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
 
-        static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
         {
             int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
             hist[0].create(1, histSize[0], CV_32S);
@@ -336,14 +336,14 @@ namespace
             int buf_size;
             get_buf_size(sz, levels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
             nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
 
-            if (stream == 0)
+            if (!stream)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
@@ -392,7 +392,7 @@ namespace
         typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
         enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
 
-        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, Stream& stream)
         {
             CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 );
 
@@ -406,8 +406,8 @@ namespace
             int buf_size;
             get_buf_size(sz, levels.cols, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
@@ -424,7 +424,7 @@ namespace
         typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
         enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
 
-        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
         {
             CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 );
             CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 );
@@ -447,8 +447,8 @@ namespace
             int buf_size;
             get_buf_size(sz, nLevels, &buf_size);
 
-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);
 
             NppStreamHandler h(stream);
 
@@ -460,7 +460,7 @@ namespace
     };
 }
 
-void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel)
+void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream)
 {
     const int kind = _levels.kind();
 
@@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
     nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
 
     if (kind == _InputArray::CUDA_GPU_MAT)
-        _levels.getGpuMatRef().upload(host_levels);
+        _levels.getGpuMatRef().upload(host_levels, stream);
 }
 
 namespace hist
@@ -493,9 +493,9 @@ namespace
     }
 }
 
-void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
+void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
@@ -514,12 +514,12 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf,
 
     CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
 
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }
 
-void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
+void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
@@ -532,12 +532,12 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, i
 
     CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
 
-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }
 
-void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
@@ -553,12 +553,12 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels,
 
     CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 );
 
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }
 
-void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream);
     static const hist_t hist_callers[] =
     {
         NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
@@ -573,7 +573,7 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4]
 
     CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 );
 
-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp
index 3f9b9334c5..6bdaf16a2d 100644
--- a/modules/cudaimgproc/src/hough_circles.cpp
+++ b/modules/cudaimgproc/src/hough_circles.cpp
@@ -74,7 +74,7 @@ namespace
     public:
         HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
 
-        void detect(InputArray src, OutputArray circles);
+        void detect(InputArray src, OutputArray circles, Stream& stream);
 
         void setDp(float dp) { dp_ = dp; }
         float getDp() const { return dp_; }
@@ -154,8 +154,11 @@ namespace
         filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
     }
 
-    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles)
+    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_circles;
 
diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp
index b9f159a9cb..7b9c082942 100644
--- a/modules/cudaimgproc/src/hough_lines.cpp
+++ b/modules/cudaimgproc/src/hough_lines.cpp
@@ -75,8 +75,8 @@ namespace
         {
         }
 
-        void detect(InputArray src, OutputArray lines);
-        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
+        void detect(InputArray src, OutputArray lines, Stream& stream);
+        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);
 
         void setRho(float rho) { rho_ = rho; }
         float getRho() const { return rho_; }
@@ -125,8 +125,11 @@ namespace
         GpuMat result_;
     };
 
-    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_lines;
 
@@ -170,7 +173,7 @@ namespace
         result_.copyTo(lines);
     }
 
-    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes)
+    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)
     {
         GpuMat d_lines = _d_lines.getGpuMat();
 
@@ -184,12 +187,18 @@ namespace
 
         CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 );
 
-        d_lines.row(0).download(h_lines);
+        if (stream)
+            d_lines.row(0).download(h_lines, stream);
+        else
+            d_lines.row(0).download(h_lines);
 
         if (h_votes.needed())
         {
             GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr<int>(1));
-            d_votes.download(h_votes);
+            if (stream)
+                d_votes.download(h_votes, stream);
+            else
+                d_votes.download(h_votes);
         }
     }
 }
diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp
index 2434f6d26d..e3e34ec3d0 100644
--- a/modules/cudaimgproc/src/hough_segments.cpp
+++ b/modules/cudaimgproc/src/hough_segments.cpp
@@ -79,7 +79,7 @@ namespace
         {
         }
 
-        void detect(InputArray src, OutputArray lines);
+        void detect(InputArray src, OutputArray lines, Stream& stream);
 
         void setRho(float rho) { rho_ = rho; }
         float getRho() const { return rho_; }
@@ -128,8 +128,11 @@ namespace
         GpuMat result_;
     };
 
-    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
     {
+        // TODO : implement async version
+        (void) stream;
+
         using namespace cv::cuda::device::hough;
         using namespace cv::cuda::device::hough_lines;
         using namespace cv::cuda::device::hough_segments;
diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp
index ad5819800e..54926f3777 100644
--- a/modules/cudaimgproc/src/mssegmentation.cpp
+++ b/modules/cudaimgproc/src/mssegmentation.cpp
@@ -43,7 +43,7 @@
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); }
+void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
 } // anonymous namespace
 
 
-void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria)
+void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream)
 {
     GpuMat src = _src.getGpuMat();
 
@@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp,
 
     // Perform mean shift procedure and obtain region and spatial maps
     GpuMat d_rmap, d_spmap;
-    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
+    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream);
+
+    stream.waitForCompletion();
+
     Mat rmap(d_rmap);
     Mat spmap(d_spmap);
 
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 3e7d44e87c..08b5c5f959 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -103,6 +103,7 @@ public:
 
     ANN_MLPImpl( const Params& p )
     {
+        clear();
         setParams(p);
     }
 
@@ -126,6 +127,7 @@ public:
         rng = RNG((uint64)-1);
         weights.clear();
         trained = false;
+        max_buf_sz = 1 << 12;
     }
 
     int layer_count() const { return (int)layer_sizes.size(); }
@@ -1241,7 +1243,7 @@ public:
         clear();
 
         vector<int> _layer_sizes;
-        fn["layer_sizes"] >> _layer_sizes;
+        readVectorOrMat(fn["layer_sizes"], _layer_sizes);
         create( _layer_sizes );
 
         int i, l_count = layer_count();
diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp
index 5e0b307338..236cd97a2d 100644
--- a/modules/ml/src/boost.cpp
+++ b/modules/ml/src/boost.cpp
@@ -434,13 +434,17 @@ public:
         bparams.priors = params0.priors;
 
         FileNode tparams_node = fn["training_params"];
-        String bts = (String)tparams_node["boosting_type"];
+        // check for old layout
+        String bts = (String)(fn["boosting_type"].empty() ?
+                         tparams_node["boosting_type"] : fn["boosting_type"]);
         bparams.boostType = (bts == "DiscreteAdaboost" ? Boost::DISCRETE :
                              bts == "RealAdaboost" ? Boost::REAL :
                              bts == "LogitBoost" ? Boost::LOGIT :
                              bts == "GentleAdaboost" ? Boost::GENTLE : -1);
         _isClassifier = bparams.boostType == Boost::DISCRETE;
-        bparams.weightTrimRate = (double)tparams_node["weight_trimming_rate"];
+        // check for old layout
+        bparams.weightTrimRate = (double)(fn["weight_trimming_rate"].empty() ?
+                                    tparams_node["weight_trimming_rate"] : fn["weight_trimming_rate"]);
     }
 
     void read( const FileNode& fn )
diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp
index 6b5ceb4881..d2ac18ff01 100644
--- a/modules/ml/src/data.cpp
+++ b/modules/ml/src/data.cpp
@@ -898,7 +898,7 @@ public:
 
         CV_Assert( m > 0 ); // if m==0, vi is an ordered variable
         const int* cmap = &catMap.at<int>(ofs[0]);
-        bool fastMap = (m == cmap[m] - cmap[0]);
+        bool fastMap = (m == cmap[m - 1] - cmap[0] + 1);
 
         if( fastMap )
         {
diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp
index dbc21ff092..561abbaeb8 100644
--- a/modules/ml/src/inner_functions.cpp
+++ b/modules/ml/src/inner_functions.cpp
@@ -115,6 +115,7 @@ void StatModel::save(const String& filename) const
 {
     FileStorage fs(filename, FileStorage::WRITE);
     fs << getDefaultModelName() << "{";
+    fs << "format" << (int)3;
     write(fs);
     fs << "}";
 }
diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp
index d308ae98ec..69ff03047e 100644
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -263,11 +263,27 @@ namespace ml
         vector<int> subsets;
         vector<int> classLabels;
         vector<float> missingSubst;
+        vector<int> varMapping;
         bool _isClassifier;
 
         Ptr<WorkData> w;
     };
 
+    template <typename T>
+    static inline void readVectorOrMat(const FileNode & node, std::vector<T> & v)
+    {
+        if (node.type() == FileNode::MAP)
+        {
+            Mat m;
+            node >> m;
+            m.copyTo(v);
+        }
+        else if (node.type() == FileNode::SEQ)
+        {
+            node >> v;
+        }
+    }
+
 }}
 
 #endif /* __OPENCV_ML_PRECOMP_HPP__ */
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index 7c9cbaf268..7441faac17 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -346,7 +346,7 @@ public:
         oobError = (double)fn["oob_error"];
         int ntrees = (int)fn["ntrees"];
 
-        fn["var_importance"] >> varImportance;
+        readVectorOrMat(fn["var_importance"], varImportance);
 
         readParams(fn);
 
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index c7c32f0be5..a0df44f78b 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2038,7 +2038,8 @@ public:
     {
         Params _params;
 
-        String svm_type_str = (String)fn["svmType"];
+        // check for old naming
+        String svm_type_str = (String)(fn["svm_type"].empty() ? fn["svmType"] : fn["svm_type"]);
         int svmType =
             svm_type_str == "C_SVC" ? C_SVC :
             svm_type_str == "NU_SVC" ? NU_SVC :
diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp
index 416abd9364..64f66169b0 100644
--- a/modules/ml/src/tree.cpp
+++ b/modules/ml/src/tree.cpp
@@ -1597,7 +1597,10 @@ void DTreesImpl::writeParams(FileStorage& fs) const
     fs << "}";
 
     if( !varIdx.empty() )
+    {
+        fs << "global_var_idx" << 1;
         fs << "var_idx" << varIdx;
+    }
 
     fs << "var_type" << varType;
 
@@ -1726,9 +1729,8 @@ void DTreesImpl::readParams( const FileNode& fn )
     if( !tparams_node.empty() ) // training parameters are not necessary
     {
         params0.useSurrogates = (int)tparams_node["use_surrogates"] != 0;
-        params0.maxCategories = (int)tparams_node["max_categories"];
+        params0.maxCategories = (int)(tparams_node["max_categories"].empty() ? 16 : tparams_node["max_categories"]);
         params0.regressionAccuracy = (float)tparams_node["regression_accuracy"];
-
         params0.maxDepth = (int)tparams_node["max_depth"];
         params0.minSampleCount = (int)tparams_node["min_sample_count"];
         params0.CVFolds = (int)tparams_node["cross_validation_folds"];
@@ -1741,13 +1743,83 @@ void DTreesImpl::readParams( const FileNode& fn )
         tparams_node["priors"] >> params0.priors;
     }
 
-    fn["var_idx"] >> varIdx;
+    readVectorOrMat(fn["var_idx"], varIdx);
     fn["var_type"] >> varType;
 
-    fn["cat_ofs"] >> catOfs;
-    fn["cat_map"] >> catMap;
-    fn["missing_subst"] >> missingSubst;
-    fn["class_labels"] >> classLabels;
+    int format = 0;
+    fn["format"] >> format;
+    bool isLegacy = format < 3;
+
+    int varAll = (int)fn["var_all"];
+    if (isLegacy && (int)varType.size() <= varAll)
+    {
+        std::vector<uchar> extendedTypes(varAll + 1, 0);
+
+        int i = 0, n;
+        if (!varIdx.empty())
+        {
+            n = (int)varIdx.size();
+            for (; i < n; ++i)
+            {
+                int var = varIdx[i];
+                extendedTypes[var] = varType[i];
+            }
+        }
+        else
+        {
+            n = (int)varType.size();
+            for (; i < n; ++i)
+            {
+                extendedTypes[i] = varType[i];
+            }
+        }
+        extendedTypes[varAll] = (uchar)(_isClassifier ? VAR_CATEGORICAL : VAR_ORDERED);
+        extendedTypes.swap(varType);
+    }
+
+    readVectorOrMat(fn["cat_map"], catMap);
+
+    if (isLegacy)
+    {
+        // generating "catOfs" from "cat_count"
+        catOfs.clear();
+        classLabels.clear();
+        std::vector<int> counts;
+        readVectorOrMat(fn["cat_count"], counts);
+        unsigned int i = 0, j = 0, curShift = 0, size = (int)varType.size() - 1;
+        for (; i < size; ++i)
+        {
+            Vec2i newOffsets(0, 0);
+            if (varType[i] == VAR_CATEGORICAL) // only categorical vars are represented in catMap
+            {
+                newOffsets[0] = curShift;
+                curShift += counts[j];
+                newOffsets[1] = curShift;
+                ++j;
+            }
+            catOfs.push_back(newOffsets);
+        }
+        // other elements in "catMap" are "classLabels"
+        if (curShift < catMap.size())
+        {
+            classLabels.insert(classLabels.end(), catMap.begin() + curShift, catMap.end());
+            catMap.erase(catMap.begin() + curShift, catMap.end());
+        }
+    }
+    else
+    {
+        fn["cat_ofs"] >> catOfs;
+        fn["missing_subst"] >> missingSubst;
+        fn["class_labels"] >> classLabels;
+    }
+
+    // init var mapping for node reading (var indexes or varIdx indexes)
+    bool globalVarIdx = false;
+    fn["global_var_idx"] >> globalVarIdx;
+    if (globalVarIdx || varIdx.empty())
+        setRangeVector(varMapping, (int)varType.size());
+    else
+        varMapping = varIdx;
 
     initCompVarIdx();
     setDParams(params0);
@@ -1759,6 +1831,7 @@ int DTreesImpl::readSplit( const FileNode& fn )
 
     int vi = (int)fn["var"];
     CV_Assert( 0 <= vi && vi <= (int)varType.size() );
+    vi = varMapping[vi]; // convert to varIdx if needed
     split.varIdx = vi;
 
     if( varType[vi] == VAR_CATEGORICAL ) // split on categorical var
diff --git a/modules/ml/test/test_save_load.cpp b/modules/ml/test/test_save_load.cpp
index bef2fd0e1c..74e8eef0df 100644
--- a/modules/ml/test/test_save_load.cpp
+++ b/modules/ml/test/test_save_load.cpp
@@ -158,6 +158,109 @@ TEST(ML_Boost, save_load) { CV_SLMLTest test( CV_BOOST ); test.safe_run(); }
 TEST(ML_RTrees, save_load) { CV_SLMLTest test( CV_RTREES ); test.safe_run(); }
 TEST(DISABLED_ML_ERTrees, save_load) { CV_SLMLTest test( CV_ERTREES ); test.safe_run(); }
 
+class CV_LegacyTest : public cvtest::BaseTest
+{
+public:
+    CV_LegacyTest(const std::string &_modelName, const std::string &_suffixes = std::string())
+        : cvtest::BaseTest(), modelName(_modelName), suffixes(_suffixes)
+    {
+    }
+    virtual ~CV_LegacyTest() {}
+protected:
+    void run(int)
+    {
+        unsigned int idx = 0;
+        for (;;)
+        {
+            if (idx >= suffixes.size())
+                break;
+            int found = (int)suffixes.find(';', idx);
+            string piece = suffixes.substr(idx, found - idx);
+            if (piece.empty())
+                break;
+            oneTest(piece);
+            idx += (unsigned int)piece.size() + 1;
+        }
+    }
+    void oneTest(const string & suffix)
+    {
+        using namespace cv::ml;
+
+        int code = cvtest::TS::OK;
+        string filename = ts->get_data_path() + "legacy/" + modelName + suffix;
+        bool isTree = modelName == CV_BOOST || modelName == CV_DTREE || modelName == CV_RTREES;
+        Ptr<StatModel> model;
+        if (modelName == CV_BOOST)
+            model = StatModel::load<Boost>(filename);
+        else if (modelName == CV_ANN)
+            model = StatModel::load<ANN_MLP>(filename);
+        else if (modelName == CV_DTREE)
+            model = StatModel::load<DTrees>(filename);
+        else if (modelName == CV_NBAYES)
+            model = StatModel::load<NormalBayesClassifier>(filename);
+        else if (modelName == CV_SVM)
+            model = StatModel::load<SVM>(filename);
+        else if (modelName == CV_RTREES)
+            model = StatModel::load<RTrees>(filename);
+        if (!model)
+        {
+            code = cvtest::TS::FAIL_INVALID_TEST_DATA;
+        }
+        else
+        {
+            Mat input = Mat(isTree ? 10 : 1, model->getVarCount(), CV_32F);
+            ts->get_rng().fill(input, RNG::UNIFORM, 0, 40);
+
+            if (isTree)
+                randomFillCategories(filename, input);
+
+            Mat output;
+            model->predict(input, output, StatModel::RAW_OUTPUT | (isTree ? DTrees::PREDICT_SUM : 0));
+            // just check if no internal assertions or errors thrown
+        }
+        ts->set_failed_test_info(code);
+    }
+    void randomFillCategories(const string & filename, Mat & input)
+    {
+        Mat catMap;
+        Mat catCount;
+        std::vector<uchar> varTypes;
+
+        FileStorage fs(filename, FileStorage::READ);
+        FileNode root = fs.getFirstTopLevelNode();
+        root["cat_map"] >> catMap;
+        root["cat_count"] >> catCount;
+        root["var_type"] >> varTypes;
+
+        int offset = 0;
+        int countOffset = 0;
+        uint var = 0, varCount = (uint)varTypes.size();
+        for (; var < varCount; ++var)
+        {
+            if (varTypes[var] == ml::VAR_CATEGORICAL)
+            {
+                int size = catCount.at<int>(0, countOffset);
+                for (int row = 0; row < input.rows; ++row)
+                {
+                    int randomChosenIndex = offset + ((uint)ts->get_rng()) % size;
+                    int value = catMap.at<int>(0, randomChosenIndex);
+                    input.at<float>(row, var) = (float)value;
+                }
+                offset += size;
+                ++countOffset;
+            }
+        }
+    }
+    string modelName;
+    string suffixes;
+};
+
+TEST(ML_ANN, legacy_load) { CV_LegacyTest test(CV_ANN, "_waveform.xml"); test.safe_run(); }
+TEST(ML_Boost, legacy_load) { CV_LegacyTest test(CV_BOOST, "_adult.xml;_1.xml;_2.xml;_3.xml"); test.safe_run(); }
+TEST(ML_DTree, legacy_load) { CV_LegacyTest test(CV_DTREE, "_abalone.xml;_mushroom.xml"); test.safe_run(); }
+TEST(ML_NBayes, legacy_load) { CV_LegacyTest test(CV_NBAYES, "_waveform.xml"); test.safe_run(); }
+TEST(ML_SVM, legacy_load) { CV_LegacyTest test(CV_SVM, "_poletelecomm.xml;_waveform.xml"); test.safe_run(); }
+TEST(ML_RTrees, legacy_load) { CV_LegacyTest test(CV_RTREES, "_waveform.xml"); test.safe_run(); }
 
 /*TEST(ML_SVM, throw_exception_when_save_untrained_model)
 {
diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl
index 704dec4447..4fae320d02 100644
--- a/modules/objdetect/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
@@ -48,7 +48,7 @@
 #define CELLS_PER_BLOCK_X 2
 #define CELLS_PER_BLOCK_Y 2
 #define NTHREADS 256
-#define CV_PI_F 3.1415926535897932384626433832795f
+#define CV_PI_F M_PI_F
 
 #ifdef INTEL_DEVICE
 #define QANGLE_TYPE     int
@@ -606,23 +606,23 @@ __kernel void compute_gradients_8UC4_kernel(
     barrier(CLK_LOCAL_MEM_FENCE);
     if (x < width)
     {
-        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
-            sh_row[tid + 2 * (NTHREADS + 2)]);
-        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
-            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+        float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
+            sh_row[tid + 2 * (NTHREADS + 2)], 0);
+        float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0);
 
-        float3 dx;
+        float4 dx;
         if (correct_gamma == 1)
             dx = sqrt(b) - sqrt(a);
         else
             dx = b - a;
 
-        float3 dy = (float3) 0.f;
+        float4 dy = (float4) 0.f;
 
         if (gidY > 0 && gidY < height - 1)
         {
-            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
-            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
+            a = convert_float4(img[(gidY - 1) * img_step + x].xyzw);
+            b = convert_float4(img[(gidY + 1) * img_step + x].xyzw);
 
             if (correct_gamma == 1)
                 dy = sqrt(b) - sqrt(a);
@@ -630,28 +630,25 @@ __kernel void compute_gradients_8UC4_kernel(
                 dy = b - a;
         }
 
+        float4 mag = hypot(dx, dy);
         float best_dx = dx.x;
         float best_dy = dy.x;
 
-        float mag0 = dx.x * dx.x + dy.x * dy.x;
-        float mag1 = dx.y * dx.y + dy.y * dy.y;
-        if (mag0 < mag1)
+        float mag0 = mag.x;
+        if (mag0 < mag.y)
         {
             best_dx = dx.y;
             best_dy = dy.y;
-            mag0 = mag1;
+            mag0 = mag.y;
         }
 
-        mag1 = dx.z * dx.z + dy.z * dy.z;
-        if (mag0 < mag1)
+        if (mag0 < mag.z)
         {
             best_dx = dx.z;
             best_dy = dy.z;
-            mag0 = mag1;
+            mag0 = mag.z;
         }
 
-        mag0 = sqrt(mag0);
-
         float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
         int hidx = (int)floor(ang);
         ang -= hidx;
@@ -710,7 +707,7 @@ __kernel void compute_gradients_8UC1_kernel(
             else
                 dy = a - b;
         }
-        float mag = sqrt(dx * dx + dy * dy);
+        float mag = hypot(dx, dy);
 
         float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
         int hidx = (int)floor(ang);
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index bd967f963d..5870f4bc71 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -139,8 +139,10 @@ extern "C" {
     #include <unistd.h>
     #include <stdio.h>
     #include <sys/types.h>
+#if defined __APPLE__
     #include <sys/sysctl.h>
 #endif
+#endif
 
 #ifndef MIN
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index af3f874e1f..2e7faa3341 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1053,12 +1053,11 @@ TEST(equalizeHist)
 
         cuda::GpuMat d_src(src);
         cuda::GpuMat d_dst;
-        cuda::GpuMat d_buf;
 
-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);
 
         CUDA_ON;
-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);
         CUDA_OFF;
     }
 }