diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index c5325e20f1..708578d8b5 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -113,7 +113,6 @@ endmacro()
 macro(ocv_add_module _name)
   ocv_debug_message("ocv_add_module(" ${_name} ${ARGN} ")")
   string(TOLOWER "${_name}" name)
-  string(REGEX REPLACE "^opencv_" "" ${name} "${name}")
   set(the_module opencv_${name})
 
   # the first pass - collect modules info, the second pass - create targets
@@ -787,7 +786,7 @@ macro(__ocv_parse_test_sources tests_type)
       set(__file_group_sources "")
     elseif(arg STREQUAL "DEPENDS_ON")
       set(__currentvar "OPENCV_${tests_type}_${the_module}_DEPS")
-    elseif("${__currentvar}" STREQUAL "__file_group_sources" AND NOT __file_group_name)
+    elseif(" ${__currentvar}" STREQUAL " __file_group_sources" AND NOT __file_group_name) # spaces to avoid CMP0054
       set(__file_group_name "${arg}")
     else()
       list(APPEND ${__currentvar} "${arg}")
@@ -808,7 +807,7 @@ function(ocv_add_perf_tests)
     __ocv_parse_test_sources(PERF ${ARGN})
 
     # opencv_imgcodecs is required for imread/imwrite
-    set(perf_deps ${the_module} opencv_ts opencv_imgcodecs ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
+    set(perf_deps opencv_ts ${the_module} opencv_imgcodecs ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
     ocv_check_dependencies(${perf_deps})
 
     if(OCV_DEPENDENCIES_FOUND)
@@ -829,7 +828,7 @@ function(ocv_add_perf_tests)
 
       ocv_add_executable(${the_target} ${OPENCV_PERF_${the_module}_SOURCES} ${${the_target}_pch})
       ocv_target_include_modules(${the_target} ${perf_deps} "${perf_path}")
-      ocv_target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${perf_deps} ${OPENCV_LINKER_LIBS})
+      ocv_target_link_libraries(${the_target} ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
       add_dependencies(opencv_perf_tests ${the_target})
 
       # Additional target properties
@@ -864,7 +863,7 @@ function(ocv_add_accuracy_tests)
     __ocv_parse_test_sources(TEST ${ARGN})
 
     # opencv_imgcodecs is required for imread/imwrite
-    set(test_deps ${the_module} opencv_ts opencv_imgcodecs opencv_videoio ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
+    set(test_deps opencv_ts ${the_module} opencv_imgcodecs opencv_videoio ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
     ocv_check_dependencies(${test_deps})
     if(OCV_DEPENDENCIES_FOUND)
       set(the_target "opencv_test_${name}")
@@ -884,7 +883,7 @@ function(ocv_add_accuracy_tests)
 
       ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
       ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
-      ocv_target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${test_deps} ${OPENCV_LINKER_LIBS})
+      ocv_target_link_libraries(${the_target} ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
       add_dependencies(opencv_tests ${the_target})
 
       # Additional target properties
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index b32465ead2..60d862efca 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -276,12 +276,12 @@ macro(OCV_OPTION variable description value)
     endif()
   endforeach()
   unset(__varname)
-  if("${__condition}" STREQUAL "")
+  if(__condition STREQUAL "")
     set(__condition 2 GREATER 1)
   endif()
 
   if(${__condition})
-    if("${__value}" MATCHES ";")
+    if(__value MATCHES ";")
       if(${__value})
         option(${variable} "${description}" ON)
       else()
diff --git a/doc/tutorials/highgui/raster-gdal/raster_io_gdal.markdown b/doc/tutorials/highgui/raster-gdal/raster_io_gdal.markdown
index a60c754551..feb2421170 100644
--- a/doc/tutorials/highgui/raster-gdal/raster_io_gdal.markdown
+++ b/doc/tutorials/highgui/raster-gdal/raster_io_gdal.markdown
@@ -3,7 +3,7 @@ Reading Geospatial Raster files with GDAL {#tutorial_raster_io_gdal}
 
 Geospatial raster data is a heavily used product in Geographic Information Systems and
 Photogrammetry. Raster data typically can represent imagery and Digital Elevation Models (DEM). The
-standard library for loading GIS imagery is the Geographic Data Abstraction Library (GDAL). In this
+standard library for loading GIS imagery is the Geographic Data Abstraction Library [(GDAL)](http://www.gdal.org). In this
 example, we will show techniques for loading GIS raster formats using native OpenCV functions. In
 addition, we will show some an example of how OpenCV can use this data for novel and interesting
 purposes.
@@ -13,8 +13,8 @@ Goals
 
 The primary objectives for this tutorial:
 
--   How to use OpenCV imread to load satellite imagery.
--   How to use OpenCV imread to load SRTM Digital Elevation Models
+-   How to use OpenCV [imread](@ref imread) to load satellite imagery.
+-   How to use OpenCV [imread](@ref imread) to load SRTM Digital Elevation Models.
 -   Given the corner coordinates of both the image and DEM, correllate the elevation data to the
     image to find elevations for each pixel.
 -   Show a basic, easy-to-implement example of a terrain heat map.
@@ -54,9 +54,9 @@ signed shorts.
 Notes
 -----
 
-### Lat/Lon (Geodetic) Coordinates should normally be avoided
+### Lat/Lon (Geographic) Coordinates should normally be avoided
 
-The Geodetic Coordinate System is a spherical coordinate system, meaning that using them with
+The Geographic Coordinate System is a spherical coordinate system, meaning that using its coordinates with
 Cartesian mathematics is technically incorrect. This demo uses them to increase the readability and
 is accurate enough to make the point. A better coordinate system would be Universal Transverse
 Mercator.
@@ -94,8 +94,8 @@ Below is the output of the program. Use the first image as the input. For the DE
 the SRTM file located at the USGS here.
 [<http://dds.cr.usgs.gov/srtm/version2_1/SRTM1/Region_04/N37W123.hgt.zip>](http://dds.cr.usgs.gov/srtm/version2_1/SRTM1/Region_04/N37W123.hgt.zip)
 
-![](images/gdal_output.jpg)
+![Input Image](images/gdal_output.jpg)
 
-![](images/gdal_heat-map.jpg)
+![Heat Map](images/gdal_heat-map.jpg)
 
-![](images/gdal_flood-zone.jpg)
+![Heat Map Overlay](images/gdal_flood-zone.jpg)
diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp
index 43a4cd1477..5a86624c50 100644
--- a/modules/calib3d/src/calibration.cpp
+++ b/modules/calib3d/src/calibration.cpp
@@ -2972,7 +2972,13 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
     for( i = 0; i < nimages; i++ )
     {
         ni = objectPoints.getMat(i).checkVector(3, CV_32F);
-        CV_Assert( ni >= 0 );
+        if( ni <= 0 )
+            CV_Error(CV_StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
+        int ni1 = imagePoints1.getMat(i).checkVector(2, CV_32F);
+        if( ni1 <= 0 )
+            CV_Error(CV_StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
+        CV_Assert( ni == ni1 );
+
         total += ni;
     }
 
@@ -2995,8 +3001,6 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
         Mat objpt = objectPoints.getMat(i);
         Mat imgpt1 = imagePoints1.getMat(i);
         ni = objpt.checkVector(3, CV_32F);
-        int ni1 = imgpt1.checkVector(2, CV_32F);
-        CV_Assert( ni > 0 && ni == ni1 );
         npoints.at<int>(i) = ni;
         memcpy( objPtData + j, objpt.ptr(), ni*sizeof(objPtData[0]) );
         memcpy( imgPtData1 + j, imgpt1.ptr(), ni*sizeof(imgPtData1[0]) );
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 73a39e77a1..cb39c15fb4 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -3284,7 +3284,8 @@ inline void UMat::release()
 {
     if( u && CV_XADD(&(u->urefcount), -1) == 1 )
         deallocate();
-    size.p[0] = 0;
+    for(int i = 0; i < dims; i++)
+        size.p[i] = 0;
     u = 0;
 }
 
diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index 9fff4ee281..d676ce8506 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -80,6 +80,16 @@
 namespace cv { namespace cuda {
     CV_EXPORTS cv::String getNppErrorMessage(int code);
     CV_EXPORTS cv::String getCudaDriverApiErrorMessage(int code);
+    // Returns _src as a GpuMat: the array itself if it already is one, otherwise a pooled buffer it is uploaded to.
+    CV_EXPORTS GpuMat getInputMat(InputArray _src, Stream& stream);
+    // Returns the GpuMat to write results to: _dst itself (after create()) if it is a GpuMat, otherwise a pooled buffer.
+    CV_EXPORTS GpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream);
+    static inline GpuMat getOutputMat(OutputArray _dst, Size size, int type, Stream& stream) // Size overload
+    {
+        return getOutputMat(_dst, size.height, size.width, type, stream); // rows = height, cols = width
+    }
+    // Downloads dst into _dst when _dst is not a GpuMat; does nothing otherwise.
+    CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream);
 }}
 
 #ifndef HAVE_CUDA
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 68c8979a8d..f881c785b3 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -2355,6 +2355,165 @@ struct Mul_SIMD<float, float>
     }
 };
 
+#elif CV_SSE2
+
+#if CV_SSE4_1
+// SSE4.1 kernel for dst = src1 * src2 * scale on ushort rows; operator() returns how many leading elements it wrote.
+template <>
+struct Mul_SIMD<ushort, float>
+{
+    Mul_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); // run-time check, done once per functor
+    }
+
+    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x; // no SSE4.1 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the high half to zero-extend 16 -> 32 bit
+
+        if( scale != 1.0f ) // only the scaled case is vectorised; scale == 1 falls through and returns 0
+        {
+            __m128 v_scale = _mm_set1_ps(scale);
+            for ( ; x <= width - 8; x += 8) // 8 ushort values per iteration
+            {
+                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
+                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
+
+                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
+                                           _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
+                v_dst1 = _mm_mul_ps(v_dst1, v_scale); // elements 0..3
+
+                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
+                                           _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
+                v_dst2 = _mm_mul_ps(v_dst2, v_scale); // elements 4..7
+
+                __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); // unsigned-saturating pack (SSE4.1)
+                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
+            }
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE;
+};
+
+#endif
+// SSE2 kernel for dst = src1 * src2 * scale on schar rows; operator() returns how many leading elements it wrote.
+template <>
+struct Mul_SIMD<schar, float>
+{
+    Mul_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2); // run-time check, done once per functor
+    }
+
+    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x; // no SSE2 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the low half; the arithmetic shifts then sign-extend
+
+        if( scale == 1.0f ) // same pipeline as the else branch, minus the scale multiply
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); // 8 schar values
+                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
+
+                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); // sign-extend 8 -> 16 bit
+                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
+
+                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
+
+                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
+
+                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); // signed-saturating pack to 16 bit
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); // saturate to 8 bit, store the low 8 bytes
+            }
+        else
+        {
+            __m128 v_scale = _mm_set1_ps(scale);
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); // 8 schar values
+                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
+
+                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); // sign-extend 8 -> 16 bit
+                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
+
+                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
+                v_dst1 = _mm_mul_ps(v_dst1, v_scale); // elements 0..3
+
+                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
+                v_dst2 = _mm_mul_ps(v_dst2, v_scale); // elements 4..7
+
+                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); // signed-saturating pack to 16 bit
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); // saturate to 8 bit, store the low 8 bytes
+            }
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE;
+};
+// SSE2 kernel for dst = src1 * src2 * scale on short rows; operator() returns how many leading elements it wrote.
+template <>
+struct Mul_SIMD<short, float>
+{
+    Mul_SIMD()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2); // run-time check, done once per functor
+    }
+
+    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x; // no SSE2 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the low half; the arithmetic shift then sign-extends
+
+        if( scale != 1.0f ) // only the scaled case is vectorised; scale == 1 falls through and returns 0
+        {
+            __m128 v_scale = _mm_set1_ps(scale);
+            for ( ; x <= width - 8; x += 8) // 8 short values per iteration
+            {
+                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
+                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
+
+                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
+                v_dst1 = _mm_mul_ps(v_dst1, v_scale); // elements 0..3
+
+                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
+                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
+                v_dst2 = _mm_mul_ps(v_dst2, v_scale); // elements 4..7
+
+                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); // signed-saturating pack to 16 bit
+                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
+            }
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE;
+};
+
 #endif
 
 template<typename T, typename WT> static void
@@ -2772,7 +2931,144 @@ struct AddWeighted_SIMD
     }
 };
 
-#if CV_NEON
+#if CV_SSE2
+// SSE2 kernel for dst = src1 * alpha + gamma + src2 * beta on schar rows; operator() returns how many leading elements it wrote.
+template <>
+struct AddWeighted_SIMD<schar, float>
+{
+    AddWeighted_SIMD()
+    {
+        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); // run-time check, done once per functor
+    }
+
+    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
+    {
+        int x = 0;
+
+        if (!haveSSE2)
+            return x; // no SSE2 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the low half; the arithmetic shifts then sign-extend
+        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
+               v_gamma = _mm_set1_ps(gamma);
+
+        for( ; x <= width - 8; x += 8 ) // 8 schar values per iteration
+        {
+            __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
+
+            __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); // sign-extend 8 -> 16 bit
+            __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
+
+            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
+            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
+
+            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
+            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
+
+            __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
+                                              _mm_cvtps_epi32(v_dstf1)); // signed-saturating pack to 16 bit
+
+            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); // saturate to 8 bit, store the low 8 bytes
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE2;
+};
+// SSE2 kernel for dst = src1 * alpha + gamma + src2 * beta on short rows; operator() returns how many leading elements it wrote.
+template <>
+struct AddWeighted_SIMD<short, float>
+{
+    AddWeighted_SIMD()
+    {
+        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); // run-time check, done once per functor
+    }
+
+    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
+    {
+        int x = 0;
+
+        if (!haveSSE2)
+            return x; // no SSE2 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the low half; the arithmetic shift then sign-extends
+        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
+               v_gamma = _mm_set1_ps(gamma);
+
+        for( ; x <= width - 8; x += 8 ) // 8 short values per iteration
+        {
+            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
+            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
+
+            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
+            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
+                                                                   _mm_cvtps_epi32(v_dstf1))); // signed-saturating pack to 16 bit
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE2;
+};
+
+#if CV_SSE4_1
+// SSE4.1 kernel for dst = src1 * alpha + gamma + src2 * beta on ushort rows; operator() returns how many leading elements it wrote.
+template <>
+struct AddWeighted_SIMD<ushort, float>
+{
+    AddWeighted_SIMD()
+    {
+        haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); // run-time check, done once per functor
+    }
+
+    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
+    {
+        int x = 0;
+
+        if (!haveSSE4_1)
+            return x; // no SSE4.1 at run time: report 0 elements done
+
+        __m128i v_zero = _mm_setzero_si128(); // interleaved as the high half to zero-extend 16 -> 32 bit
+        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
+               v_gamma = _mm_set1_ps(gamma);
+
+        for( ; x <= width - 8; x += 8 ) // 8 ushort values per iteration
+        {
+            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
+            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
+
+            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
+            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
+
+            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
+            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
+                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
+
+            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
+                                                                    _mm_cvtps_epi32(v_dstf1))); // unsigned-saturating pack (SSE4.1)
+        }
+
+        return x; // number of leading elements written to dst
+    }
+
+    bool haveSSE4_1;
+};
+
+#endif
+
+#elif CV_NEON
 
 template <>
 struct AddWeighted_SIMD<schar, float>
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
index 71b1b52198..f21c5f4c19 100644
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -390,6 +390,11 @@ GpuMat& cv::cuda::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
 
     GpuMat mask = _mask.getGpuMat();
 
+    if (mask.empty())
+    {
+        return setTo(value, stream);
+    }
+
     CV_DbgAssert( size() == mask.size() && mask.type() == CV_8UC1 );
 
     typedef void (*func_t)(const GpuMat& mat, const GpuMat& mask, Scalar scalar, Stream& stream);
diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp
index 4440d58536..9a17ddd85d 100644
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@@ -342,6 +342,75 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
     }
 }
 
+GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream)
+{
+    GpuMat src;
+    // Gives callers a GpuMat for _src; src stays empty when _src is an empty non-GpuMat array.
+#ifndef HAVE_CUDA
+    (void) _src;
+    (void) stream;
+    throw_no_cuda();
+#else
+    if (_src.kind() == _InputArray::CUDA_GPU_MAT)
+    {
+        src = _src.getGpuMat(); // already a GpuMat: use it directly
+    }
+    else if (!_src.empty())
+    {
+        BufferPool pool(stream); // buffer is taken from the pool constructed for this stream
+        src = pool.getBuffer(_src.size(), _src.type());
+        src.upload(_src, stream); // upload is issued on the caller's stream
+    }
+#endif
+
+    return src;
+}
+
+GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream)
+{
+    GpuMat dst;
+    // Picks the GpuMat that results are written to; non-GpuMat outputs get a pooled buffer instead.
+#ifndef HAVE_CUDA
+    (void) _dst;
+    (void) rows;
+    (void) cols;
+    (void) type;
+    (void) stream;
+    throw_no_cuda();
+#else
+    if (_dst.kind() == _InputArray::CUDA_GPU_MAT)
+    {
+        _dst.create(rows, cols, type);
+        dst = _dst.getGpuMat(); // write straight into the caller's GpuMat
+    }
+    else
+    {
+        BufferPool pool(stream);
+        dst = pool.getBuffer(rows, cols, type); // _dst itself is not touched here; syncOutput() copies into it
+    }
+#endif
+
+    return dst;
+}
+
+void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream)
+{
+#ifndef HAVE_CUDA
+    (void) dst;
+    (void) _dst;
+    (void) stream;
+    throw_no_cuda();
+#else
+    if (_dst.kind() != _InputArray::CUDA_GPU_MAT) // copy back only for non-GpuMat outputs; a GpuMat _dst is left as is
+    {
+        if (stream) // NOTE(review): presumably false for the default (null) stream - confirm
+            dst.download(_dst, stream); // stream-taking overload
+        else
+            dst.download(_dst); // overload without a stream
+    }
+#endif
+}
+
 #ifndef HAVE_CUDA
 
 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 1f9d686723..daf13a2dda 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -48,6 +48,13 @@
 # endif
 #endif
 
+#if defined ANDROID || defined __linux__
+#  include <unistd.h>
+#  include <fcntl.h>
+#  include <elf.h>
+#  include <linux/auxvec.h>
+#endif
+
 #if defined WIN32 || defined _WIN32 || defined WINCE
 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@@ -251,6 +258,29 @@ struct HWFeatures
             f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
         }
 
+    #if (defined ANDROID || defined __linux__) && defined __arm__
+        // 32-bit ARM only: the Elf32_auxv_t layout and HWCAP bit 12 (NEON) do not apply to other Linux targets.
+        int cpufile = open("/proc/self/auxv", O_RDONLY);
+        if (cpufile >= 0)
+        {
+            Elf32_auxv_t auxv;
+            const ssize_t size_auxv_t = (ssize_t)sizeof(Elf32_auxv_t); // signed, to match read()'s return type
+
+            while (read(cpufile, &auxv, sizeof(Elf32_auxv_t)) == size_auxv_t)
+            {
+                if (auxv.a_type == AT_HWCAP)
+                {
+                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; // 4096 == 1 << 12, HWCAP_NEON on ARM
+                    break;
+                }
+            }
+
+            close(cpufile);
+        }
+    #elif defined __aarch64__ || ((defined __clang__ || defined __APPLE__) && defined __ARM_NEON__)
+        f.have[CV_CPU_NEON] = true; // AArch64 is treated as always having NEON; otherwise trust the compiler macro
+    #endif
+
         return f;
     }
 
diff --git a/modules/cuda/CMakeLists.txt b/modules/cuda/CMakeLists.txt
index 389e90b47e..d668ea8b01 100644
--- a/modules/cuda/CMakeLists.txt
+++ b/modules/cuda/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Computer Vision")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
 
-ocv_define_module(cuda opencv_calib3d opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
+ocv_define_module(cuda opencv_calib3d opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
diff --git a/modules/cuda/include/opencv2/cuda.hpp b/modules/cuda/include/opencv2/cuda.hpp
index 93bb511cd0..c6004296bd 100644
--- a/modules/cuda/include/opencv2/cuda.hpp
+++ b/modules/cuda/include/opencv2/cuda.hpp
@@ -53,274 +53,11 @@
     @addtogroup cuda
     @{
         @defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
-        @defgroup cuda_objdetect Object Detection
     @}
  */
 
 namespace cv { namespace cuda {
 
-//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
-
-//! @addtogroup cuda_objdetect
-//! @{
-
-struct CV_EXPORTS HOGConfidence
-{
-   double scale;
-   std::vector<Point> locations;
-   std::vector<double> confidences;
-   std::vector<double> part_scores[4];
-};
-
-/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
-
-Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
-as possible.
-
-@note
-   -   An example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/cpp/peopledetect.cpp
-    -   A CUDA example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/gpu/hog.cpp
-    -   (Python) An example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/python2/peopledetect.py
- */
-struct CV_EXPORTS HOGDescriptor
-{
-    enum { DEFAULT_WIN_SIGMA = -1 };
-    enum { DEFAULT_NLEVELS = 64 };
-    enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-    /** @brief Creates the HOG descriptor and detector.
-
-    @param win_size Detection window size. Align to block size and block stride.
-    @param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
-    @param block_stride Block stride. It must be a multiple of cell size.
-    @param cell_size Cell size. Only (8, 8) is supported for now.
-    @param nbins Number of bins. Only 9 bins per cell are supported for now.
-    @param win_sigma Gaussian smoothing window parameter.
-    @param threshold_L2hys L2-Hys normalization method shrinkage.
-    @param gamma_correction Flag to specify whether the gamma correction preprocessing is required or
-    not.
-    @param nlevels Maximum number of detection window increases.
-     */
-    HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
-                  Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
-                  int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
-                  double threshold_L2hys=0.2, bool gamma_correction=true,
-                  int nlevels=DEFAULT_NLEVELS);
-
-    /** @brief Returns the number of coefficients required for the classification.
-     */
-    size_t getDescriptorSize() const;
-    /** @brief Returns the block histogram size.
-    */
-    size_t getBlockHistogramSize() const;
-
-    /** @brief Sets coefficients for the linear SVM classifier.
-    */
-    void setSVMDetector(const std::vector<float>& detector);
-
-    /** @brief Returns coefficients of the classifier trained for people detection (for default window size).
-    */
-    static std::vector<float> getDefaultPeopleDetector();
-    /** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
-    */
-    static std::vector<float> getPeopleDetector48x96();
-    /** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
-    */
-    static std::vector<float> getPeopleDetector64x128();
-
-    /** @brief Performs object detection without a multi-scale window.
-
-    @param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
-    @param found_locations Left-top corner points of detected objects boundaries.
-    @param hit_threshold Threshold for the distance between features and SVM classifying plane.
-    Usually it is 0 and should be specfied in the detector coefficients (as the last free
-    coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
-    manually here.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-     */
-    void detect(const GpuMat& img, std::vector<Point>& found_locations,
-                double hit_threshold=0, Size win_stride=Size(),
-                Size padding=Size());
-
-    /** @brief Performs object detection with a multi-scale window.
-
-    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
-    @param found_locations Detected objects boundaries.
-    @param hit_threshold Threshold for the distance between features and SVM classifying plane. See
-    cuda::HOGDescriptor::detect for details.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-    @param scale0 Coefficient of the detection window increase.
-    @param group_threshold Coefficient to regulate the similarity threshold. When detected, some
-    objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles .
-     */
-    void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
-                          double hit_threshold=0, Size win_stride=Size(),
-                          Size padding=Size(), double scale0=1.05,
-                          int group_threshold=2);
-
-    void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
-                                                Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
-
-    void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
-                                                                    double hit_threshold, Size win_stride, Size padding,
-                                                                    std::vector<HOGConfidence> &conf_out, int group_threshold);
-
-    /** @brief Returns block descriptors computed for the whole image.
-
-    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param descriptors 2D array of descriptors.
-    @param descr_format Descriptor storage format:
-    -   **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
-    -   **DESCR_FORMAT_COL_BY_COL** - Column-major order.
-
-    The function is mainly used to learn the classifier.
-     */
-    void getDescriptors(const GpuMat& img, Size win_stride,
-                        GpuMat& descriptors,
-                        int descr_format=DESCR_FORMAT_COL_BY_COL);
-
-    Size win_size;
-    Size block_size;
-    Size block_stride;
-    Size cell_size;
-    int nbins;
-    double win_sigma;
-    double threshold_L2hys;
-    bool gamma_correction;
-    int nlevels;
-
-protected:
-    void computeBlockHistograms(const GpuMat& img);
-    void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
-
-    double getWinSigma() const;
-    bool checkDetectorSize() const;
-
-    static int numPartsWithin(int size, int part_size, int stride);
-    static Size numPartsWithin(Size size, Size part_size, Size stride);
-
-    // Coefficients of the separating plane
-    float free_coef;
-    GpuMat detector;
-
-    // Results of the last classification step
-    GpuMat labels, labels_buf;
-    Mat labels_host;
-
-    // Results of the last histogram evaluation step
-    GpuMat block_hists, block_hists_buf;
-
-    // Gradients conputation results
-    GpuMat grad, qangle, grad_buf, qangle_buf;
-
-    // returns subbuffer with required size, reallocates buffer if nessesary.
-    static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
-    static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
-
-    std::vector<GpuMat> image_scales;
-};
-
-//////////////////////////// CascadeClassifier ////////////////////////////
-
-/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades. :
-
-@note
-   -   A cascade classifier example can be found at
-        opencv_source_code/samples/gpu/cascadeclassifier.cpp
-    -   A Nvidea API specific cascade classifier example can be found at
-        opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
- */
-class CV_EXPORTS CascadeClassifier_CUDA
-{
-public:
-    CascadeClassifier_CUDA();
-    /** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
-
-    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
-    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
-    type of OpenCV XML cascade supported for LBP.
-     */
-    CascadeClassifier_CUDA(const String& filename);
-    ~CascadeClassifier_CUDA();
-
-    /** @brief Checks whether the classifier is loaded or not.
-    */
-    bool empty() const;
-    /** @brief Loads the classifier from a file. The previous content is destroyed.
-
-    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
-    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
-    type of OpenCV XML cascade supported for LBP.
-     */
-    bool load(const String& filename);
-    /** @brief Destroys the loaded classifier.
-    */
-    void release();
-
-    /** @overload */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
-    /** @brief Detects objects of different sizes in the input image.
-
-    @param image Matrix of type CV_8U containing an image where objects should be detected.
-    @param objectsBuf Buffer to store detected objects (rectangles). If it is empty, it is allocated
-    with the default size. If not empty, the function searches not more than N objects, where
-    N = sizeof(objectsBufer's data)/sizeof(cv::Rect).
-    @param maxObjectSize Maximum possible object size. Objects larger than that are ignored. Used for
-    second signature and supported only for LBP cascades.
-    @param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
-    @param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
-    to retain it.
-    @param minSize Minimum possible object size. Objects smaller than that are ignored.
-
-    The detected objects are returned as a list of rectangles.
-
-    The function returns the number of detected objects, so you can retrieve them as in the following
-    example:
-    @code
-        cuda::CascadeClassifier_CUDA cascade_gpu(...);
-
-        Mat image_cpu = imread(...)
-        GpuMat image_gpu(image_cpu);
-
-        GpuMat objbuf;
-        int detections_number = cascade_gpu.detectMultiScale( image_gpu,
-                  objbuf, 1.2, minNeighbors);
-
-        Mat obj_host;
-        // download only detected number of rectangles
-        objbuf.colRange(0, detections_number).download(obj_host);
-
-        Rect* faces = obj_host.ptr<Rect>();
-        for(int i = 0; i < detections_num; ++i)
-           cv::rectangle(image_cpu, faces[i], Scalar(255));
-
-        imshow("Faces", image_cpu);
-    @endcode
-    @sa CascadeClassifier::detectMultiScale
-     */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
-
-    bool findLargestObject;
-    bool visualizeInPlace;
-
-    Size getClassifierSize() const;
-
-private:
-    struct CascadeClassifierImpl;
-    CascadeClassifierImpl* impl;
-    struct HaarCascade;
-    struct LbpCascade;
-    friend class CascadeClassifier_CUDA_LBP;
-};
-
-//! @} cuda_objdetect
-
 //////////////////////////// Labeling ////////////////////////////
 
 //! @addtogroup cuda
diff --git a/modules/cuda/perf/perf_precomp.hpp b/modules/cuda/perf/perf_precomp.hpp
index fbf7d3a191..f810968cb7 100644
--- a/modules/cuda/perf/perf_precomp.hpp
+++ b/modules/cuda/perf/perf_precomp.hpp
@@ -56,7 +56,6 @@
 
 #include "opencv2/cuda.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"
 
 #ifdef GTEST_CREATE_SHARED_LIBRARY
 #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
diff --git a/modules/cuda/src/hog.cpp b/modules/cuda/src/hog.cpp
deleted file mode 100644
index f71bf2b5ad..0000000000
--- a/modules/cuda/src/hog.cpp
+++ /dev/null
@@ -1,1619 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-cv::cuda::HOGDescriptor::HOGDescriptor(Size, Size, Size, Size, int, double, double, bool, int) { throw_no_cuda(); }
-size_t cv::cuda::HOGDescriptor::getDescriptorSize() const { throw_no_cuda(); return 0; }
-size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const { throw_no_cuda(); return 0; }
-double cv::cuda::HOGDescriptor::getWinSigma() const { throw_no_cuda(); return 0; }
-bool cv::cuda::HOGDescriptor::checkDetectorSize() const { throw_no_cuda(); return false; }
-void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>&) { throw_no_cuda(); }
-void cv::cuda::HOGDescriptor::detect(const GpuMat&, std::vector<Point>&, double, Size, Size) { throw_no_cuda(); }
-void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, double, int) { throw_no_cuda(); }
-void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_no_cuda(); }
-void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_no_cuda(); }
-std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector() { throw_no_cuda(); return std::vector<float>(); }
-std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96() { throw_no_cuda(); return std::vector<float>(); }
-std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128() { throw_no_cuda(); return std::vector<float>(); }
-void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat&, std::vector<Point>&, double, Size, Size, std::vector<Point>&, std::vector<double>&) { throw_no_cuda(); }
-void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, std::vector<HOGConfidence>&, int) { throw_no_cuda(); }
-
-#else
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace hog
-    {
-        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                              int nblocks_win_x, int nblocks_win_y);
-
-        void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                           int height, int width, const cv::cuda::PtrStepSzf& grad,
-                           const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists);
-
-        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                             int height, int width, float* block_hists, float threshold);
-
-        void classify_hists(int win_height, int win_width, int block_stride_y,
-                            int block_stride_x, int win_stride_y, int win_stride_x, int height,
-                            int width, float* block_hists, float* coefs, float free_coef,
-                            float threshold, unsigned char* labels);
-
-        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                           float* coefs, float free_coef, float threshold, float *confidences);
-
-        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::cuda::PtrStepSzf descriptors);
-        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
-                                    cv::cuda::PtrStepSzf descriptors);
-
-        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
-        void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
-                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
-
-        void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
-        void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
-    }
-}}}
-
-using namespace ::cv::cuda::device;
-
-cv::cuda::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
-                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
-        : win_size(win_size_),
-          block_size(block_size_),
-          block_stride(block_stride_),
-          cell_size(cell_size_),
-          nbins(nbins_),
-          win_sigma(win_sigma_),
-          threshold_L2hys(threshold_L2hys_),
-          gamma_correction(gamma_correction_),
-          nlevels(nlevels_)
-{
-    CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
-              (win_size.height - block_size.height) % block_stride.height == 0);
-
-    CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
-
-    CV_Assert(block_stride == cell_size);
-
-    CV_Assert(cell_size == Size(8, 8));
-
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
-    CV_Assert(cells_per_block == Size(2, 2));
-
-    cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
-}
-
-size_t cv::cuda::HOGDescriptor::getDescriptorSize() const
-{
-    return numPartsWithin(win_size, block_size, block_stride).area() * getBlockHistogramSize();
-}
-
-size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const
-{
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
-    return (size_t)(nbins * cells_per_block.area());
-}
-
-double cv::cuda::HOGDescriptor::getWinSigma() const
-{
-    return win_sigma >= 0 ? win_sigma : (block_size.width + block_size.height) / 8.0;
-}
-
-bool cv::cuda::HOGDescriptor::checkDetectorSize() const
-{
-    size_t detector_size = detector.rows * detector.cols;
-    size_t descriptor_size = getDescriptorSize();
-    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
-}
-
-void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
-{
-    std::vector<float> detector_reordered(_detector.size());
-
-    size_t block_hist_size = getBlockHistogramSize();
-    cv::Size blocks_per_img = numPartsWithin(win_size, block_size, block_stride);
-
-    for (int i = 0; i < blocks_per_img.height; ++i)
-        for (int j = 0; j < blocks_per_img.width; ++j)
-        {
-            const float* src = &_detector[0] + (j * blocks_per_img.height + i) * block_hist_size;
-            float* dst = &detector_reordered[0] + (i * blocks_per_img.width + j) * block_hist_size;
-            for (size_t k = 0; k < block_hist_size; ++k)
-                dst[k] = src[k];
-        }
-
-    this->detector.upload(Mat(detector_reordered).reshape(1, 1));
-
-    size_t descriptor_size = getDescriptorSize();
-    free_coef = _detector.size() > descriptor_size ? _detector[descriptor_size] : 0;
-
-    CV_Assert(checkDetectorSize());
-}
-
-cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(const Size& sz, int type, GpuMat& buf)
-{
-    if (buf.empty() || buf.type() != type)
-        buf.create(sz, type);
-    else
-        if (buf.cols < sz.width || buf.rows < sz.height)
-            buf.create(std::max(buf.rows, sz.height), std::max(buf.cols, sz.width), type);
-
-    return buf(Rect(Point(0,0), sz));
-}
-
-cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(int rows, int cols, int type, GpuMat& buf)
-{
-    return getBuffer(Size(cols, rows), type, buf);
-}
-
-
-void cv::cuda::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, GpuMat& _qangle)
-{
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-
-    //   grad.create(img.size(), CV_32FC2);
-    _grad = getBuffer(img.size(), CV_32FC2, grad_buf);
-
-    //   qangle.create(img.size(), CV_8UC2);
-    _qangle = getBuffer(img.size(), CV_8UC2, qangle_buf);
-
-    float angleScale = (float)(nbins / CV_PI);
-    switch (img.type())
-    {
-        case CV_8UC1:
-            hog::compute_gradients_8UC1(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
-            break;
-        case CV_8UC4:
-            hog::compute_gradients_8UC4(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
-            break;
-    }
-}
-
-
-void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
-{
-    computeGradient(img, grad, qangle);
-
-    size_t block_hist_size = getBlockHistogramSize();
-    Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-
-    //   block_hists.create(1, block_hist_size * blocks_per_img.area(), CV_32F);
-    block_hists = getBuffer(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F, block_hists_buf);
-
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
-                        grad, qangle, (float)getWinSigma(), block_hists.ptr<float>());
-
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
-                         block_hists.ptr<float>(), (float)threshold_L2hys);
-}
-
-
-void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
-{
-    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
-
-    computeBlockHistograms(img);
-
-    const size_t block_hist_size = getBlockHistogramSize();
-    Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    Size wins_per_img   = numPartsWithin(img.size(), win_size, win_stride);
-
-    descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
-
-    switch (descr_format)
-    {
-    case DESCR_FORMAT_ROW_BY_ROW:
-        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
-        break;
-    case DESCR_FORMAT_COL_BY_COL:
-        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
-        break;
-    default:
-        CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
-    }
-}
-
-void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
-                          Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences)
-{
-  CV_Assert(padding == Size(0, 0));
-
-  hits.clear();
-  if (detector.empty())
-    return;
-
-  computeBlockHistograms(img);
-
-  if (win_stride == Size())
-    win_stride = block_stride;
-  else
-    CV_Assert(win_stride.width % block_stride.width == 0 &&
-         win_stride.height % block_stride.height == 0);
-
-  Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
-  labels.create(1, wins_per_img.area(), CV_32F);
-
-  hog::compute_confidence_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-               win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
-               detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr<float>());
-
-  labels.download(labels_host);
-  float* vec = labels_host.ptr<float>();
-
-  // does not support roi for now..
-  locations.clear();
-  confidences.clear();
-  for (int i = 0; i < wins_per_img.area(); i++)
-    {
-      int y = i / wins_per_img.width;
-      int x = i - wins_per_img.width * y;
-      if (vec[i] >= hit_threshold)
-   hits.push_back(Point(x * win_stride.width, y * win_stride.height));
-
-      Point pt(win_stride.width * x, win_stride.height * y);
-      locations.push_back(pt);
-      confidences.push_back((double)vec[i]);
-    }
-}
-
-void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
-                            double hit_threshold, Size win_stride, Size padding,
-                            std::vector<HOGConfidence> &conf_out, int group_threshold)
-{
-    std::vector<double> level_scale;
-    double scale = 1.;
-    int levels = 0;
-
-    for (levels = 0; levels < (int)conf_out.size(); levels++)
-    {
-        scale = conf_out[levels].scale;
-        level_scale.push_back(scale);
-        if (cvRound(img.cols/scale) < win_size.width || cvRound(img.rows/scale) < win_size.height)
-            break;
-    }
-
-    levels = std::max(levels, 1);
-    level_scale.resize(levels);
-
-    std::vector<Rect> all_candidates;
-    std::vector<Point> locations;
-
-    for (size_t i = 0; i < level_scale.size(); i++)
-    {
-        scale = level_scale[i];
-        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
-        GpuMat smaller_img;
-
-        if (sz == img.size())
-            smaller_img = img;
-        else
-        {
-            smaller_img.create(sz, img.type());
-            switch (img.type())
-            {
-            case CV_8UC1: hog::resize_8UC1(img, smaller_img); break;
-            case CV_8UC4: hog::resize_8UC4(img, smaller_img); break;
-            }
-        }
-
-        computeConfidence(smaller_img, locations, hit_threshold, win_stride, padding, conf_out[i].locations, conf_out[i].confidences);
-
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
-        for (size_t j = 0; j < locations.size(); j++)
-            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
-    }
-
-    found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
-}
-
-
-void cv::cuda::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
-{
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-    CV_Assert(padding == Size(0, 0));
-
-    hits.clear();
-    if (detector.empty())
-        return;
-
-    computeBlockHistograms(img);
-
-    if (win_stride == Size())
-        win_stride = block_stride;
-    else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
-
-    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
-    //   labels.create(1, wins_per_img.area(), CV_8U);
-    labels = getBuffer(1, wins_per_img.area(), CV_8U, labels_buf);
-
-    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                        win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
-                        detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr());
-
-    labels.download(labels_host);
-    unsigned char* vec = labels_host.ptr();
-    for (int i = 0; i < wins_per_img.area(); i++)
-    {
-        int y = i / wins_per_img.width;
-        int x = i - wins_per_img.width * y;
-        if (vec[i])
-            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
-    }
-}
-
-
-
-void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations, double hit_threshold,
-                                              Size win_stride, Size padding, double scale0, int group_threshold)
-{
-
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-
-    std::vector<double> level_scale;
-    double scale = 1.;
-    int levels = 0;
-
-    for (levels = 0; levels < nlevels; levels++)
-    {
-        level_scale.push_back(scale);
-        if (cvRound(img.cols/scale) < win_size.width ||
-            cvRound(img.rows/scale) < win_size.height || scale0 <= 1)
-            break;
-        scale *= scale0;
-    }
-    levels = std::max(levels, 1);
-    level_scale.resize(levels);
-    image_scales.resize(levels);
-
-    std::vector<Rect> all_candidates;
-    std::vector<Point> locations;
-
-    for (size_t i = 0; i < level_scale.size(); i++)
-    {
-        scale = level_scale[i];
-        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
-        GpuMat smaller_img;
-
-        if (sz == img.size())
-            smaller_img = img;
-        else
-        {
-            image_scales[i].create(sz, img.type());
-            switch (img.type())
-            {
-                case CV_8UC1: hog::resize_8UC1(img, image_scales[i]); break;
-                case CV_8UC4: hog::resize_8UC4(img, image_scales[i]); break;
-            }
-            smaller_img = image_scales[i];
-        }
-
-        detect(smaller_img, locations, hit_threshold, win_stride, padding);
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
-        for (size_t j = 0; j < locations.size(); j++)
-            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
-    }
-
-    found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
-}
-
-int cv::cuda::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
-{
-    return (size - part_size + stride) / stride;
-}
-
-cv::Size cv::cuda::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
-{
-    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
-}
-
-std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector()
-{
-    return getPeopleDetector64x128();
-}
-
-std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
-{
-    static const float detector[] = {
-        0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
-        0.145939f, 0.061520f, 0.328699f, 0.227148f, -0.066467f, -0.086723f,
-        0.047559f, 0.106714f, 0.037897f, 0.111461f, -0.024406f, 0.304769f,
-        0.254676f, -0.069235f, 0.082566f, 0.147260f, 0.326969f, 0.148888f,
-        0.055270f, -0.087985f, 0.261720f, 0.143442f, 0.026812f, 0.238212f,
-        0.194020f, 0.056341f, -0.025854f, -0.034444f, -0.156631f, 0.205174f,
-        0.089008f, -0.139811f, -0.100147f, -0.037830f, -0.029230f, -0.055641f,
-        0.033248f, -0.016512f, 0.155244f, 0.247315f, -0.124694f, -0.048414f,
-        -0.062219f, 0.193683f, 0.004574f, 0.055089f, 0.093565f, 0.167712f,
-        0.167581f, 0.018895f, 0.215258f, 0.122609f, 0.090520f, -0.067219f,
-        -0.049029f, -0.099615f, 0.241804f, -0.094893f, -0.176248f, 0.001727f,
-        -0.134473f, 0.104442f, 0.050942f, 0.081165f, 0.072156f, 0.121646f,
-        0.002656f, -0.297974f, -0.133587f, -0.060121f, -0.092515f, -0.048974f,
-        -0.084754f, -0.180111f, -0.038590f, 0.086283f, -0.134636f, -0.107249f,
-        0.132890f, 0.141556f, 0.249425f, 0.130273f, -0.030031f, 0.073212f,
-        -0.008155f, 0.019931f, 0.071688f, 0.000300f, -0.019525f, -0.021725f,
-        -0.040993f, -0.086841f, 0.070124f, 0.240033f, 0.265350f, 0.043208f,
-        0.166754f, 0.091453f, 0.060916f, -0.036972f, -0.091043f, 0.079873f,
-        0.219781f, 0.158102f, -0.140618f, -0.043016f, 0.124802f, 0.093668f,
-        0.103208f, 0.094872f, 0.080541f, 0.137711f, 0.160566f, -0.169231f,
-        0.013983f, 0.309508f, -0.004217f, -0.057200f, -0.064489f, 0.014066f,
-        0.361009f, 0.251328f, -0.080983f, -0.044183f, 0.061436f, -0.037381f,
-        -0.078786f, 0.030993f, 0.066314f, 0.037683f, 0.152325f, -0.091683f,
-        0.070203f, 0.217856f, 0.036435f, -0.076462f, 0.006254f, -0.094431f,
-        0.154829f, -0.023038f, -0.196961f, -0.024594f, 0.178465f, -0.050139f,
-        -0.045932f, -0.000965f, 0.109112f, 0.046165f, -0.159373f, -0.008713f,
-        0.041307f, 0.097129f, -0.057211f, -0.064599f, 0.077165f, 0.176167f,
-        0.138322f, 0.065753f, -0.104950f, 0.017933f, 0.136255f, -0.011598f,
-        0.047007f, 0.080550f, 0.068619f, 0.084661f, -0.035493f, -0.091314f,
-        -0.041411f, 0.060971f, -0.101912f, -0.079870f, -0.085977f, -0.022686f,
-        0.079788f, -0.098064f, -0.054603f, 0.040383f, 0.300794f, 0.128603f,
-        0.094844f, 0.047407f, 0.101825f, 0.061832f, -0.162160f, -0.204553f,
-        -0.035165f, 0.101450f, -0.016641f, -0.027140f, -0.134392f, -0.008743f,
-        0.102331f, 0.114853f, 0.009644f, 0.062823f, 0.237339f, 0.167843f,
-        0.053066f, -0.012592f, 0.043158f, 0.002305f, 0.065001f, -0.038929f,
-        -0.020356f, 0.152343f, 0.043469f, -0.029967f, -0.042948f, 0.032481f,
-        0.068488f, -0.110840f, -0.111083f, 0.111980f, -0.002072f, -0.005562f,
-        0.082926f, 0.006635f, -0.108153f, 0.024242f, -0.086464f, -0.189884f,
-        -0.017492f, 0.191456f, -0.007683f, -0.128769f, -0.038017f, -0.132380f,
-        0.091926f, 0.079696f, -0.106728f, -0.007656f, 0.172744f, 0.011576f,
-        0.009883f, 0.083258f, -0.026516f, 0.145534f, 0.153924f, -0.130290f,
-        -0.108945f, 0.124490f, -0.003186f, -0.100485f, 0.015024f, -0.060512f,
-        0.026288f, -0.086713f, -0.169012f, 0.076517f, 0.215778f, 0.043701f,
-        -0.131642f, -0.012585f, -0.045181f, -0.118183f, -0.241544f, -0.167293f,
-        -0.020107f, -0.019917f, -0.101827f, -0.107096f, -0.010503f, 0.044938f,
-        0.189680f, 0.217119f, -0.046086f, 0.044508f, 0.199716f, -0.036004f,
-        -0.148927f, 0.013355f, -0.078279f, 0.030451f, 0.056301f, -0.024609f,
-        0.083224f, 0.099533f, -0.039432f, -0.138880f, 0.005482f, -0.024120f,
-        -0.140468f, -0.066381f, -0.017057f, 0.009260f, -0.058004f, -0.028486f,
-        -0.061610f, 0.007483f, -0.158309f, -0.150687f, -0.044595f, -0.105121f,
-        -0.045763f, -0.006618f, -0.024419f, -0.117713f, -0.119366f, -0.175941f,
-        -0.071542f, 0.119027f, 0.111362f, 0.043080f, 0.034889f, 0.093003f,
-        0.007842f, 0.057368f, -0.108834f, -0.079968f, 0.230959f, 0.020205f,
-        0.011470f, 0.098877f, 0.101310f, -0.030215f, -0.018018f, -0.059552f,
-        -0.106157f, 0.021866f, -0.036471f, 0.080051f, 0.041165f, -0.082101f,
-        0.117726f, 0.030961f, -0.054763f, -0.084102f, -0.185778f, -0.061305f,
-        -0.038089f, -0.110728f, -0.264010f, 0.076675f, -0.077111f, -0.137644f,
-        0.036232f, 0.277995f, 0.019116f, 0.107738f, 0.144003f, 0.080304f,
-        0.215036f, 0.228897f, 0.072713f, 0.077773f, 0.120168f, 0.075324f,
-        0.062730f, 0.122478f, -0.049008f, 0.164912f, 0.162450f, 0.041246f,
-        0.009891f, -0.097827f, -0.038700f, -0.023027f, -0.120020f, 0.203364f,
-        0.248474f, 0.149810f, -0.036276f, -0.082814f, -0.090343f, -0.027143f,
-        -0.075689f, -0.320310f, -0.000500f, -0.143334f, -0.065077f, -0.186936f,
-        0.129372f, 0.116431f, 0.181699f, 0.170436f, 0.418854f, 0.460045f,
-        0.333719f, 0.230515f, 0.047822f, -0.044954f, -0.068086f, 0.140179f,
-        -0.044821f, 0.085550f, 0.092483f, -0.107296f, -0.130670f, -0.206629f,
-        0.114601f, -0.317869f, -0.076663f, 0.038680f, 0.212753f, -0.016059f,
-        -0.126526f, -0.163602f, 0.210154f, 0.099887f, -0.126366f, 0.118453f,
-        0.019309f, -0.021611f, -0.096499f, -0.111809f, -0.200489f, 0.142854f,
-        0.228840f, -0.353346f, -0.179151f, 0.116834f, 0.252389f, -0.031728f,
-        -0.188135f, -0.158998f, 0.386523f, 0.122315f, 0.209944f, 0.394023f,
-        0.359030f, 0.260717f, 0.170335f, 0.013683f, -0.142596f, -0.026138f,
-        -0.011878f, -0.150519f, 0.047159f, -0.107062f, -0.147347f, -0.187689f,
-        -0.186027f, -0.208048f, 0.058468f, -0.073026f, -0.236556f, -0.079788f,
-        -0.146216f, -0.058563f, -0.101361f, -0.071294f, -0.071093f, 0.116919f,
-        0.234304f, 0.306781f, 0.321866f, 0.240000f, 0.073261f, -0.012173f,
-        0.026479f, 0.050173f, 0.166127f, 0.228955f, 0.061905f, 0.156460f,
-        0.205990f, 0.120672f, 0.037350f, 0.167884f, 0.290099f, 0.420900f,
-        -0.012601f, 0.189839f, 0.306378f, 0.118383f, -0.095598f, -0.072360f,
-        -0.132496f, -0.224259f, -0.126021f, 0.022714f, 0.284039f, 0.051369f,
-        -0.000927f, -0.058735f, -0.083354f, -0.141254f, -0.187578f, -0.202669f,
-        0.048902f, 0.246597f, 0.441863f, 0.342519f, 0.066979f, 0.215286f,
-        0.188191f, -0.072240f, -0.208142f, -0.030196f, 0.178141f, 0.136985f,
-        -0.043374f, -0.181098f, 0.091815f, 0.116177f, -0.126690f, -0.386625f,
-        0.368165f, 0.269149f, -0.088042f, -0.028823f, 0.092961f, 0.024099f,
-        0.046112f, 0.176756f, 0.135849f, 0.124955f, 0.195467f, -0.037218f,
-        0.167217f, 0.188938f, 0.053528f, -0.066561f, 0.133721f, -0.070565f,
-        0.115898f, 0.152435f, -0.116993f, -0.110592f, -0.179005f, 0.026668f,
-        0.080530f, 0.075084f, -0.070401f, 0.012497f, 0.021849f, -0.139764f,
-        -0.022020f, -0.096301f, -0.064954f, -0.127446f, -0.013806f, -0.108315f,
-        0.156285f, 0.149867f, -0.011382f, 0.064532f, 0.029168f, 0.027393f,
-        0.069716f, 0.153735f, 0.038459f, 0.230714f, 0.253840f, 0.059522f,
-        -0.045053f, 0.014083f, 0.071103f, 0.068747f, 0.095887f, 0.005832f,
-        0.144887f, 0.026357f, -0.067359f, -0.044151f, -0.123283f, -0.019911f,
-        0.005318f, 0.109208f, -0.003201f, -0.021734f, 0.142025f, -0.066907f,
-        -0.120070f, -0.188639f, 0.012472f, -0.048704f, -0.012366f, -0.184828f,
-        0.168591f, 0.267166f, 0.058208f, -0.044101f, 0.033500f, 0.178558f,
-        0.104550f, 0.122418f, 0.080177f, 0.173246f, 0.298537f, 0.064173f,
-        0.053397f, 0.174341f, 0.230984f, 0.117025f, 0.166242f, 0.227781f,
-        0.120623f, 0.176952f, -0.011393f, -0.086483f, -0.008270f, 0.051700f,
-        -0.153369f, -0.058837f, -0.057639f, -0.060115f, 0.026349f, -0.160745f,
-        -0.037894f, -0.048575f, 0.041052f, -0.022112f, 0.060365f, 0.051906f,
-        0.162657f, 0.138519f, -0.050185f, -0.005938f, 0.071301f, 0.127686f,
-        0.062342f, 0.144400f, 0.072600f, 0.198436f, 0.246219f, -0.078185f,
-        -0.036169f, 0.075934f, 0.047328f, -0.013601f, 0.087205f, 0.019900f,
-        0.022606f, -0.015365f, -0.092506f, 0.075275f, -0.116375f, 0.050500f,
-        0.045118f, 0.166567f, 0.072073f, 0.060371f, 0.131747f, -0.169863f,
-        -0.039352f, -0.047486f, -0.039797f, -0.204312f, 0.021710f, 0.129443f,
-        -0.021173f, 0.173416f, -0.070794f, -0.063986f, 0.069689f, -0.064099f,
-        -0.123201f, -0.017372f, -0.206870f, 0.065863f, 0.113226f, 0.024707f,
-        -0.071341f, -0.066964f, -0.098278f, -0.062927f, 0.075840f, 0.014716f,
-        0.019378f, 0.132699f, -0.074191f, -0.089557f, -0.078446f, -0.197488f,
-        -0.173665f, 0.052583f, 0.044361f, 0.113549f, 0.098492f, 0.077379f,
-        -0.011146f, -0.192593f, -0.164435f, 0.045568f, 0.205699f, 0.049187f,
-        -0.082281f, 0.134874f, 0.185499f, 0.034968f, -0.119561f, -0.112372f,
-        -0.115091f, -0.054042f, -0.183816f, -0.078100f, 0.190695f, 0.091617f,
-        0.004257f, -0.041135f, -0.061453f, -0.141592f, -0.194809f, -0.120638f,
-        0.020168f, 0.109672f, 0.067398f, -0.015238f, -0.239145f, -0.264671f,
-        -0.185176f, 0.050472f, 0.020793f, 0.035678f, 0.022839f, -0.052055f,
-        -0.127968f, -0.113049f, -0.228416f, -0.258281f, -0.053437f, 0.076424f,
-        0.061450f, 0.237478f, 0.003618f, -0.055865f, -0.108087f, -0.028937f,
-        0.045585f, 0.052829f, -0.001471f, 0.022826f, 0.059565f, -0.104430f,
-        -0.077266f, -0.211882f, -0.212078f, 0.028074f, 0.075846f, 0.016265f,
-        0.161879f, 0.134477f, 0.008935f, -0.048041f, 0.074692f, 0.004928f,
-        -0.025156f, 0.192874f, 0.074410f, 0.308732f, 0.267400f, 0.094208f,
-        -0.005251f, 0.042041f, -0.032148f, 0.015588f, 0.252869f, 0.175302f,
-        0.022892f, 0.081673f, 0.063208f, 0.162626f, 0.194426f, 0.233890f,
-        0.262292f, 0.186930f, 0.084079f, -0.286388f, -0.213034f, -0.048867f,
-        -0.207669f, -0.170050f, 0.011673f, -0.092958f, -0.192786f, -0.273536f,
-        0.230904f, 0.266732f, 0.320519f, 0.297155f, 0.548169f, 0.304922f,
-        0.132687f, 0.247333f, 0.212488f, -0.271472f, -0.142105f, -0.002627f,
-        -0.119215f, 0.128383f, 0.100079f, -0.057490f, -0.121902f, -0.228892f,
-        0.202292f, -0.399795f, -0.371326f, -0.095836f, -0.063626f, -0.161375f,
-        -0.311180f, -0.294797f, 0.242122f, 0.011788f, 0.095573f, 0.322523f,
-        0.511840f, 0.322880f, 0.313259f, 0.173331f, 0.002542f, -0.029802f,
-        0.324766f, -0.326170f, -0.340547f, -0.138288f, -0.002963f, -0.114060f,
-        -0.377312f, -0.442570f, 0.212446f, -0.007759f, -0.011576f, 0.169711f,
-        0.308689f, 0.317348f, 0.539390f, 0.332845f, 0.057331f, -0.068180f,
-        0.101994f, 0.266995f, 0.209570f, 0.355730f, 0.091635f, 0.170238f,
-        0.125215f, 0.274154f, 0.070223f, 0.025515f, 0.049946f, -0.000550f,
-        0.043715f, -0.141843f, 0.020844f, 0.129871f, 0.256588f, 0.105015f,
-        0.148339f, 0.170682f, 0.028792f, 0.074037f, 0.160042f, 0.405137f,
-        0.246187f, 0.352160f, 0.168951f, 0.222263f, 0.264439f, 0.065945f,
-        0.021963f, -0.075084f, 0.093105f, 0.027318f, 0.098864f, 0.057566f,
-        -0.080282f, 0.185032f, 0.314419f, 0.333727f, 0.125798f, 0.294919f,
-        0.386002f, 0.217619f, -0.183517f, -0.278622f, -0.002342f, -0.027821f,
-        -0.134266f, -0.331843f, -0.008296f, 0.124564f, 0.053712f, -0.369016f,
-        -0.095036f, 0.209381f, 0.423760f, 0.371760f, 0.106397f, 0.369408f,
-        0.485608f, 0.231201f, -0.138685f, -0.349208f, -0.070083f, 0.028991f,
-        -0.081630f, -0.395992f, -0.146791f, -0.027354f, 0.063396f, -0.272484f,
-        0.058299f, 0.338207f, 0.110767f, -0.052642f, -0.233848f, -0.027448f,
-        0.030328f, 0.155572f, -0.093826f, 0.019331f, 0.120638f, 0.006292f,
-        -0.106083f, -0.236290f, -0.140933f, -0.088067f, -0.025138f, -0.208395f,
-        -0.025502f, 0.144192f, -0.048353f, -0.106144f, -0.305121f, -0.114147f,
-        0.090963f, 0.327727f, 0.035606f, -0.093779f, 0.002651f, -0.171081f,
-        -0.188131f, -0.216571f, -0.209101f, -0.054402f, 0.157147f, -0.057127f,
-        0.066584f, 0.008988f, 0.041191f, 0.034456f, -0.078255f, 0.052099f,
-        -0.022239f, 0.066981f, -0.117520f, -0.072637f, 0.062512f, 0.037570f,
-        -0.057544f, -0.312359f, 0.034357f, -0.031549f, 0.002566f, -0.207375f,
-        -0.070654f, -0.018786f, -0.044815f, -0.012814f, -0.076320f, 0.078183f,
-        0.023877f, 0.117078f, 0.022292f, -0.205424f, -0.060430f, -0.017296f,
-        -0.004827f, -0.321036f, -0.092155f, 0.038837f, 0.073190f, -0.067513f,
-        0.026521f, 0.171945f, 0.087318f, 0.034495f, -0.034089f, 0.154410f,
-        -0.061431f, 0.007435f, -0.111094f, -0.095976f, 0.014741f, -0.132324f,
-        -0.029517f, -0.192160f, 0.098667f, 0.020762f, 0.177050f, -0.064510f,
-        -0.054437f, -0.058678f, -0.001858f, 0.167602f, 0.015735f, 0.054338f,
-        0.016477f, 0.186381f, -0.010667f, 0.054692f, 0.126742f, 0.013140f,
-        0.090353f, -0.133608f, -0.018017f, -0.152619f, 0.027600f, -0.138700f,
-        -0.050274f, 0.045141f, -0.118731f, 0.094797f, -0.167605f, 0.097461f,
-        -0.009131f, 0.199920f, -0.052976f, 0.158194f, 0.178568f, -0.107600f,
-        0.009671f, -0.084072f, -0.040258f, -0.205673f, 0.102891f, 0.223511f,
-        0.042699f, 0.118548f, -0.021274f, 0.110997f, -0.155121f, 0.027696f,
-        -0.149968f, 0.051552f, -0.129219f, 0.173524f, 0.073972f, -0.189045f,
-        -0.034523f, -0.106655f, -0.011843f, -0.197381f, 0.219413f, 0.183197f,
-        -0.054920f, 0.144955f, 0.036517f, -0.085412f, -0.229070f, -0.143710f,
-        -0.049486f, 0.156634f, -0.008673f, -0.064778f, 0.082344f, 0.145673f,
-        0.002912f, -0.210121f, -0.116564f, 0.078425f, 0.220908f, -0.067594f,
-        0.048610f, 0.084912f, -0.066202f, -0.112515f, -0.217767f, -0.082640f,
-        -0.017414f, 0.230265f, -0.070735f, 0.066073f, 0.215256f, 0.071157f,
-        -0.087220f, -0.202235f, -0.011918f, 0.099562f, 0.174716f, -0.063845f,
-        -0.121055f, 0.014367f, 0.132709f, -0.005060f, -0.244606f, -0.179693f,
-        -0.134690f, 0.023239f, -0.193116f, -0.076975f, -0.021164f, -0.001938f,
-        -0.163799f, -0.111437f, -0.210362f, -0.166376f, 0.034754f, 0.010036f,
-        -0.021917f, 0.068014f, -0.086893f, -0.251746f, -0.267171f, 0.037383f,
-        0.003966f, 0.033571f, -0.151506f, 0.025437f, -0.020626f, -0.308454f,
-        -0.343143f, -0.092263f, -0.026261f, -0.028345f, 0.036036f, 0.035169f,
-        0.129470f, 0.122205f, 0.015661f, -0.070612f, -0.094333f, -0.066055f,
-        -0.041083f, 0.159146f, 0.073184f, 0.110044f, 0.174471f, 0.078069f,
-        -0.014881f, 0.008116f, 0.013209f, 0.075857f, 0.195605f, 0.062714f,
-        0.067955f, 0.056544f, -0.153908f, -0.141749f, -0.072550f, 0.033523f,
-        -0.024665f, 0.134487f, 0.079076f, 0.133562f, 0.227130f, 0.018054f,
-        0.004928f, 0.169162f, 0.065152f, 0.072160f, 0.131631f, 0.096303f,
-        0.054288f, 0.106256f, 0.114632f, 0.119038f, 0.515200f, 0.247429f,
-        0.199134f, 0.211957f, 0.127558f, -0.294684f, -0.194890f, -0.049988f,
-        -0.112247f, -0.008122f, -0.006176f, 0.037035f, -0.110881f, -0.249989f,
-        0.152434f, 0.234621f, 0.153340f, 0.349283f, 0.683049f, 0.157174f,
-        0.124844f, 0.099136f, 0.064407f, -0.248400f, -0.155323f, -0.026498f,
-        -0.023450f, 0.049051f, -0.114187f, 0.007195f, -0.176825f, -0.376926f,
-        0.366159f, -0.179938f, -0.148508f, 0.006043f, 0.170048f, 0.097866f,
-        -0.102658f, -0.260430f, 0.248868f, 0.037019f, -0.118111f, 0.078176f,
-        0.194171f, 0.211328f, 0.368612f, 0.361213f, 0.130013f, 0.094650f,
-        0.227396f, -0.178058f, -0.114782f, -0.008093f, 0.231080f, -0.011843f,
-        -0.097917f, -0.325788f, 0.141879f, 0.119738f, -0.230427f, -0.117419f,
-        -0.114153f, 0.037903f, 0.116383f, 0.218773f, -0.101884f, 0.059466f,
-        0.119255f, 0.010874f, -0.031449f, 0.045996f, 0.119931f, 0.273760f,
-        0.311700f, 0.261794f, 0.194809f, 0.339829f, 0.239449f, 0.064140f,
-        0.077597f, 0.098996f, 0.143534f, 0.184602f, 0.037507f, 0.225494f,
-        0.096142f, -0.147370f, -0.207833f, -0.174742f, -0.086391f, -0.038942f,
-        0.159577f, -0.088492f, -0.000989f, 0.108154f, -0.025890f, -0.072713f,
-        0.025997f, -0.006803f, -0.086879f, -0.011290f, -0.269200f, -0.103450f,
-        -0.124910f, -0.116340f, 0.141459f, 0.208800f, 0.042268f, 0.265034f,
-        0.516474f, 0.217591f, -0.018843f, -0.313328f, -0.168363f, 0.047129f,
-        0.090480f, -0.109852f, -0.018761f, 0.210669f, 0.281269f, -0.043591f,
-        -0.034147f, -0.237772f, -0.134843f, -0.072481f, -0.103831f, 0.038355f,
-        0.308619f, 0.148023f, -0.045867f, -0.123950f, -0.210860f, -0.064973f,
-        -0.036308f, -0.046731f, -0.022099f, 0.095776f, 0.409423f, 0.060635f,
-        -0.065196f, 0.051828f, 0.027981f, -0.009609f, -0.137681f, -0.095011f,
-        -0.019045f, 0.177278f, 0.009759f, -0.092119f, -0.016958f, -0.133860f,
-        -0.118421f, -0.032039f, -0.006214f, -0.084541f, 0.063971f, -0.073642f,
-        0.165676f, 0.110443f, 0.044131f, 0.046568f, 0.053292f, -0.055466f,
-        0.015512f, 0.371947f, 0.232102f, -0.016923f, 0.103979f, -0.091758f,
-        0.005907f, 0.209100f, 0.157433f, 0.030518f, 0.250366f, 0.062322f,
-        0.036720f, 0.094676f, 0.017306f, -0.010328f, -0.079012f, 0.016781f,
-        -0.112435f, 0.061795f, 0.042543f, -0.126799f, -0.009975f, -0.056760f,
-        0.046424f, -0.194712f, -0.139399f, -0.037731f, 0.157989f, -0.016261f,
-        0.123345f, 0.230563f, 0.083300f, -0.016392f, 0.059567f, -0.016035f,
-        -0.064767f, 0.231945f, 0.156629f, 0.034602f, 0.145628f, 0.041315f,
-        0.034535f, 0.019967f, -0.089188f, -0.012091f, 0.307857f, 0.211405f,
-        -0.025091f, -0.148249f, -0.129384f, 0.063536f, -0.068603f, -0.067941f,
-        -0.035104f, 0.210832f, 0.063810f, 0.062764f, -0.089889f, -0.030554f,
-        0.014791f, -0.053362f, -0.037818f, -0.196640f, 0.008388f, -0.082654f,
-        0.143056f, 0.064221f, 0.069795f, 0.191040f, 0.097321f, -0.028679f,
-        0.075794f, 0.313154f, 0.086240f, 0.207643f, 0.017809f, 0.122867f,
-        0.224586f, 0.167403f, -0.023884f, 0.047434f, 0.344091f, 0.187745f,
-        0.136177f, 0.141738f, 0.063799f, 0.045233f, -0.077342f, -0.003525f,
-        -0.165041f, -0.025616f, -0.073745f, 0.164439f, 0.011200f, -0.145896f,
-        -0.027954f, -0.061987f, -0.039874f, -0.142775f, 0.151042f, -0.038238f,
-        0.053152f, 0.078615f, 0.086061f, 0.100593f, 0.128046f, -0.071006f,
-        -0.116558f, 0.208445f, 0.051086f, 0.076843f, 0.023191f, -0.084781f,
-        -0.011790f, 0.147807f, -0.048554f, -0.113932f, 0.283322f, 0.190934f,
-        0.092789f, 0.033018f, -0.142428f, -0.142480f, -0.099023f, -0.041020f,
-        -0.042760f, 0.203295f, -0.053475f, 0.042424f, 0.222839f, -0.019167f,
-        -0.133176f, -0.276216f, -0.031998f, 0.117290f, 0.177827f, -0.059973f,
-        -0.064744f, -0.117040f, -0.155482f, -0.099531f, 0.164121f, -0.026682f,
-        -0.093810f, 0.238993f, -0.006506f, 0.007830f, 0.065819f, -0.203643f,
-        -0.100925f, -0.053652f, -0.130770f, 0.026277f, 0.131796f, 0.032742f,
-        0.127186f, 0.116694f, -0.161122f, -0.279773f, -0.252515f, -0.002638f,
-        0.042812f, 0.096776f, -0.123280f, 0.064858f, -0.010455f, -0.219760f,
-        -0.239331f, -0.104363f, -0.058022f, -0.053584f, 0.025611f, 0.005129f,
-        -0.100418f, -0.045712f, -0.194418f, -0.126366f, -0.030530f, 0.051168f,
-        0.215959f, 0.172402f, -0.054700f, -0.185995f, -0.278360f, -0.193693f,
-        -0.040309f, 0.003735f, -0.007770f, 0.123556f, 0.190179f, -0.077315f,
-        0.117403f, 0.212942f, 0.012160f, 0.000113f, 0.027331f, 0.040202f,
-        0.033293f, 0.219438f, 0.184174f, 0.259349f, 0.311206f, 0.082547f,
-        -0.047875f, -0.078417f, 0.010746f, 0.082620f, 0.311931f, 0.307605f,
-        0.003863f, 0.021405f, -0.026388f, -0.019572f, 0.020582f, -0.059353f,
-        0.025199f, 0.261319f, 0.086316f, 0.143614f, 0.107780f, 0.003900f,
-        -0.188397f, -0.038563f, -0.106045f, -0.125154f, -0.010509f, 0.054021f,
-        0.242130f, 0.279152f, 0.215546f, 0.346995f, 0.440856f, 0.237452f,
-        0.234154f, 0.301646f, 0.168929f, -0.208358f, -0.126848f, 0.010260f,
-        0.121018f, -0.062975f, -0.052848f, 0.050341f, -0.061103f, -0.266482f,
-        0.107186f, 0.140221f, 0.280065f, 0.287889f, 0.373198f, 0.151596f,
-        0.013593f, 0.115616f, 0.014616f, -0.281710f, -0.237597f, -0.117305f,
-        -0.000034f, -0.136739f, -0.196275f, -0.095225f, -0.125310f, -0.250514f,
-        0.236804f, -0.071805f, -0.037421f, 0.048230f, 0.321596f, 0.063632f,
-        0.024039f, -0.029133f, 0.230983f, 0.160593f, -0.154355f, -0.013086f,
-        -0.079929f, 0.094692f, 0.160391f, 0.180239f, 0.053895f, 0.100759f,
-        0.288631f, 0.038191f, 0.181692f, 0.229682f, 0.440166f, 0.063401f,
-        0.006273f, 0.020865f, 0.338695f, 0.256244f, -0.043927f, 0.115617f,
-        0.003296f, 0.173965f, 0.021318f, -0.040936f, -0.118932f, 0.182380f,
-        0.235922f, -0.053233f, -0.015053f, -0.101057f, 0.095341f, 0.051111f,
-        0.161831f, 0.032614f, 0.159496f, 0.072375f, 0.025089f, 0.023748f,
-        0.029151f, 0.161284f, -0.117717f, -0.036191f, -0.176822f, -0.162006f,
-        0.226542f, -0.078329f, 0.043079f, -0.119172f, 0.054614f, -0.101365f,
-        -0.064541f, -0.115304f, 0.135170f, 0.298872f, 0.098060f, 0.089428f,
-        -0.007497f, 0.110391f, -0.028824f, 0.020835f, -0.036804f, 0.125411f,
-        0.192105f, -0.048931f, 0.003086f, -0.010681f, 0.074698f, -0.016263f,
-        0.096063f, 0.060267f, -0.007277f, 0.139139f, -0.080635f, 0.036628f,
-        0.086058f, 0.131979f, 0.085707f, 0.025301f, 0.226094f, 0.194759f,
-        0.042193f, -0.157846f, -0.068402f, -0.141450f, -0.112659f, -0.076305f,
-        -0.069085f, -0.114332f, -0.102005f, 0.132193f, -0.067042f, 0.106643f,
-        0.198964f, 0.171616f, 0.167237f, -0.033730f, -0.026755f, 0.083621f,
-        0.149459f, -0.002799f, -0.000318f, 0.011753f, 0.065889f, -0.089375f,
-        -0.049610f, 0.224579f, 0.216548f, -0.034908f, -0.017851f, -0.088144f,
-        0.007530f, 0.240268f, 0.073270f, 0.013263f, 0.175323f, 0.012082f,
-        0.093993f, 0.015282f, 0.105854f, 0.107990f, 0.077798f, -0.096166f,
-        -0.079607f, 0.177820f, 0.142392f, 0.033337f, -0.078100f, -0.081616f,
-        -0.046993f, 0.139459f, 0.020272f, -0.123161f, 0.175269f, 0.105217f,
-        0.057328f, 0.080909f, -0.012612f, -0.097081f, 0.082060f, -0.096716f,
-        -0.063921f, 0.201884f, 0.128166f, -0.035051f, -0.032227f, -0.068139f,
-        -0.115915f, 0.095080f, -0.086007f, -0.067543f, 0.030776f, 0.032712f,
-        0.088937f, 0.054336f, -0.039329f, -0.114022f, 0.171672f, -0.112321f,
-        -0.217646f, 0.065186f, 0.060223f, 0.192174f, 0.055580f, -0.131107f,
-        -0.144338f, 0.056730f, -0.034707f, -0.081616f, -0.135298f, -0.000614f,
-        0.087189f, 0.014614f, 0.067709f, 0.107689f, 0.225780f, 0.084361f,
-        -0.008544f, 0.051649f, -0.048369f, -0.037739f, -0.060710f, 0.002654f,
-        0.016935f, 0.085563f, -0.015961f, -0.019265f, 0.111788f, 0.062376f,
-        0.202019f, 0.047713f, 0.042261f, 0.069716f, 0.242913f, 0.021052f,
-        -0.072812f, -0.155920f, -0.026436f, 0.035621f, -0.079300f, -0.028787f,
-        -0.048329f, 0.084718f, -0.060565f, -0.083750f, -0.164075f, -0.040742f,
-        -0.086219f, 0.015271f, -0.005204f, -0.016038f, 0.045816f, -0.050433f,
-        -0.077652f, 0.117109f, 0.009611f, -0.009045f, -0.008634f, -0.055373f,
-        -0.085968f, 0.028527f, -0.054736f, -0.168089f, 0.175839f, 0.071205f,
-        -0.023603f, 0.037907f, -0.004561f, -0.022634f, 0.123831f, 0.094469f,
-        -0.072920f, -0.133642f, -0.014032f, -0.142754f, -0.026999f, -0.199409f,
-        0.013268f, 0.226989f, 0.048650f, -0.170988f, -0.050141f, 0.007880f,
-        0.061880f, 0.019078f, -0.043578f, -0.038139f, 0.134814f, 0.054097f,
-        -0.081670f, 0.176838f, 0.047920f, -0.038176f, 0.050406f, -0.107181f,
-        -0.036279f, 0.027060f, 0.081594f, -0.002820f, 0.090507f, -0.033338f,
-        -0.059571f, 0.013404f, -0.099860f, 0.073371f, 0.342805f, 0.098305f,
-        -0.150910f, -0.020822f, -0.056960f, 0.046262f, -0.043413f, -0.149405f,
-        -0.129105f, -0.010899f, -0.014229f, -0.179949f, -0.113044f, -0.049468f,
-        -0.065513f, 0.090269f, -0.011919f, 0.087846f, 0.095796f, 0.146127f,
-        0.101599f, 0.078066f, -0.084348f, -0.100002f, -0.020134f, -0.050169f,
-        0.062122f, 0.014640f, 0.019143f, 0.036543f, 0.180924f, -0.013976f,
-        -0.066768f, -0.001090f, -0.070419f, -0.004839f, -0.001504f, 0.034483f,
-        -0.044954f, -0.050336f, -0.088638f, -0.174782f, -0.116082f, -0.205507f,
-        0.015587f, -0.042839f, -0.096879f, -0.144097f, -0.050268f, -0.196796f,
-        0.109639f, 0.271411f, 0.173732f, 0.108070f, 0.156437f, 0.124255f,
-        0.097242f, 0.238693f, 0.083941f, 0.109105f, 0.223940f, 0.267188f,
-        0.027385f, 0.025819f, 0.125070f, 0.093738f, 0.040353f, 0.038645f,
-        -0.012730f, 0.144063f, 0.052931f, -0.009138f, 0.084193f, 0.160272f,
-        -0.041366f, 0.011951f, -0.121446f, -0.106713f, -0.047566f, 0.047984f,
-        -0.255224f, -0.076116f, 0.098685f, -0.150845f, -0.171513f, -0.156590f,
-        0.058331f, 0.187493f, 0.413018f, 0.554265f, 0.372242f, 0.237943f,
-        0.124571f, 0.110829f, 0.010322f, -0.174477f, -0.067627f, -0.001979f,
-        0.142913f, 0.040597f, 0.019907f, 0.025963f, -0.043585f, -0.120732f,
-        0.099937f, 0.091059f, 0.247307f, 0.204226f, -0.042753f, -0.068580f,
-        -0.119002f, 0.026722f, 0.034853f, -0.060934f, -0.025054f, -0.093026f,
-        -0.035372f, -0.233209f, -0.049869f, -0.039151f, -0.022279f, -0.065380f,
-        -9.063785f };
-    return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
-}
-
-
-
-
-std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
-{
-    static const float detector[] = {
-       0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
-       0.11547081f, -0.04268804f, 0.04635834f, -0.05468199f, 0.08232084f,
-       0.10424068f, -0.02294518f, 0.01108519f, 0.01378693f, 0.11193510f,
-       0.01268418f, 0.08528346f, -0.06309239f, 0.13054633f, 0.08100729f,
-       -0.05209739f, -0.04315529f, 0.09341384f, 0.11035026f, -0.07596218f,
-       -0.05517511f, -0.04465296f, 0.02947334f, 0.04555536f,
-       -3.55954492e-003f, 0.07818956f, 0.07730991f, 0.07890715f, 0.06222893f,
-       0.09001380f, -0.03574381f, 0.03414327f, 0.05677258f, -0.04773581f,
-       0.03746637f, -0.03521175f, 0.06955440f, -0.03849038f, 0.01052293f,
-       0.01736112f, 0.10867710f, 0.08748853f, 3.29739624e-003f, 0.10907028f,
-       0.07913758f, 0.10393070f, 0.02091867f, 0.11594022f, 0.13182420f,
-       0.09879354f, 0.05362710f, -0.06745391f, -7.01260753e-003f,
-       5.24702156e-003f, 0.03236255f, 0.01407916f, 0.02207983f, 0.02537322f,
-       0.04547948f, 0.07200756f, 0.03129894f, -0.06274468f, 0.02107014f,
-       0.06035208f, 0.08636236f, 4.53164103e-003f, 0.02193363f, 0.02309801f,
-       0.05568166f, -0.02645093f, 0.04448695f, 0.02837519f, 0.08975694f,
-       0.04461516f, 0.08975355f, 0.07514391f, 0.02306982f, 0.10410084f,
-       0.06368385f, 0.05943464f, 4.58420580e-003f, 0.05220337f, 0.06675851f,
-       0.08358569f, 0.06712101f, 0.06559004f, -0.03930482f, -9.15936660e-003f,
-       -0.05897915f, 0.02816453f, 0.05032348f, 0.06780671f, 0.03377650f,
-       -6.09417039e-004f, -0.01795146f, -0.03083684f, -0.01302475f,
-       -0.02972313f, 7.88706727e-003f, -0.03525961f, -2.50397739e-003f,
-       0.05245084f, 0.11791293f, -0.02167498f, 0.05299332f, 0.06640524f,
-       0.05190265f, -8.27316567e-003f, 0.03033127f, 0.05842173f,
-       -4.01050318e-003f, -6.25105947e-003f, 0.05862958f, -0.02465461f,
-       0.05546781f, -0.08228195f, -0.07234028f, 0.04640540f, -0.01308254f,
-       -0.02506191f, 0.03100746f, -0.04665651f, -0.04591486f, 0.02949927f,
-       0.06035462f, 0.02244646f, -0.01698639f, 0.01040041f, 0.01131170f,
-       0.05419579f, -0.02130277f, -0.04321722f, -0.03665198f, 0.01126490f,
-       -0.02606488f, -0.02228328f, -0.02255680f, -0.03427236f,
-       -7.75165204e-003f, -0.06195229f, 8.21638294e-003f, 0.09535975f,
-       -0.03709979f, -0.06942501f, 0.14579427f, -0.05448192f, -0.02055904f,
-       0.05747357f, 0.02781788f, -0.07077577f, -0.05178314f, -0.10429011f,
-       -0.11235505f, 0.07529039f, -0.07559302f, -0.08786739f, 0.02983843f,
-       0.02667585f, 0.01382199f, -0.01797496f, -0.03141199f, -0.02098101f,
-       0.09029204f, 0.04955018f, 0.13718739f, 0.11379953f, 1.80019124e-003f,
-       -0.04577610f, -1.11108483e-003f, -0.09470536f, -0.11596080f,
-       0.04489342f, 0.01784211f, 3.06850672e-003f, 0.10781866f,
-       3.36498418e-003f, -0.10842580f, -0.07436839f, -0.10535070f,
-       -0.01866805f, 0.16057891f, -5.07316366e-003f, -0.04295658f,
-       -5.90488780e-003f, 8.82003549e-003f, -0.01492646f, -0.05029279f,
-       -0.12875880f, 8.78831954e-004f, -0.01297184f, -0.07592774f,
-       -0.02668831f, -6.93787413e-004f, 0.02406698f, -0.01773298f,
-       -0.03855745f, -0.05877856f, 0.03259695f, 0.12826584f, 0.06292590f,
-       -4.10733931e-003f, 0.10996531f, 0.01332991f, 0.02088735f, 0.04037504f,
-       -0.05210760f, 0.07760046f, 0.06399347f, -0.05751930f, -0.10053057f,
-       0.07505023f, -0.02139782f, 0.01796176f, 2.34400877e-003f, -0.04208319f,
-       0.07355055f, 0.05093350f, -0.02996780f, -0.02219072f, 0.03355330f,
-       0.04418742f, -0.05580705f, -0.05037573f, -0.04548179f, 0.01379514f,
-       0.02150671f, -0.02194211f, -0.13682702f, 0.05464972f, 0.01608082f,
-       0.05309116f, 0.04701022f, 1.33690401e-003f, 0.07575664f, 0.09625306f,
-       8.92647635e-003f, -0.02819123f, 0.10866830f, -0.03439325f,
-       -0.07092371f, -0.06004780f, -0.02712298f, -7.07467366e-003f,
-       -0.01637020f, 0.01336790f, -0.10313606f, 0.04906582f, -0.05732445f,
-       -0.02731079f, 0.01042235f, -0.08340668f, 0.03686501f, 0.06108340f,
-       0.01322748f, -0.07809529f, 0.03774724f, -0.03413248f, -0.06096525f,
-       -0.04212124f, -0.07982176f, -1.25973229e-003f, -0.03045501f,
-       -0.01236493f, -0.06312395f, 0.04789570f, -0.04602066f, 0.08576570f,
-       0.02521080f, 0.02988098f, 0.10314583f, 0.07060035f, 0.04520544f,
-       -0.04426654f, 0.13146530f, 0.08386490f, 0.02164590f, -2.12280243e-003f,
-       -0.03686353f, -0.02074944f, -0.03829959f, -0.01530596f, 0.02689708f,
-       0.11867401f, -0.06043470f, -0.02785023f, -0.04775074f, 0.04878745f,
-       0.06350956f, 0.03494788f, 0.01467400f, 1.17890188e-003f, 0.04379614f,
-       2.03681854e-003f, -0.03958609f, -0.01072688f, 6.43705716e-003f,
-       0.02996500f, -0.03418507f, -0.01960307f, -0.01219154f,
-       -4.37000440e-003f, -0.02549453f, 0.02646318f, -0.01632513f,
-       6.46516960e-003f, -0.01929734f, 4.78711911e-003f, 0.04962371f,
-       0.03809111f, 0.07265724f, 0.05758125f, -0.03741554f, 0.01648608f,
-       -8.45285598e-003f, 0.03996826f, -0.08185477f, 0.02638875f,
-       -0.04026615f, -0.02744674f, -0.04071517f, 1.05096330e-003f,
-       -0.04741232f, -0.06733172f, 8.70434940e-003f, -0.02192543f,
-       1.35350740e-003f, -0.03056974f, -0.02975521f, -0.02887780f,
-       -0.01210713f, -0.04828526f, -0.09066251f, -0.09969629f, -0.03665164f,
-       -8.88111943e-004f, -0.06826669f, -0.01866150f, -0.03627640f,
-       -0.01408288f, 0.01874239f, -0.02075835f, 0.09145175f, -0.03547291f,
-       0.05396780f, 0.04198981f, 0.01301925f, -0.03384354f, -0.12201976f,
-       0.06830920f, -0.03715654f, 9.55848210e-003f, 5.05685573e-003f,
-       0.05659294f, 3.90764466e-003f, 0.02808490f, -0.05518097f, -0.03711621f,
-       -0.02835565f, -0.04420464f, -0.01031947f, 0.01883466f,
-       -8.49525444e-003f, -0.09419250f, -0.01269387f, -0.02133371f,
-       -0.10190815f, -0.07844430f, 2.43644323e-003f, -4.09610150e-003f,
-       0.01202551f, -0.06452291f, -0.10593818f, -0.02464746f, -0.02199699f,
-       -0.07401930f, 0.07285886f, 8.87513801e-004f, 9.97662079e-003f,
-       8.46779719e-003f, 0.03730333f, -0.02905126f, 0.03573337f, -0.04393689f,
-       -0.12014472f, 0.03176554f, -2.76015815e-003f, 0.10824566f, 0.05090732f,
-       -3.30179278e-003f, -0.05123822f, 5.04784798e-003f, -0.05664124f,
-       -5.99415926e-003f, -0.05341901f, -0.01221393f, 0.01291318f,
-       9.91760660e-003f, -7.56987557e-003f, -0.06193124f, -2.24549137e-003f,
-       0.01987562f, -0.02018840f, -0.06975540f, -0.06601523f, -0.03349112f,
-       -0.08910118f, -0.03371435f, -0.07406893f, -0.02248047f, -0.06159951f,
-       2.77751544e-003f, -0.05723337f, -0.04792468f, 0.07518548f,
-       2.77279224e-003f, 0.04211938f, 0.03100502f, 0.05278448f, 0.03954679f,
-       -0.03006846f, -0.03851741f, -0.02792403f, -0.02875333f, 0.01531280f,
-       0.02186953f, -0.01989829f, 2.50679464e-003f, -0.10258728f,
-       -0.04785743f, -0.02887216f, 3.85063468e-003f, 0.01112236f,
-       8.29218887e-003f, -0.04822981f, -0.04503597f, -0.03713100f,
-       -0.06988008f, -0.11002295f, -2.69209221e-003f, 1.85383670e-003f,
-       -0.05921049f, -0.06105053f, -0.08458050f, -0.04527602f,
-       8.90329306e-004f, -0.05875023f, -2.68602883e-003f, -0.01591195f,
-       0.03631859f, 0.05493166f, 0.07300330f, 5.53333294e-003f, 0.06400407f,
-       0.01847740f, -5.76280477e-003f, -0.03210877f, 4.25160583e-003f,
-       0.01166520f, -1.44864211e-003f, 0.02253744f, -0.03367080f, 0.06983195f,
-       -4.22323542e-003f, -8.89401045e-003f, -0.07943393f, 0.05199728f,
-       0.06065201f, 0.04133492f, 1.44032843e-003f, -0.09585235f, -0.03964731f,
-       0.04232114f, 0.01750465f, -0.04487902f, -7.59733608e-003f, 0.02011171f,
-       0.04673622f, 0.09011173f, -0.07869188f, -0.04682482f, -0.05080139f,
-       -3.99383716e-003f, -0.05346331f, 0.01085723f, -0.03599333f,
-       -0.07097908f, 0.03551549f, 0.02680387f, 0.03471529f, 0.01790393f,
-       0.05471273f, 9.62048303e-003f, -0.03180215f, 0.05864431f, 0.02330614f,
-       0.01633144f, -0.05616681f, -0.10245429f, -0.08302189f, 0.07291322f,
-       -0.01972590f, -0.02619633f, -0.02485327f, -0.04627592f,
-       1.48853404e-003f, 0.05514185f, -0.01270860f, -0.01948900f, 0.06373586f,
-       0.05002292f, -0.03009798f, 8.76216311e-003f, -0.02474238f,
-       -0.05504891f, 1.74034527e-003f, -0.03333667f, 0.01524987f, 0.11663762f,
-       -1.32344989e-003f, -0.06608453f, 0.05687166f, -6.89525274e-004f,
-       -0.04402352f, 0.09450210f, -0.04222684f, -0.05360983f, 0.01779531f,
-       0.02561388f, -0.11075410f, -8.77790991e-003f, -0.01099504f,
-       -0.10380266f, 0.03103457f, -0.02105741f, -0.07371717f, 0.05146710f,
-       0.10581432f, -0.08617968f, -0.02892107f, 0.01092199f, 0.14551543f,
-       -2.24320893e-003f, -0.05818033f, -0.07390742f, 0.05701261f,
-       0.12937020f, -0.04986651f, 0.10182415f, 0.05028650f, 0.12515625f,
-       0.09175041f, 0.06404983f, 0.01523394f, 0.09460562f, 0.06106631f,
-       -0.14266998f, -0.02926703f, 0.02762171f, 0.02164151f,
-       -9.58488265e-004f, -0.04231362f, -0.09866509f, 0.04322244f,
-       0.05872034f, -0.04838847f, 0.06319253f, 0.02443798f, -0.03606876f,
-       9.38737206e-003f, 0.04289991f, -0.01027411f, 0.08156885f, 0.08751175f,
-       -0.13191354f, 8.16054735e-003f, -0.01452161f, 0.02952677f, 0.03615945f,
-       -2.09128903e-003f, 0.02246693f, 0.09623287f, 0.09412123f, -0.02924758f,
-       -0.07815186f, -0.02203079f, -2.02566991e-003f, 0.01094733f,
-       -0.01442332f, 0.02838561f, 0.11882371f, 7.28798332e-003f, -0.10345965f,
-       0.07561217f, -0.02049661f, 4.44177445e-003f, 0.01609347f, -0.04893158f,
-       -0.08758243f, -7.67420698e-003f, 0.08862378f, 0.06098121f, 0.06565887f,
-       7.32981879e-003f, 0.03558407f, -0.03874352f, -0.02490055f,
-       -0.06771075f, 0.09939223f, -0.01066077f, 0.01382995f, -0.07289080f,
-       7.47184316e-003f, 0.10621431f, -0.02878659f, 0.02383525f, -0.03274646f,
-       0.02137008f, 0.03837290f, 0.02450992f, -0.04296818f, -0.02895143f,
-       0.05327370f, 0.01499020f, 0.04998732f, 0.12938657f, 0.09391870f,
-       0.04292390f, -0.03359194f, -0.06809492f, 0.01125796f, 0.17290455f,
-       -0.03430733f, -0.06255233f, -0.01813114f, 0.11726857f, -0.06127599f,
-       -0.08677909f, -0.03429872f, 0.04684938f, 0.08161420f, 0.03538774f,
-       0.01833884f, 0.11321855f, 0.03261845f, -0.04826299f, 0.01752407f,
-       -0.01796414f, -0.10464549f, -3.30041884e-003f, 2.29343961e-004f,
-       0.01457292f, -0.02132982f, -0.02602923f, -9.87351313e-003f,
-       0.04273872f, -0.02103316f, -0.07994065f, 0.02614958f, -0.02111666f,
-       -0.06964913f, -0.13453490f, -0.06861878f, -6.09341264e-003f,
-       0.08251446f, 0.15612499f, 2.46531400e-003f, 8.88424646e-003f,
-       -0.04152999f, 0.02054853f, 0.05277953f, -0.03087788f, 0.02817579f,
-       0.13939077f, 0.07641046f, -0.03627627f, -0.03015098f, -0.04041540f,
-       -0.01360690f, -0.06227205f, -0.02738223f, 0.13577610f, 0.15235767f,
-       -0.05392922f, -0.11175954f, 0.02157129f, 0.01146481f, -0.05264937f,
-       -0.06595174f, -0.02749175f, 0.11812254f, 0.17404149f, -0.06137035f,
-       -0.11003478f, -0.01351621f, -0.01745916f, -0.08577441f, -0.04469909f,
-       -0.06106115f, 0.10559758f, 0.20806813f, -0.09174948f, 7.09621934e-004f,
-       0.03579374f, 0.07215115f, 0.02221742f, 0.01827742f, -7.90785067e-003f,
-       0.01489554f, 0.14519960f, -0.06425831f, 0.02990399f, -1.80181325e-003f,
-       -0.01401528f, -0.04171134f, -3.70530109e-003f, -0.09090481f,
-       0.09520713f, 0.08845516f, -0.02651753f, -0.03016730f, 0.02562448f,
-       0.03563816f, -0.03817881f, 0.01433385f, 0.02256983f, 0.02872120f,
-       0.01001934f, -0.06332260f, 0.04338406f, 0.07001807f, -0.04705722f,
-       -0.07318907f, 0.02630457f, 0.03106382f, 0.06648342f, 0.10913180f,
-       -0.01630815f, 0.02910308f, 0.02895109f, 0.08040254f, 0.06969310f,
-       0.06797734f, 6.08639978e-003f, 4.16588830e-003f, 0.08926726f,
-       -0.03123648f, 0.02700146f, 0.01168734f, -0.01631594f, 4.61015804e-003f,
-       8.51359498e-003f, -0.03544224f, 0.03571994f, 4.29766066e-003f,
-       -0.01970077f, -8.79793242e-003f, 0.09607988f, 0.01544222f,
-       -0.03923707f, 0.07308586f, 0.06061262f, 1.31683104e-004f,
-       -7.98222050e-003f, 0.02399261f, -0.06084389f, -0.02743429f,
-       -0.05475523f, -0.04131311f, 0.03559756f, 0.03055342f, 0.02981433f,
-       0.14860515f, 0.01766787f, 0.02945257f, 0.04898238f, 0.01026922f,
-       0.02811658f, 0.08267091f, 0.02732154f, -0.01237693f, 0.11760156f,
-       0.03802063f, -0.03309754f, 5.24957618e-003f, -0.02460510f, 0.02691451f,
-       0.05399988f, -0.10133506f, 0.06385437f, -0.01818005f, 0.02259503f,
-       0.03573135f, 0.01042848f, -0.04153402f, -0.04043029f, 0.01643575f,
-       0.08326677f, 4.61383024e-004f, -0.05308095f, -0.08536223f,
-       -1.61011645e-003f, -0.02163720f, -0.01783352f, 0.03859637f,
-       0.08498885f, -0.01725216f, 0.08625131f, 0.10995087f, 0.09177644f,
-       0.08498347f, 0.07646490f, 0.05580502f, 0.02693516f, 0.09996913f,
-       0.09070327f, 0.06667200f, 0.05873008f, -0.02247842f, 0.07772321f,
-       0.12408436f, 0.12629253f, -8.41997913e-004f, 0.01477783f, 0.09165990f,
-       -2.98401713e-003f, -0.06466447f, -0.07057302f, 2.09516948e-004f,
-       0.02210209f, -0.02158809f, -0.08602506f, -0.02284836f,
-       4.01876355e-003f, 9.56660323e-003f, -0.02073978f, -0.04635138f,
-       -7.59423291e-003f, -0.01377393f, -0.04559359f, -0.13284740f,
-       -0.08671406f, -0.03654395f, 0.01142869f, 0.03287891f, -0.04392983f,
-       0.06142959f, 0.17710890f, 0.10385257f, 0.01329137f, 0.10067633f,
-       0.12450829f, -0.04476709f, 0.09049144f, 0.04589312f, 0.11167907f,
-       0.08587538f, 0.04767583f, 1.67188141e-003f, 0.02359802f, -0.03808852f,
-       0.03126272f, -0.01919029f, -0.05698918f, -0.02365112f, -0.06519032f,
-       -0.05599358f, -0.07097308f, -0.03301812f, -0.04719102f, -0.02566297f,
-       0.01324074f, -0.09230672f, -0.05518232f, -0.04712864f, -0.03380903f,
-       -0.06719479f, 0.01183908f, -0.09326738f, 0.01642865f, 0.03789867f,
-       -6.61567831e-003f, 0.07796386f, 0.07246574f, 0.04706347f, -0.02523437f,
-       -0.01696830f, -0.08068866f, 0.06030888f, 0.10527060f, -0.06611756f,
-       0.02977346f, 0.02621830f, 0.01913855f, -0.08479366f, -0.06322418f,
-       -0.13570616f, -0.07644490f, 9.31900274e-003f, -0.08095149f,
-       -0.10197903f, -0.05204025f, 0.01413151f, -0.07800411f, -0.01885122f,
-       -0.07509381f, -0.10136326f, -0.05212355f, -0.09944065f,
-       -1.33606605e-003f, -0.06342617f, -0.04178550f, -0.12373723f,
-       -0.02832736f, -0.06057501f, 0.05830070f, 0.07604282f, -0.06462587f,
-       8.02447461e-003f, 0.11580125f, 0.12332212f, 0.01978462f,
-       -2.72378162e-003f, 0.05850752f, -0.04674481f, 0.05148062f,
-       -2.62542837e-003f, 0.11253355f, 0.09893716f, 0.09785093f, -0.04659257f,
-       -0.01102429f, -0.07002308f, 0.03088913f, -0.02565549f, -0.07671449f,
-       3.17443861e-003f, -0.10783514f, -0.02314270f, -0.11089555f,
-       -0.01024768f, 0.03116021f, -0.04964825f, 0.02281825f, 5.50005678e-003f,
-       -0.08427856f, -0.14685495f, -0.07719755f, -0.13342668f, -0.04525511f,
-       -0.09914210f, 0.02588859f, 0.03469279f, 0.04664020f, 0.11688190f,
-       0.09647275f, 0.10857815f, -0.01448726f, 0.04299758f, -0.06763151f,
-       1.33257592e-003f, 0.14331576f, 0.07574340f, 0.09166205f, 0.05674926f,
-       0.11325553f, -0.01106494f, 0.02062161f, -0.11484840f, -0.07492137f,
-       -0.02864293f, -0.01275638f, -0.06946032f, -0.10101652f, -0.04113498f,
-       -0.02214783f, -0.01273942f, -0.07480393f, -0.10556041f, -0.07622112f,
-       -0.09988393f, -0.11453961f, -0.12073903f, -0.09412795f, -0.07146588f,
-       -0.04054537f, -0.06127083f, 0.04221122f, 0.07688113f, 0.04099256f,
-       0.12663734f, 0.14683802f, 0.21761774f, 0.12525328f, 0.18431792f,
-       -1.66402373e-003f, 2.37777247e-003f, 0.01445475f, 0.03509416f,
-       0.02654697f, 0.01716739f, 0.05374011f, 0.02944174f, 0.11323927f,
-       -0.01485456f, -0.01611330f, -1.85554172e-003f, -0.01708549f,
-       -0.05435753f, -0.05302101f, 0.05260378f, -0.03582945f,
-       -3.42867890e-004f, 1.36076682e-003f, -0.04436073f, -0.04228432f,
-       0.03281291f, -0.05480836f, -0.10197772f, -0.07206279f, -0.10741059f,
-       -0.02366946f, 0.10278475f, -2.74783419e-003f, -0.03242477f,
-       0.02308955f, 0.02835869f, 0.10348799f, 0.19580358f, 0.10252027f,
-       0.08039929f, 0.05525554f, -0.13250865f, -0.14395352f, 3.13586881e-003f,
-       -0.03387071f, 8.94669443e-003f, 0.05406157f, -4.97324532e-003f,
-       -0.01189114f, 2.82919413e-004f, -0.03901557f, -0.04898705f,
-       0.02164520f, -0.01382906f, -0.01850416f, 0.01869347f, -0.02450060f,
-       0.02291678f, 0.08196463f, 0.03309153f, -0.10629974f, 0.02473924f,
-       0.05344394f, -0.02404823f, -0.03243643f, -5.55244600e-003f,
-       -0.08009996f, 0.02811539f, 0.04235742f, 0.01859004f, 0.04902123f,
-       -0.01438252f, -0.01526853f, 0.02044195f, -0.05008660f, 0.04244113f,
-       0.07611816f, 0.04950470f, -0.06020549f, -4.26026015e-003f, 0.13133512f,
-       -0.01438738f, -0.01958807f, -0.04044152f, -0.12425045f,
-       2.84353318e-003f, -0.05042776f, -0.09121484f, 7.34345755e-003f,
-       0.09388847f, 0.11800314f, 4.72295098e-003f, 4.44378285e-003f,
-       -0.07984917f, -0.03613737f, 0.04490915f, -0.02246483f, 0.04681071f,
-       0.05240871f, 0.02157206f, -0.04603431f, -0.01197929f, -0.02748779f,
-       0.13621049f, 0.08812155f, -0.07802048f, 4.86458559e-003f, -0.01598836f,
-       0.01024450f, -0.03463517f, -0.02304239f, -0.08692665f, 0.06655128f,
-       0.05785803f, -0.12640759f, 0.02307472f, 0.07337402f, 0.07525434f,
-       0.04943763f, -0.02241034f, -0.09978238f, 0.14487994f, -0.06570521f,
-       -0.07855482f, 0.02830222f, -5.29603509e-004f, -0.04669895f,
-       -0.11822784f, -0.12246452f, -0.15365660f, -0.02969127f, 0.08078201f,
-       0.13512598f, 0.11505685f, 0.04740673f, 0.01376022f, -0.05852978f,
-       -0.01537809f, -0.05541119f, 0.02491065f, -0.02870786f, 0.02760978f,
-       0.23836176f, 0.22347429f, 0.10306466f, -0.06919070f, -0.10132039f,
-       -0.20198342f, -0.05040560f, 0.27163076f, 0.36987007f, 0.34540465f,
-       0.29095781f, 0.05649706f, 0.04125737f, 0.07505883f, -0.02737836f,
-       -8.43431335e-003f, 0.07368195f, 0.01653876f, -0.09402955f,
-       -0.09574359f, 0.01474337f, -0.07128561f, -0.03460737f, 0.11438941f,
-       0.13752601f, -0.06385452f, -0.06310338f, 8.19548313e-003f, 0.11622470f,
-       5.05133113e-003f, -0.07602754f, 0.06695660f, 0.25723928f, 0.09037900f,
-       0.28826267f, 0.13165380f, -0.05312614f, -0.02137198f, -0.03442232f,
-       -0.06255679f, 0.03899667f, 0.18391028f, 0.26016650f, 0.03374462f,
-       0.01860465f, 0.19077586f, 0.18160543f, 3.43634398e-003f, -0.03036782f,
-       0.19683038f, 0.35378191f, 0.24968483f, -0.03222649f, 0.28972381f,
-       0.43091634f, 0.30778357f, 0.02335266f, -0.09877399f, -6.85245218e-003f,
-       0.08945240f, -0.08150686f, 0.02792493f, 0.24806842f, 0.17338486f,
-       0.06231801f, -0.10432383f, -0.16653322f, -0.13197899f, -0.08531576f,
-       -0.19271527f, -0.13536365f, 0.22240199f, 0.39219588f, 0.26597717f,
-       -0.01231649f, 0.01016179f, 0.13379875f, 0.12018334f, -0.04852953f,
-       -0.07915270f, 0.07036012f, 3.87723115e-003f, -0.06126805f,
-       -0.15015170f, -0.11406515f, -0.08556531f, -0.07429333f, -0.16115491f,
-       0.13214062f, 0.25691369f, 0.05697750f, 0.06861912f, -6.02903729e-003f,
-       -7.94562511e-003f, 0.04799571f, 0.06695165f, -0.01926842f, 0.06206308f,
-       0.13450983f, -0.06381495f, -2.98370165e-003f, -0.03482971f,
-       7.53991678e-003f, 0.03895611f, 0.11464261f, 0.01669971f,
-       8.27818643e-003f, -7.49160210e-003f, -0.11712562f, -0.10650621f,
-       -0.10353880f, -0.04994106f, -7.65618810e-004f, 0.03023767f,
-       -0.04759270f, -0.07302686f, -0.05825012f, -0.13156348f, -0.10639747f,
-       -0.19393684f, -0.09973683f, -0.07918908f, 4.63177625e-004f,
-       -6.61382044e-004f, 0.15853868f, 0.08561199f, -0.07660093f,
-       -0.08015265f, -0.06164073f, 0.01882577f, -7.29908410e-004f,
-       0.06840892f, 0.03843764f, 0.20274927f, 0.22028814f, -5.26101235e-003f,
-       0.01452435f, -0.06331623f, 0.02865064f, 0.05673740f, 0.12171564f,
-       0.03837196f, 0.03555467f, -0.02662914f, -0.10280123f, -0.06526285f,
-       -0.11066351f, -0.08988424f, -0.10103678f, 8.10526591e-003f,
-       5.95238712e-003f, 0.02617721f, -0.01705742f, -0.10897956f,
-       -0.08004991f, -0.11271993f, -0.06185647f, -0.06103712f, 0.01597041f,
-       -0.05923606f, 0.09410726f, 0.22858568f, 0.03263380f, 0.06772990f,
-       -0.09003516f, 0.01017870f, 0.01931688f, 0.08628357f, -0.01430009f,
-       0.10954945f, 0.16612452f, -0.02434544f, -0.03310068f, -0.04236627f,
-       0.01212392f, -6.15046406e-003f, 0.06954194f, 0.03015283f, 0.01787957f,
-       0.02781667f, -0.05561153f, -8.96244217e-003f, -0.04971489f,
-       0.07510284f, 0.01775282f, 0.05889897f, -0.07981427f, 0.03647643f,
-       -3.73833324e-003f, -0.08894575f, -0.06429435f, -0.08068276f,
-       0.03567704f, -0.07131936f, -7.21910037e-003f, -0.09566668f,
-       0.17886090f, 0.14911725f, 0.02070032f, -0.05017120f, -0.04992622f,
-       0.01570143f, -0.09906903f, 0.06456193f, 0.15329507f, 0.18820767f,
-       0.11689861f, -0.01178513f, -0.02225163f, -0.01905318f, 0.10271224f,
-       -7.27029052e-003f, 0.11664233f, 0.14796902f, 0.07771893f, 0.02400013f,
-       -0.05361797f, -0.01972888f, 0.01376177f, 0.06740040f, -0.06525395f,
-       0.05726178f, -0.02404981f, -0.14018567f, -0.02074987f, -0.04621970f,
-       -0.04688627f, -0.01842059f, 0.07722727f, -0.04852883f, 0.01529004f,
-       -0.19639495f, 0.10817073f, 0.03795860f, -0.09435206f, -0.07984378f,
-       -0.03383440f, 0.11081333f, 0.02237366f, 0.12703256f, 0.21613893f,
-       0.02918790f, 4.66472283e-003f, -0.10274266f, -0.04854131f,
-       -3.46305710e-003f, 0.08652268f, 0.02251546f, 0.09636052f, 0.17180754f,
-       -0.09272388f, 4.59174305e-004f, -0.11723048f, -0.12210111f,
-       -0.15547538f, 0.07218186f, -0.05297846f, 0.03779940f, 0.05150875f,
-       -0.03802310f, 0.03870645f, -0.15250699f, -0.08696499f, -0.02021560f,
-       0.04118926f, -0.15177974f, 0.01577647f, 0.10249301f, 7.50041893e-003f,
-       0.01721806f, -0.06828983f, -0.02397596f, -0.06598977f, -0.04317593f,
-       -0.08064980f, 6.66632550e-003f, 0.03333484f, 0.07093620f, 0.08231064f,
-       -0.06577903f, -0.06698844f, -0.06984019f, -0.06508023f, -0.14145090f,
-       -0.02393239f, 0.06485303f, 8.83263443e-003f, 0.09251080f, -0.07557579f,
-       -0.05067699f, -0.09798748f, -0.06703258f, -0.14056294f, 0.03245994f,
-       0.12554143f, 0.01761621f, 0.12980327f, -0.04081950f, -0.11906909f,
-       -0.14813015f, -0.08376863f, -0.12200681f, 0.04988137f, 0.05424247f,
-       -3.90952639e-003f, 0.03255733f, -0.12717837f, -0.07461493f,
-       -0.05703964f, -0.01736189f, -0.08026433f, -0.05433894f, -0.01719359f,
-       0.02886275f, 0.01772653f, -0.09163518f, 3.57789593e-003f, -0.10129993f,
-       -0.02653764f, -0.08131415f, -0.03847986f, -7.62157550e-004f,
-       0.06486648f, 0.19675669f, -0.04919156f, -0.07059129f, -0.04857785f,
-       -0.01042383f, -0.08328653f, 0.03660302f, -0.03696846f, 0.04969259f,
-       0.08241162f, -0.12514858f, -0.06122676f, -0.03750202f,
-       6.52989605e-003f, -0.10247213f, 0.02568346f, 4.51781414e-003f,
-       -0.03734229f, -0.01131264f, -0.05412074f, 8.89345480e-004f,
-       -0.12388977f, -0.05959237f, -0.12418608f, -0.06151643f, -0.07310260f,
-       0.02441575f, 0.07023528f, -0.07548289f, -7.57147965e-004f,
-       -0.09061348f, -0.08112976f, -0.06920306f, 9.54394229e-003f,
-       -0.01219902f, 1.21273217e-003f, -8.88989680e-003f, -0.08309301f,
-       -0.04552661f, -0.10739882f, -0.05691034f, -0.13928030f, 0.09027749f,
-       0.15123098f, 0.03175976f, 0.17763577f, 3.29913251e-004f, 0.05151888f,
-       -0.09844074f, -0.09475287f, -0.08571247f, 0.16241577f, 0.19336018f,
-       8.57454538e-003f, 0.11474732f, -0.01493934f, 0.03352379f, -0.08966240f,
-       -0.02322310f, 0.02663568f, 0.05448750f, -0.03536883f, -0.07210463f,
-       -0.06807277f, -0.03121621f, -0.05932408f, -0.17282860f, -0.15873498f,
-       -0.04956378f, 0.01603377f, -0.12385946f, 0.13878587f, 0.21468069f,
-       0.13510075f, 0.20992437f, 0.08845878f, 0.08104013f, 0.03754176f,
-       0.12173114f, 0.11103114f, 0.10643122f, 0.13941477f, 0.11640384f,
-       0.14786847f, 0.01218238f, 0.01160753f, 0.03547940f, 0.08794311f,
-       -0.01695384f, -0.07692261f, -0.08236158f, 6.79194089e-003f,
-       -0.02458403f, 0.13022894f, 0.10953187f, 0.09857773f, 0.04735930f,
-       -0.04353498f, -0.15173385f, -0.17904443f, -0.10450364f, -0.13418166f,
-       -0.06633098f, -0.03170381f, -0.06839000f, -0.11350126f, -0.06983913f,
-       0.19083543f, 0.17604128f, 0.07730632f, 0.10022651f, 0.36428109f,
-       0.28291923f, 0.12688625f, 0.15942036f, 0.14064661f, -0.11201853f,
-       -0.13969108f, -0.09088077f, -0.14107047f, 0.05117374f,
-       -2.63348082e-003f, -0.10794610f, -0.09715455f, -0.05284977f,
-       0.01565668f, 0.05031200f, 0.07021113f, -0.02963028f, 0.01766960f,
-       0.08333644f, -0.03211382f, 4.90096770e-003f, 0.05186674f, -0.05045737f,
-       -0.09624767f, -0.02525997f, 0.06916669f, 0.01213916f, 0.05333899f,
-       -0.03443280f, -0.10055527f, -0.06291115f, 5.42851724e-003f,
-       -6.30360236e-003f, 0.02270257f, -0.01769792f, 0.03273688f, 0.07746078f,
-       7.77099328e-003f, 0.05041346f, 0.01648103f, -0.02321534f, -0.09930186f,
-       -0.02293853f, 0.02034990f, -0.08324204f, 0.08510064f, -0.03732836f,
-       -0.06465405f, -0.06086946f, 0.13680504f, -0.11469388f, -0.03896406f,
-       -0.07142810f, 2.67581246e-003f, -0.03639632f, -0.09849060f,
-       -0.11014334f, 0.17489147f, 0.17610909f, -0.16091567f, -0.07248894f,
-       0.01567141f, 0.23742996f, 0.07552249f, -0.06270349f, -0.07303379f,
-       0.25442186f, 0.16903116f, -0.08168741f, -0.05913896f, -0.03954096f,
-       6.81776879e-003f, -0.05615319f, -0.07303037f, -0.12176382f,
-       0.12385108f, 0.22084464f, -0.05543206f, -0.03310431f, 0.05731593f,
-       0.19481890f, 0.04016430f, -0.06480758f, -0.12353460f, 0.18733442f,
-       -0.09631214f, -0.11192076f, 0.12404587f, 0.15671748f, 0.19256128f,
-       0.10895617f, 0.03391477f, -0.13032004f, -0.05626907f, -0.09025607f,
-       0.23485197f, 0.27812332f, 0.26725492f, 0.07255980f, 0.16565137f,
-       0.22388470f, 0.07441066f, -0.21003133f, -0.08075339f, -0.15031935f,
-       0.07023834f, 0.10872041f, 0.18156518f, 0.20037253f, 0.13571967f,
-       -0.11915682f, -0.11131983f, -0.18878011f, 0.06074620f, 0.20578890f,
-       0.12413109f, 0.03930207f, 0.29176015f, 0.29502738f, 0.27856228f,
-       -0.01803601f, 0.16646385f, 0.19268319f, 0.01900682f, 0.06026287f,
-       2.35868432e-003f, 0.01558199f, 0.02707230f, 0.11383014f, 0.12103992f,
-       0.03907350f, 0.04637353f, 0.09020995f, 0.11919726f, -3.63007211e-003f,
-       0.02220155f, 0.10336831f, 0.17351882f, 0.12259731f, 0.18983354f,
-       0.15736865f, 0.01160725f, -0.01690723f, -9.69582412e-004f, 0.07213813f,
-       0.01161613f, 0.17864859f, 0.24486147f, 0.18208991f, 0.20177495f,
-       0.05972528f, -8.93934630e-003f, -0.02316955f, 0.14436610f, 0.14114498f,
-       0.05520950f, 0.06353590f, -0.19124921f, 0.10174713f, 0.29414919f,
-       0.26448128f, 0.09344960f, 0.15284036f, 0.19797507f, 0.11369792f,
-       -0.12722753f, -0.21396367f, -0.02008235f, -0.06566695f, -0.01662150f,
-       -0.03937003f, 0.04778343f, 0.05017274f, -0.02299062f, -0.20208496f,
-       -0.06395898f, 0.13721776f, 0.22544557f, 0.14888357f, 0.08687132f,
-       0.27088094f, 0.32206613f, 0.09782200f, -0.18523243f, -0.17232181f,
-       -0.01041531f, 0.04008654f, 0.04199702f, -0.08081299f, -0.03755421f,
-       -0.04809646f, -0.05222081f, -0.21709201f, -0.06622940f, 0.02945281f,
-       -0.04600435f, -0.05256077f, -0.08432942f, 0.02848100f, 0.03490564f,
-       8.28621630e-003f, -0.11051246f, -0.11210597f, -0.01998289f,
-       -0.05369405f, -0.08869293f, -0.18799506f, -0.05436598f, -0.05011634f,
-       -0.05419716f, -0.06151857f, -0.10827805f, 0.04346735f, 0.04016083f,
-       0.01520820f, -0.12173316f, -0.04880285f, -0.01101406f, 0.03250847f,
-       -0.06009551f, -0.03082932f, -0.02295134f, -0.06856834f, -0.08775249f,
-       -0.23793389f, -0.09174541f, -0.05538322f, -0.04321031f, -0.11874759f,
-       -0.04221844f, -0.06070468f, 0.01194489f, 0.02608565f, -0.03892140f,
-       -0.01643151f, -0.02602034f, -0.01305472f, 0.03920100f, -0.06514261f,
-       0.01126918f, -6.27710763e-003f, -0.02720047f, -0.11133634f,
-       0.03300330f, 0.02398472f, 0.04079665f, -0.10564448f, 0.05966159f,
-       0.01195221f, -0.03179441f, -0.01692590f, -0.06177841f, 0.01841576f,
-       -5.51078189e-003f, -0.06821765f, -0.03191888f, -0.09545476f,
-       0.03030550f, -0.04896152f, -0.02914624f, -0.13283344f, -0.04783419f,
-       6.07836898e-003f, -0.01449538f, -0.13358212f, -0.09687774f,
-       -0.02813793f, 0.01213498f, 0.06650011f, -0.02039067f, 0.13356198f,
-       0.05986415f, -9.12760664e-003f, -0.18780160f, -0.11992817f,
-       -0.06342237f, 0.01229534f, 0.07143231f, 0.10713009f, 0.11085765f,
-       0.06569190f, -0.02956399f, -0.16288325f, -0.13993549f, -0.01292515f,
-       0.03833013f, 0.09130384f, -0.05086257f, 0.05617329f, -0.03896667f,
-       -0.06282311f, -0.11490010f, -0.14264110f, -0.04530499f, 0.01598189f,
-       0.09167797f, 0.08663294f, 0.04885277f, -0.05741219f, -0.07565769f,
-       -0.17136464f, -0.02619422f, -0.02477579f, 0.02679587f, 0.11621952f,
-       0.08788391f, 0.15520640f, 0.04709549f, 0.04504483f, -0.10214074f,
-       -0.12293372f, -0.04820546f, -0.05484834f, 0.05473754f, 0.07346445f,
-       0.05577277f, -0.08209965f, 0.03462975f, -0.20962234f, -0.09324598f,
-       3.79481679e-003f, 0.03617633f, 0.16742408f, 0.07058107f, 0.10204960f,
-       -0.06795346f, 3.22807301e-003f, -0.12589309f, -0.17496960f,
-       0.02078314f, -0.07694324f, 0.12184640f, 0.08997164f, 0.04793497f,
-       -0.11383379f, -0.08046359f, -0.25716835f, -0.08080962f,
-       6.80711539e-003f, -0.02930280f, -3.04938294e-003f, -0.11106286f,
-       -0.04628860f, -0.07821649f, 7.70127494e-003f, -0.10247706f,
-       1.21042714e-003f, 0.20573859f, -0.03241005f, 8.42972286e-003f,
-       0.01946464f, -0.01197973f, -0.14579976f, 0.04233614f,
-       -4.14096704e-003f, -0.06866436f, -0.02431862f, -0.13529138f,
-       1.25891645e-003f, -0.11425111f, -0.04303651f, -0.01694815f,
-       0.05720210f, -0.16040207f, 0.02772896f, 0.05498345f, -0.15010567f,
-       0.01450866f, 0.02350303f, -0.04301004f, -0.04951802f, 0.21702233f,
-       -0.03159155f, -0.01963303f, 0.18232647f, -0.03263875f,
-       -2.88476888e-003f, 0.01587562f, -1.94303901e-003f, -0.07789494f,
-       0.04674156f, -6.25576358e-003f, 0.08925962f, 0.21353747f, 0.01254677f,
-       -0.06999976f, -0.05931328f, -0.01884327f, -0.04306272f, 0.11794136f,
-       0.03842728f, -0.03907030f, 0.05636114f, -0.09766009f, -0.02104000f,
-       8.72711372e-003f, -0.02736877f, -0.05112274f, 0.16996814f, 0.02955785f,
-       0.02094014f, 0.08414304f, -0.03335762f, -0.03617457f, -0.05808248f,
-       -0.08872101f, 0.02927705f, 0.27077839f, 0.06075108f, 0.07478261f,
-       0.15282831f, -0.03908454f, -0.05101782f, -9.51998029e-003f,
-       -0.03272416f, -0.08735625f, 0.07633440f, -0.07185312f, 0.13841286f,
-       0.07812646f, -0.12901451f, -0.05488589f, -0.05644578f, -0.03290703f,
-       -0.11184757f, 0.03751570f, -0.05978153f, -0.09155276f, 0.05657315f,
-       -0.04328186f, -0.03047933f, -0.01413135f, -0.10181040f, -0.01384013f,
-       0.20132534f, -0.01536873f, -0.07641169f, 0.05906778f, -0.07833145f,
-       -0.01523801f, -0.07502609f, -0.09461885f, -0.15013233f, 0.16050665f,
-       0.09021381f, 0.08473236f, 0.03386267f, -0.09147339f, -0.09170618f,
-       -0.08498498f, -0.05119187f, -0.10431040f, 0.01041618f, -0.03064913f,
-       0.09340212f, 0.06448522f, -0.03881054f, -0.04985436f, -0.14794017f,
-       -0.05200112f, -0.02144495f, 0.04000821f, 0.12420804f, -0.01851651f,
-       -0.04116732f, -0.11951703f, -0.04879033f, -0.08722515f, -0.08454733f,
-       -0.10549165f, 0.11251976f, 0.10766345f, 0.19201984f, 0.06128913f,
-       -0.02734615f, -0.08834923f, -0.16999826f, -0.03548348f,
-       -5.36092324e-003f, 0.08297954f, 0.07226378f, 0.04194529f, 0.04668673f,
-       8.73902347e-003f, 0.06980139f, 0.05652480f, 0.05879445f, 0.02477076f,
-       0.02451423f, 0.12433673f, 0.05600227f, 0.06886370f, 0.03863076f,
-       0.07459056f, 0.02264139f, 0.01495469f, 0.06344220f, 0.06945208f,
-       0.02931899f, 0.11719371f, 0.04527427f, 0.03248192f, 2.08271481e-003f,
-       0.02044626f, 0.11403449f, 0.04303892f, 0.06444661f, 0.04959024f,
-       0.08174094f, 0.09240247f, 0.04894639f, 0.02252937f, -0.01652530f,
-       0.07587013f, 0.06064249f, 0.13954395f, 0.02772832f, 0.07093039f,
-       0.08501238f, 0.01701301f, 0.09055722f, 0.33421436f, 0.20163782f,
-       0.09821030f, 0.07951369f, 0.08695120f, -0.12757730f, -0.13865978f,
-       -0.06610068f, -0.10985506f, 0.03406816f, -0.01116336f, -0.07281768f,
-       -0.13525715f, -0.12844718f, 0.08956250f, 0.09171610f, 0.10092317f,
-       0.23385370f, 0.34489515f, 0.09901748f, 0.02002922f, 0.12335990f,
-       0.07606190f, -0.14899330f, -0.15634622f, -0.06494618f, -0.01760547f,
-       0.03404277f, -0.13208845f, -0.12101169f, -0.18294574f, -0.16560709f,
-       0.02183887f, -0.02752613f, 0.01813638f, 0.02000757f, 0.01319924f,
-       0.08030242f, 0.01220535f, 2.98233377e-003f, -0.01307070f, 0.05970297f,
-       -0.05345284f, -0.03381982f, -9.87543724e-003f, -0.06869387f,
-       0.03956730f, -0.03108176f, -0.05732809f, 0.02172386f, 0.04159765f,
-       2.62783933e-003f, 0.04813229f, 0.09358983f, -8.18389002e-003f,
-       0.01724574f, -0.02547474f, -0.04967288f, -0.02390376f, 0.06640504f,
-       -0.06306566f, 0.01137518f, 0.05589378f, -0.08237787f, 0.02455001f,
-       -0.03059422f, -0.08953978f, 0.06851497f, 0.07190268f, -0.07610799f,
-       7.87237938e-003f, -7.85830803e-003f, 0.06006952f, -0.01126728f,
-       -2.85743061e-003f, -0.04772895f, 0.01884944f, 0.15005857f,
-       -0.06268821f, -0.01989072f, 0.01138399f, 0.08760451f, 0.03879007f,
-       -9.66926850e-003f, -0.08012961f, 0.06414555f, -0.01362950f,
-       -0.09135523f, 0.01755159f, 0.04459474f, 0.09650917f, 0.05219948f,
-       -2.19440833e-003f, -0.07037939f, -0.01599054f, 0.13103317f,
-       -0.02492603f, -0.01032540f, -0.02903307f, 0.04489160f, 0.05148086f,
-       0.01858173f, -0.02919228f, 0.08299296f, -0.04590359f, -0.15745632f,
-       -0.09068198f, -0.02972453f, 0.12985018f, 0.22320485f, 0.24261914f,
-       0.03642650f, -0.05506422f, 2.67413049e-003f, -0.03834032f, 0.06449424f,
-       0.03834866f, 0.03816991f, 0.25039271f, 0.34212017f, 0.32433882f,
-       0.18824573f, -0.08599839f, -0.17599408f, -0.15317015f, -0.09913155f,
-       -0.02856072f, -0.05304699f, -1.06437842e-003f, -0.06641813f,
-       -0.07509298f, 0.01463361f, -0.07551918f, -0.04510373f,
-       -8.44620075e-003f, 0.01772176f, 0.04068235f, 0.20295307f, 0.15719447f,
-       0.05712103f, 0.26296997f, 0.14657754f, 0.01547317f, -0.05052776f,
-       -0.03881342f, -0.01437883f, -0.04930177f, 0.11719568f, 0.24098417f,
-       0.26468599f, 0.31698579f, 0.10103608f, -0.01096375f, -0.01367013f,
-       0.17104232f, 0.20065314f, 2.67622480e-003f, -0.01190034f, 0.18301608f,
-       0.09459770f, -0.06357619f, -0.06473801f, 0.01377906f, -0.10032775f,
-       -0.06388740f, 3.80393048e-003f, 0.06206078f, 0.10349120f, 0.26804337f,
-       8.17918684e-003f, -0.02314351f, 9.34422202e-003f, 0.09198381f,
-       0.03681326f, -8.77339672e-003f, -0.09662418f, -0.02715708f,
-       0.13503517f, 0.08962728f, -6.57071499e-003f, -0.03201199f, 0.28510824f,
-       0.32095715f, 0.18512695f, -0.14230858f, -0.14048551f, -0.07181299f,
-       -0.08575408f, -0.08661680f, -0.17416079f, 7.54326640e-004f,
-       0.05601677f, 0.13585392f, -0.04960437f, -0.07708392f, 0.10676333f,
-       -0.04407546f, -0.07209078f, 0.03663663f, 0.28949317f, 0.41127121f,
-       0.27431169f, -0.06900328f, -0.21474190f, -0.15578632f, -0.19555484f,
-       -0.15209621f, -0.11269179f, 0.07416003f, 0.18991330f, 0.26858172f,
-       0.01952259f, 0.01017922f, 0.02159843f, -4.95165400e-003f, -0.04368168f,
-       -0.12721671f, -0.06673957f, -0.11275250f, 0.04413409f, 0.05578312f,
-       0.03896771f, 0.03566417f, -0.05871816f, -0.07388090f, -0.17965563f,
-       -0.08570268f, -0.15273231f, -0.06022318f, -0.06999847f,
-       -6.81510568e-003f, 0.06294262f, -6.54901436e-004f, -0.01128654f,
-       -0.02289657f, 0.04849290f, 0.04140804f, 0.23681939f, 0.14545733f,
-       0.01989965f, 0.12032662f, 3.87463090e-003f, -6.02597650e-003f,
-       -0.05919775f, -0.03067224f, -0.07787777f, 0.10834727f, 0.02153730f,
-       0.02765649f, 0.03975543f, -0.12182906f, -0.04900113f, -0.09940100f,
-       -0.06453611f, -0.13757215f, -0.03721382f, 0.02827376f, -0.04351249f,
-       0.01907038f, -0.10284120f, -0.05671160f, -0.10760647f, -0.09624009f,
-       -0.09565596f, -0.01303654f, 0.03080539f, 0.01416511f, 0.05846142f,
-       -5.42971538e-003f, 0.06221476f, -0.03320325f, -0.06791797f,
-       -0.05791342f, 0.12851369f, 0.14990346f, 0.03634374f, 0.14262885f,
-       0.04330391f, 0.05032569f, -0.05631914f, 0.01606137f, 0.04387223f,
-       0.22344995f, 0.15722635f, -0.04693628f, 0.03006579f, -2.52882647e-003f,
-       0.05717621f, -0.07529724f, -0.02848588f, -0.06868757f,
-       -4.51729307e-003f, 0.06466042f, -0.05935378f, -0.04704857f,
-       -0.07363959f, 0.04843248f, -0.13421375f, -0.09789340f, -0.10255270f,
-       0.03509852f, 0.04751543f, -0.03822323f, 0.09740467f, 0.04762916f,
-       0.03940146f, -0.08283259f, 0.09552965f, 0.05038739f, 0.21258622f,
-       0.09646992f, 0.03241193f, 0.05167701f, 0.04614570f, 0.04330090f,
-       -0.02671840f, -0.06259909f, -0.02301898f, 0.18829170f, 0.10522786f,
-       0.04313190f, 0.01670948f, -0.08421925f, 0.05911417f, -0.10582602f,
-       -0.04855484f, -0.08373898f, 0.07775915f, 0.03723533f, -0.12047344f,
-       4.86345543e-003f, -0.10520902f, 0.06571782f, -0.07528137f,
-       -0.03245651f, -0.09869066f, -0.02917477f, -0.18293270f, 0.14810945f,
-       9.24033765e-003f, -0.04354914f, 0.02266885f, -0.11872729f,
-       -0.04016589f, 0.02830229f, 0.22539048f, 0.20565644f, 0.16701797f,
-       0.09019924f, 0.01300652f, 0.09760600f, -0.03675831f, -0.01935448f,
-       -0.06894835f, 0.08077277f, 0.19047537f, 0.11312226f, 0.04106043f,
-       -0.11187182f, 0.04312806f, -0.18548580f, -0.11287174f, -0.08794551f,
-       0.02078281f, -0.15295486f, 0.11806386f, -0.01103218f, -0.15971117f,
-       0.02153538f, -0.05232147f, -0.10835317f, -0.13910367f, 0.05920752f,
-       -0.10122602f, 0.20174250f, 0.09105796f, -0.01881348f, 0.09559010f,
-       -0.03725745f, -0.09442931f, -0.09763174f, 0.05854454f, 0.08287182f,
-       0.12919849f, 0.08594352f, -2.49806582e-003f, 0.02398440f,
-       5.67950122e-003f, -0.06296340f, -0.12993270f, 0.03855852f, 0.05186560f,
-       0.10839908f, -0.03380463f, -0.12654832f, -0.05399339f, -0.07456800f,
-       -0.04736232f, -0.10164231f, 0.07496139f, 0.08125214f, 0.07656177f,
-       -0.04999603f, -0.12823077f, -0.07692395f, -0.11317524f, -0.09118655f,
-       -0.05695669f, 0.10477209f, 0.07468581f, 0.01630048f, -8.00961629e-003f,
-       -0.06582128f, -0.04019095f, -0.04682907f, -0.01907842f, -0.10997720f,
-       0.04911406f, 0.02931030f, 0.04197735f, -0.05773980f, -0.09670641f,
-       -0.03594951f, -0.03402121f, -0.07149299f, -0.10566200f, 0.10601286f,
-       0.06340689f, -0.01518632f, -5.96402306e-003f, -0.07628012f,
-       -3.52779147e-003f, -0.02683854f, -0.10265494f, -0.02680815f,
-       0.16338381f, 0.03103515f, 0.02296976f, 0.01624348f, -0.10831620f,
-       -0.02314233f, -0.04789969f, -0.05530700f, -0.06461314f, 0.10494506f,
-       0.04642856f, -0.07592955f, -0.06197905f, -0.09042154f, -0.01445521f,
-       -0.04297818f, -0.11262015f, -0.11430512f, 0.03174541f, -0.03677487f,
-       -0.02963996f, -0.06610169f, -0.13292049f, -0.07059067f, -0.08444111f,
-       -0.02640536f, -0.07136250f, 0.04559967f, 0.01459980f, 0.17989251f,
-       0.04435328f, -0.12464730f, -0.02871115f, -0.10752209f, -0.03393742f,
-       -0.03791408f, 0.02548251f, 0.01956050f, 0.19245651f, 0.13963254f,
-       -0.05904696f, -0.07424626f, -0.10411884f, 1.54176133e-003f,
-       0.01797429f, 0.13025844f, 0.04547642f, -0.05710349f, -0.10697161f,
-       -0.13489437f, -0.06515755f, -0.06406886f, -4.08572936e-003f,
-       -0.01336483f, 0.04368737f, -0.11259720f, -0.05701635f, -0.06469971f,
-       -0.08346602f, -0.04166770f, -0.05795543f, -0.08247511f, -0.05742628f,
-       0.08452254f, -0.03350224f, 0.13980860f, 0.13252275f, 0.07589617f,
-       0.07539988f, 0.12155797f, 0.19087289f, 0.15050751f, 0.21250245f,
-       0.14206800f, 0.01298489f, 0.07450245f, 0.06559097f, 0.01700557f,
-       0.04512971f, 0.16950700f, 0.10261577f, 0.16389982f, 0.05505059f,
-       -0.03453077f, 0.08622462f, 0.07935954f, 0.03976260f, 0.02036091f,
-       3.95744899e-003f, 0.03267065f, 0.15235919f, 0.01297494f, -0.08109194f,
-       0.01407558f, 4.40693414e-003f, -0.15157418f, -0.11390478f,
-       -0.07487597f, -7.81322457e-003f, -0.02749545f, -0.10181408f,
-       0.13755716f, 0.14007211f, 0.13482562f, 0.27517235f, 0.34251109f,
-       0.07639657f, 0.07268607f, 0.19823882f, 0.16135791f, -0.04186463f,
-       -0.12784107f, -0.09846287f, 0.03169041f, 0.10974082f, -0.15051922f,
-       -0.08916726f, -0.07138767f, -0.04153349f, 6.25418453e-003f,
-       0.01266654f, 0.10533249f, 0.12749144f, 0.15148053f, 0.01498513f,
-       0.06305949f, -0.01247123f, -0.08778401f, -0.08551880f, -0.11955146f,
-       -0.08493572f, -0.02901620f, -0.02394859f, -0.13427313f, -0.11053200f,
-       -0.14413260f, -0.15203285f, 0.03972760f, -3.72127310e-004f,
-       -0.04200919f, 0.06105104f, 0.01904975f, -0.01106191f,
-       -7.27445772e-003f, -0.01520341f, 1.10228511e-003f, -0.04949187f,
-       -0.08013099f, 5.72071038e-003f, 0.08415454f, -0.06523152f, 0.03664081f,
-       -0.02673042f, -0.12066154f, -0.03702074f, 0.06006580f, 0.01628682f,
-       -6.17772620e-003f, 0.08192339f, -3.41629819e-003f, 0.02870512f,
-       0.05807141f, 0.04959986f, 0.04618251f, -0.04901629f, -0.10579574f,
-       0.02274442f, 0.12070961f, 2.23597488e-003f, 0.09831765f, -0.03019848f,
-       -0.11181970f, -0.04961075f, 0.02498928f, -0.03714991f, -0.01619653f,
-       0.02643486f, -7.62964319e-003f, -0.02882290f, -0.06242594f,
-       -0.08439861f, 0.07220893f, 0.07263952f, 0.01561574f, 0.03091968f,
-       0.01708712f, -0.03797151f, -3.18561122e-003f, 0.01624021f,
-       -0.02828573f, 0.11284444f, -1.32280716e-003f, -0.07784860f,
-       -0.07209100f, 0.03372242f, 0.12154529f, 0.02278104f, -0.05275500f,
-       -0.01918484f, 0.12989293f, 0.05424401f, 0.02333086f, 0.04029022f,
-       0.12392918f, 0.09495489f, 0.09190340f, 0.07935889f, 8.76816828e-003f,
-       0.17148446f, -8.51302687e-003f, -0.08011249f, -0.06796283f,
-       0.04884845f, 0.01112272f, -0.07835306f, -1.14811445e-003f,
-       -0.03440760f, 0.02845243f, 0.07695542f, -0.07069533f, -0.01151784f,
-       -8.53884313e-003f, -0.01662786f, -0.04163864f, 0.05400505f,
-       0.02859163f, 0.02921852f, 0.05003135f, -6.85718050e-003f, -0.01632611f,
-       0.07780217f, 0.04042810f, -0.01216440f, 3.60914599e-003f, -0.06322435f,
-       0.09516726f, 0.12877031f, -9.69162490e-003f, 0.01031179f, 0.05180895f,
-       -9.34659224e-003f, -0.01644533f, -0.04849347f, -0.04343236f,
-       0.10514783f, 0.08046635f, -0.04615205f, -0.03975486f, -0.01485525f,
-       0.13096830f, -0.01517950f, -0.06571898f, -0.04016372f, 0.01849786f,
-       0.02439670f, 0.08067258f, 1.74824719e-003f, 0.07053747f, 0.08819518f,
-       -5.08352555e-003f, -0.06550863f, -0.08266170f, -0.07780605f,
-       0.01453450f, -0.08756890f, 0.01096501f, -8.71319138e-003f, 0.10110464f,
-       0.02420769f, -0.06708383f, 0.02007811f, 5.93133038e-003f, 0.05398923f,
-       0.07538138f, 0.02049227f, 0.02242589f, 0.04011070f, -1.44875818e-003f,
-       -4.19115182e-003f, 0.06367654f, 0.02506934f, 0.02434536f, 0.05879405f,
-       -8.22952855e-003f, -0.01242441f, 0.04224926f, -0.01754923f,
-       0.05958161f, 0.03818886f, -0.01830363f, -0.04308917f, -0.04422197f,
-       -0.02432721f, 0.02264866f, 2.03751423e-003f, 0.01197031f, 0.04439203f,
-       0.12169247f, 0.03602713f, -0.02599251f, -1.98226492e-003f, 0.02046336f,
-       -0.02639058f, -1.91242550e-003f, -0.09334669f, -0.03595153f,
-       -9.88179818e-003f, -0.06848445f, -0.04666303f, -0.09955736f,
-       -0.04206430f, 0.02609075f, 9.09005292e-003f, -0.07138551f,
-       -4.22313227e-004f, 0.01766645f, 0.02756404f, 0.01308276f, 0.04052891f,
-       0.02387515f, 0.05337298f, 0.02500631f, -0.04970853f, -0.12467445f,
-       0.17604403f, 0.12256411f, -0.07512254f, 8.70451052e-003f, -0.05697548f,
-       -0.03626474f, -8.76623299e-003f, -0.01210897f, -0.09451522f,
-       0.07490732f, -0.02008001f, -0.02681278f, -0.06463405f, -0.01517507f,
-       7.33757764e-003f, 6.07147906e-003f, -0.09316964f, -0.04575328f,
-       0.13261597f, 0.15424870f, -0.01655918f, -0.02772390f, -0.05243644f,
-       -0.02356456f, -0.02351753f, -0.10211615f, -0.12873036f, 0.14549787f,
-       0.12519856f, 4.38762689e-003f, 0.02795992f, 0.05170322f, 0.09223596f,
-       0.05890015f, 0.02376701f, -0.02777346f, 0.09506908f, 0.02328936f,
-       -0.02319928f, -0.03218696f, -0.01527841f, -0.01016694f, -0.02674719f,
-       0.05137179f, 0.01980666f, 0.06544447f, -0.01746171f, 0.01026380f,
-       0.01561806f, 7.97004555e-004f, 0.07601810f, 0.01907250f, -0.03083035f,
-       -0.05987392f, 0.09242783f, 0.14555025f, 0.01035827f, 0.03092401f,
-       -0.09562709f, -0.03802354f, 0.02531144f, 0.03079449f, -0.07100715f,
-       0.03330721f, -2.69116857e-003f, 0.03167490f, 0.05744999f, 0.03259895f,
-       1.91266940e-003f, 0.03194578f, 0.07389776f, 0.02198060f, 0.07633314f,
-       0.03293105f, -0.09103648f, 0.04718142f, 0.06102672f, -0.01003063f,
-       5.85481385e-003f, -0.01522574f, 0.02323526f, 0.10584345f,
-       4.35879454e-003f, 0.06107873f, 0.05868603f, -0.03115531f, 0.01214679f,
-       0.08567052f, 3.93926632e-003f, -0.02521488f, -1.88425183e-003f,
-       0.02038053f, -6.26854831e-004f, 0.04897438f, -0.04280585f,
-       -0.04819689f, -0.04812867f, -0.01451186f, 0.05101469f,
-       -9.01125465e-003f, -0.03333859f, 0.03917955f, 0.04196448f, 0.04292135f,
-       0.02809529f, 0.02999715f, 0.04081348f, 9.10039060e-003f, 0.09703232f,
-       0.10379741f, 0.02348725f, -4.72756615e-003f, 0.01027325f, 0.10402658f,
-       0.12071823f, 0.09817299f, -0.02612033f, 0.03638414f, 0.05896405f,
-       0.04865025f, 0.04793910f, -0.03882321f, -0.02962117f, -0.01222268f,
-       0.04071597f, 0.01922777f, -0.02287866f, 0.03328381f, 0.01859092f,
-       0.09024994f, 0.03804455f, -0.01424510f, 0.01953739f, 0.02509617f,
-       -0.03390914f, -0.05663941f, -0.01641979f, 0.05848591f, 0.04639670f,
-       0.02092116f, 0.12911791f, 0.19918139f, 0.07739855f, -7.25806039e-003f,
-       0.04074838f, 0.03183993f, 1.39251316e-003f, -0.01428625f, 0.01865480f,
-       0.08529541f, 0.13547510f, 0.11189661f, 0.03998901f, 0.09575938f,
-       -0.02631102f, -0.03458253f, -0.04749985f, -0.06070716f,
-       4.71884012e-003f, 0.06445789f, -0.02450038f, -0.05483776f,
-       -0.04657237f, -0.02030717f, -0.03480766f, -0.09397731f, -0.06399718f,
-       -0.01804585f, 5.62348310e-003f, -6.64811488e-003f, -0.06517869f,
-       6.96210237e-003f, -0.01860148f, -0.04245830f, -0.05850367f,
-       -3.24417115e-003f, 0.07700698f, 0.11290991f, 0.09923030f, -0.02970599f,
-       0.05592411f, 0.04813979f, -0.09811195f, -0.09357996f, -0.03276114f,
-       0.05218338f, 0.04141375f, 3.92977800e-003f, -0.05047480f, 0.15960084f,
-       0.04612800f, -0.03114098f, -0.04650044f, -0.03249795f, -0.02425641f,
-       -0.04311355f, 0.04307659f, -0.09401883f, -0.04742785f, -0.01254499f,
-       -0.06598741f, 3.41369561e-003f, -0.05620445f, -7.28127593e-003f,
-       -0.05998361f, -0.03274450f, -0.07376868f, 3.19015374e-003f,
-       -0.07733069f, 0.05815864f, -0.02471071f, 0.03850617f, 0.13838784f,
-       0.15399861f, 0.01731321f, -0.01477586f, 0.10393341f, 0.05159833f,
-       -0.01945555f, -0.03427503f, -0.04867341f, 0.09237480f, 0.10732719f,
-       0.06071450f, -0.01355071f, 0.01844356f, -0.03480803f, -0.03796671f,
-       2.15628621e-004f, -0.05440186f, 0.01889855f, -0.01443413f,
-       -0.02607902f, -0.02938001f, 0.02720689f, -0.06228397f, -0.02970936f,
-       -0.03426210f, -0.10280876f, -0.06739304f, -0.05227850f, 0.03360292f,
-       -0.11278441f, -0.06966180f, -0.13937433f, 9.10932291e-003f,
-       2.52020749e-004f, -4.07359656e-003f, 0.12310639f, 0.09343060f,
-       0.07302511f, 0.03222093f, 0.07532879f, 0.03792387f, -0.04985180f,
-       0.01804602f, 0.02694195f, 0.13481498f, 0.04601225f, 0.04106982f,
-       0.08511057f, 0.12314661f, 0.01320830f, 0.05044121f, -5.52943908e-003f,
-       -0.08992624f, -0.02249301f, -0.08181777f, 0.06165213f, -0.03256603f,
-       -0.01068920f, -0.01323473f, -0.11970232f, -0.04616347f, -0.12088681f,
-       -0.06762606f, -0.08676834f, -0.06434575f, 0.01772529f, 0.03469615f,
-       -0.10926618f, 0.03013873f, 0.14030397f, 0.16130108f, 0.17985588f,
-       0.11281928f, 0.10530639f, 0.08905948f, 0.07733764f, 0.06695238f,
-       0.02142088f, 0.06438877f, 0.09794453f, 0.05745072f, 0.02788557f,
-       0.02632830f, 0.07985807f, 4.24902979e-003f, 8.47890321e-003f,
-       -0.02679466f, -5.28812688e-003f, -0.02162580f, -0.07490715f,
-       -0.08251337f, -0.02056576f, -0.01026194f, -1.15492963e-003f,
-       -5.75720915e-004f, -0.07210591f, -0.07320981f, -0.04883312f,
-       -0.10897151f, -0.07477258f, -0.08867134f, -0.09222437f, -0.10924666f,
-       -0.10430276f, 0.07953499f, 0.02767959f, 0.11393359f, 0.18779543f,
-       0.03313421f, 0.02143700f, 0.05852016f, -2.12067598e-003f,
-       -3.76984011e-003f, 0.02774167f, -0.03124610f, 0.01465141f, 0.01616004f,
-       -0.01391913f, -0.04404102f, -0.05444227f, -0.14684731f, -0.15016587f,
-       0.04509468f, 1.29563001e-003f, 0.01398350f, 0.05610404f, -0.04868806f,
-       -0.04776716f, -8.16873740e-003f, -2.30126386e-003f, -0.02286313f,
-       0.11983398f, -0.04703261f, -0.08814441f, -0.07585249f, -0.10799607f,
-       -0.03232087f, 0.01509786f, -0.04843464f, -0.03967846f, 0.09589416f,
-       0.01352560f, -0.01458119f, 0.01050829f, -0.03038946f, 0.01608388f,
-       1.11975556e-003f, -0.01250656f, 2.86211423e-003f, 0.04333691f,
-       -0.14603497f, -0.01946543f, -0.02327525f, -0.01973944f, 0.07944400f,
-       -0.02224544f, -0.06701808f, 0.03476532f, 0.11505594f, -0.02712801f,
-       -0.01665113f, 0.06315716f, -0.08205860f, 0.07431999f, 0.04915778f,
-       -0.04468752f, -0.01490402f, 0.07400476f, -0.11650901f, 0.05102430f,
-       0.04559118f, -0.05916039f, 0.08840760f, -0.01587902f, -0.14890194f,
-       0.07857784f, 0.04710254f, -0.05381983f, -0.07331945f, -0.03604643f,
-       0.15611970f, 0.07649943f, -0.05959348f, -0.02776607f, 0.11098688f,
-       0.03758875f, -0.04446875f, 0.04933187f, 0.01345535f, 0.06921103f,
-       0.07364785f, 0.05518956f, 0.02899585f, 0.09375840f, 0.10518434f,
-       -0.04420241f, 0.01915282f, -3.56386811e-003f, 0.14586878f, 0.10286101f,
-       -0.04360626f, -0.12723237f, 0.09076386f, 0.11119842f, -0.06035013f,
-       0.09674817f, 0.08938243f, 0.07065924f, 0.02603180f, 5.84815582e-003f,
-       -0.05922065f, 0.12360309f, 3.59695964e-003f, 2.99844006e-003f,
-       0.03697936f, 0.02043072f, 0.04168725f, 0.01025975f, -0.01359980f,
-       -0.01600920f, 0.02581056f, 0.02329250f, 2.98100687e-003f, 0.01629762f,
-       0.06652115f, 0.05855627f, 0.01237463f, -0.01297135f, 0.01761587f,
-       0.05090865f, 0.06549342f, -0.04425945f, 2.43203156e-003f,
-       3.07327788e-003f, 0.06678630f, -0.04303836f, 0.01082393f, -0.06476044f,
-       0.04077786f, 0.12441979f, 0.08237778f, 0.07424165f, 0.04065890f,
-       0.06905543f, 0.09556347f, 0.12724875f, -0.02132082f, 0.08514154f,
-       -0.04175328f, -0.02666954f, 0.01897836f, 0.03317382f, 9.45465732e-003f,
-       -0.01238974f, -0.04242500f, -0.01419479f, -0.03545213f, -0.02440874f,
-       0.08684119f, 0.04212951f, 0.02462858f, -0.01104825f, -5.01706870e-003f,
-       0.02968982f, 0.02597476f, -0.01568939f, 0.04514892f, 0.06974549f,
-       0.08670278f, 0.06828108f, 0.10238872f, 0.05405957f, 0.06548470f,
-       -0.03763957f, 0.01366090f, 0.07069602f, 0.05363748f, 0.04798120f,
-       0.11706422f, 0.05466456f, -0.01869259f, 0.06344382f, 0.03106543f,
-       0.08432506f, -0.02061096f, 0.03821088f, -6.92190882e-003f,
-       6.40467042e-003f, -0.01271779f, 6.89014705e-005f, 0.04541415f,
-       -0.01899539f, -0.05020239f, 0.03000903f, 0.01090422f, 4.52452758e-003f,
-       0.02573632f, -0.02388454f, -0.04200457f, 1.72783900e-003f,
-       -0.05978370f, -0.02720562f, 0.06573715f, 0.01154317f, 0.01265615f,
-       0.07375994f, -9.19828378e-003f, -0.04914120f, 0.02124831f, 0.06455322f,
-       0.04372910f, -0.03310043f, 0.03605788f, -6.78055827e-003f,
-       9.36202332e-003f, 0.01747596f, -0.06406314f, -0.06812935f, 0.08080816f,
-       -0.02778088f, 0.02735260f, 0.06393493f, 0.06652229f, 0.05676993f,
-       0.08640018f, -7.59188086e-003f, -0.02012847f, -0.04741159f,
-       -0.01657069f, -0.01624399f, 0.05547778f, -2.33309763e-003f,
-       0.01120033f, 0.06141156f, -0.06285004f, -0.08732341f, -0.09313398f,
-       -0.04267832f, 5.57443965e-003f, 0.04809862f, 0.01773641f,
-       5.37361018e-003f, 0.14842421f, -0.06298012f, -0.02935147f, 0.11443478f,
-       -0.05034208f, 5.65494271e-003f, 0.02076526f, -0.04577984f,
-       -0.04735741f, 0.02961071f, -0.09307127f, -0.04417921f, -0.04990027f,
-       -0.03940028f, 0.01306016f, 0.06267900f, 0.03758737f, 0.08460117f,
-       0.13858789f, 0.04862388f, -0.06319809f, -0.05655516f, 0.01885816f,
-       -0.03285607f, 0.03371567f, -0.07040928f, -0.04514049f, 0.01392166f,
-       0.08184422f, -0.07230316f, 0.02386871f, 0.02184591f, 0.02605764f,
-       -0.01033954f, 9.29878280e-003f, 7.67351175e-003f, 0.15189242f,
-       0.02069071f, -0.09738296f, -0.08894105f, -0.07768748f, 0.02332268f,
-       -0.01778995f, -0.03258888f, -0.08180822f, -0.08492987f, 0.02290156f,
-       -0.11368170f, -0.03554465f, -0.04533844f, -0.02861580f, 0.06782424f,
-       0.01113123f, 0.02453644f, 0.12721945f, 0.08084814f, -0.03607795f,
-       0.01109122f, 0.04803548f, -0.03489929f, 0.03399536f, -0.05682014f,
-       8.59533902e-003f, -4.27904585e-003f, 0.03230887f, -0.01300198f,
-       -0.01038137f, -0.07930113f, 8.33097473e-003f, 0.02296994f,
-       -0.01306500f, -0.01881626f, 0.04413369f, 0.05729880f, -0.03761553f,
-       0.01942326f, 1.64540811e-003f, -0.03811319f, 0.04190650f, -0.14978096f,
-       -0.04514487f, 0.01209545f, -5.46460645e-003f, -0.01647195f,
-       7.63064111e-003f, -0.07494587f, 0.08415288f, 0.10020141f, -0.01228561f,
-       0.06553826f, 0.04554005f, 0.07890417f, 0.03041138f, 0.01752007f,
-       0.09208256f, -3.74419295e-004f, 0.10549527f, 0.04686913f, 0.01894833f,
-       -0.02651412f, -4.34682379e-003f, 5.44942822e-003f, 0.01444484f,
-       0.05882156f, -0.03336544f, 0.04603891f, -0.10432546f, 0.01923928f,
-       0.01842845f, -0.01712168f, -0.02222766f, 0.04693324f, -0.06202956f,
-       -0.01422159f, 0.08732220f, -0.07706107f, 0.02661049f, -0.04300238f,
-       -0.03092422f, -0.03552184f, -0.01886088f, -0.04979934f, 0.03906401f,
-       0.04608644f, 0.04966111f, 0.04275464f, -0.04621769f, -0.02653212f,
-       8.57011229e-003f, 0.03839684f, 0.05818764f, 0.03880796f,
-       -2.76100676e-004f, 0.03076511f, -0.03266929f, -0.05374557f,
-       0.04986527f, -9.45429131e-003f, 0.03582499f, -2.64564669e-003f,
-       -1.07461517e-003f, 0.02962313f, -0.01483363f, 0.03060869f, 0.02448327f,
-       0.01845641f, 0.03282966f, -0.03534438f, -0.01084059f, -0.01119136f,
-       -1.85360224e-003f, -5.94652840e-004f, -0.04451817f, 2.98327743e-003f,
-       0.06272484f, -0.02152076f, -3.05971340e-003f, -0.05070828f,
-       0.01531762f, 0.01282815f, 0.05167150f, 9.46266949e-003f,
-       -3.34558333e-003f, 0.11442288f, -0.03906701f, -2.67325155e-003f,
-       0.03069184f, -0.01134165f, 0.02949462f, 0.02879886f, 0.03855566f,
-       -0.03450781f, 0.09142872f, -0.02156654f, 0.06075062f, -0.06220816f,
-       0.01944680f, 6.68372354e-003f, -0.06656796f, 8.70784000e-003f,
-       0.03456013f, 0.02434320f, -0.13236357f, -0.04177035f, -0.02069627f,
-       0.01068112f, 0.01505432f, -0.07517391f, -3.83571628e-003f,
-       -0.06298508f, -0.02881260f, -0.13101046f, -0.07221562f,
-       -5.79945277e-003f, -8.57300125e-003f, 0.03782469f, 0.02762164f,
-       0.04942456f, -0.02936396f, 0.09597211f, 0.01921411f, 0.06101191f,
-       -0.04787507f, -0.01379578f, -7.40224449e-003f, -0.02220136f,
-       -0.01313756f, 7.77558051e-003f, 0.12296968f, 0.02939998f, 0.03594062f,
-       -0.07788624f, -0.01133144f, 3.99316690e-004f, -0.06090347f,
-       -0.01122066f, -4.68682544e-003f, 0.07633100f, -0.06748922f,
-       -0.05640298f, -0.05265681f, -0.01139122f, -0.01624347f, -0.04715714f,
-       -0.01099092f, 0.01048561f, 3.28499987e-003f, -0.05810167f,
-       -0.07699911f, -0.03330683f, 0.04185145f, 0.03478536f, 0.02275165f,
-       0.02304766f, 6.66040834e-003f, 0.10968148f, -5.93013782e-003f,
-       -0.04858336f, -0.04203213f, -0.09316786f, -6.13074889e-003f,
-       -0.02544625f, 0.01366201f, 9.18555818e-003f, -0.01846578f,
-       -0.05622401f, -0.03989377f, -0.07810296f, 6.91275718e-003f,
-       0.05957597f, -0.03901334f, 0.01572002f, -0.01193903f,
-       -6.89400872e-003f, -0.03093356f, -0.04136098f, -0.01562869f,
-       -0.04604580f, 0.02865234f, -0.08678447f, -0.03232484f, -0.05364593f,
-       -0.01445016f, -0.07003860f, -0.08669746f, -0.04520775f, 0.04274122f,
-       0.03117515f, 0.08175703f, 0.01081109f, 0.06379741f, 0.06199206f,
-       0.02865988f, 0.02360346f, 0.06725410f, -0.03248780f, -9.37702879e-003f,
-       0.08265898f, -0.02245839f, 0.05125763f, -0.01862395f, 0.01973453f,
-       -0.01994494f, -0.10770868f, 0.03180375f, 3.23935156e-003f,
-       -0.02142080f, -0.04256190f, 0.04760900f, 0.04282863f, 0.05635953f,
-       -0.01870849f, 0.05540622f, -0.03042666f, 0.01455277f, -0.06630179f,
-       -0.05843807f, -0.03739681f, -0.09739155f, -0.03220233f, -0.05620182f,
-       -0.10381401f, 0.07400211f, 4.20676917e-003f, 0.03258535f,
-       2.14308966e-003f, 0.05121966f, -0.01274337f, 0.02384761f, 0.06335578f,
-       -0.07905591f, 0.08375625f, -0.07898903f, -0.06508528f, -0.02498444f,
-       0.06535810f, 0.03970535f, 0.04895468f, -0.01169566f, -0.03980601f,
-       0.05682293f, 0.05925463f, -0.01165808f, -0.07936699f, -0.04208954f,
-       0.01333987f, 0.09051196f, 0.10098671f, -0.03974256f, 0.01238771f,
-       -0.07501741f, -0.03655440f, -0.04301528f, 0.09216860f,
-       4.63579083e-004f, 0.02851115f, 0.02142735f, 1.28244064e-004f,
-       0.02879687f, -0.08554889f, -0.04838862f, 0.08135369f, -0.05756533f,
-       0.01413900f, 0.03451880f, -0.06619488f, -0.03053130f, 0.02961676f,
-       -0.07384635f, 0.01135692f, 0.05283910f, -0.07778034f, -0.02107482f,
-       -0.05511716f, -0.13473752f, 0.03030157f, 0.06722020f, -0.06218817f,
-       -0.05826827f, 0.06254654f, 0.02895772f, -0.01664000f, -0.03620280f,
-       -0.01612278f, -1.46097376e-003f, 0.14013411f, -8.96181818e-003f,
-       -0.03250246f, 3.38630192e-003f, 2.64779478e-003f, 0.03359732f,
-       -0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f };
-    return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
-}
-
-#endif
diff --git a/modules/cuda/src/precomp.hpp b/modules/cuda/src/precomp.hpp
index 60c71b52bd..7feeadddc1 100644
--- a/modules/cuda/src/precomp.hpp
+++ b/modules/cuda/src/precomp.hpp
@@ -47,7 +47,6 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudawarping.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"
 
 #include "opencv2/core/private.cuda.hpp"
 #include "opencv2/core/utility.hpp"
diff --git a/modules/cuda/test/test_precomp.hpp b/modules/cuda/test/test_precomp.hpp
index a0abfd2285..e3b33017a7 100644
--- a/modules/cuda/test/test_precomp.hpp
+++ b/modules/cuda/test/test_precomp.hpp
@@ -60,7 +60,6 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/opengl.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"
 
 #include "cvconfig.h"
 
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 98ebfbef88..6e475db985 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -130,12 +130,6 @@ This function, in contrast to divide, uses a round-down rounding mode.
  */
 CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted reciprocal of an array (dst = scale/src2)
-static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
-{
-    divide(src1, src2, dst, 1.0, dtype, stream);
-}
-
 /** @brief Computes per-element absolute difference of two matrices (or of a matrix and scalar).
 
 @param src1 First source matrix or scalar.
@@ -530,116 +524,53 @@ CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bot
 @param src1 Source matrix. Any matrices except 64F are supported.
 @param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now.
 @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 @sa norm
  */
-CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
-/** @overload
-uses new buffer, no mask
-*/
-static inline double norm(InputArray src, int normType)
-{
-    GpuMat buf;
-    return norm(src, normType, GpuMat(), buf);
-}
-/** @overload
-no mask
-*/
-static inline double norm(InputArray src, int normType, GpuMat& buf)
-{
-    return norm(src, normType, GpuMat(), buf);
-}
+CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void calcNorm(InputArray src, OutputArray dst, int normType, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Returns the difference of two matrices.
 
 @param src1 Source matrix. Any matrices except 64F are supported.
 @param src2 Second source matrix (if any) with the same size and type as src1.
 @param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 @sa norm
  */
-CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
-/** @overload
-uses new buffer
-*/
-static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
-{
-    GpuMat buf;
-    return norm(src1, src2, buf, normType);
-}
+CV_EXPORTS double norm(InputArray src1, InputArray src2, int normType=NORM_L2);
+/** @overload */
+CV_EXPORTS void calcNormDiff(InputArray src1, InputArray src2, OutputArray dst, int normType=NORM_L2, Stream& stream = Stream::Null());
 
 /** @brief Returns the sum of matrix elements.
 
 @param src Source image of any depth except for CV_64F .
 @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 @sa sum
  */
-CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
-/** @overload
-uses new buffer, no mask
-*/
-static inline Scalar sum(InputArray src)
-{
-    GpuMat buf;
-    return sum(src, GpuMat(), buf);
-}
-/** @overload
-no mask
-*/
-static inline Scalar sum(InputArray src, GpuMat& buf)
-{
-    return sum(src, GpuMat(), buf);
-}
+CV_EXPORTS Scalar sum(InputArray src, InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void calcSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Returns the sum of absolute values for matrix elements.
 
 @param src Source image of any depth except for CV_64F .
 @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
  */
-CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
-/** @overload
-uses new buffer, no mask
-*/
-static inline Scalar absSum(InputArray src)
-{
-    GpuMat buf;
-    return absSum(src, GpuMat(), buf);
-}
-/** @overload
-no mask
-*/
-static inline Scalar absSum(InputArray src, GpuMat& buf)
-{
-    return absSum(src, GpuMat(), buf);
-}
+CV_EXPORTS Scalar absSum(InputArray src, InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void calcAbsSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Returns the squared sum of matrix elements.
 
 @param src Source image of any depth except for CV_64F .
 @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
  */
-CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
-/** @overload
-uses new buffer, no mask
-*/
-static inline Scalar sqrSum(InputArray src)
-{
-    GpuMat buf;
-    return sqrSum(src, GpuMat(), buf);
-}
-/** @overload
-no mask
-*/
-static inline Scalar sqrSum(InputArray src, GpuMat& buf)
-{
-    return sqrSum(src, GpuMat(), buf);
-}
+CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void calcSqrSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Finds global minimum and maximum matrix elements and returns their values.
 
@@ -647,21 +578,14 @@ static inline Scalar sqrSum(InputArray src, GpuMat& buf)
 @param minVal Pointer to the returned minimum value. Use NULL if not required.
 @param maxVal Pointer to the returned maximum value. Use NULL if not required.
 @param mask Optional mask to select a sub-matrix.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 The function does not work with CV_64F images on GPUs with the compute capability \< 1.3.
 
 @sa minMaxLoc
  */
-CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
-/** @overload
-uses new buffer
-*/
-static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
-{
-    GpuMat buf;
-    minMax(src, minVal, maxVal, mask, buf);
-}
+CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void findMinMax(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Finds global minimum and maximum matrix elements and returns their values with locations.
 
@@ -671,44 +595,28 @@ static inline void minMax(InputArray src, double* minVal, double* maxVal=0, Inpu
 @param minLoc Pointer to the returned minimum location. Use NULL if not required.
 @param maxLoc Pointer to the returned maximum location. Use NULL if not required.
 @param mask Optional mask to select a sub-matrix.
-@param valbuf Optional values buffer to avoid extra memory allocations. It is resized
-automatically.
-@param locbuf Optional locations buffer to avoid extra memory allocations. It is resized
-automatically.
+
 The function does not work with CV_64F images on GPU with the compute capability \< 1.3.
 
 @sa minMaxLoc
  */
 CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
-/** @overload
-uses new buffer
-*/
-static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                             InputArray mask=noArray())
-{
-    GpuMat valBuf, locBuf;
-    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
-}
+                          InputArray mask = noArray());
+/** @overload */
+CV_EXPORTS void findMinMaxLoc(InputArray src, OutputArray minMaxVals, OutputArray loc,
+                              InputArray mask = noArray(), Stream& stream = Stream::Null());
 
 /** @brief Counts non-zero matrix elements.
 
 @param src Single-channel source image.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 The function does not work with CV_64F images on GPUs with the compute capability \< 1.3.
 
 @sa countNonZero
  */
-CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
-/** @overload
-uses new buffer
-*/
-static inline int countNonZero(const GpuMat& src)
-{
-    GpuMat buf;
-    return countNonZero(src, buf);
-}
+CV_EXPORTS int countNonZero(InputArray src);
+/** @overload */
+CV_EXPORTS void countNonZero(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
 /** @brief Reduces a matrix to a vector.
 
@@ -743,19 +651,12 @@ CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, i
 @param mtx Source matrix. CV_8UC1 matrices are supported for now.
 @param mean Mean value.
 @param stddev Standard deviation value.
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 
 @sa meanStdDev
  */
-CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
-/** @overload
-uses new buffer
-*/
-static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
-{
-    GpuMat buf;
-    meanStdDev(src, mean, stddev, buf);
-}
+CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev);
+/** @overload */
+CV_EXPORTS void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null());
 
 /** @brief Computes a standard deviation of integral images.
 
@@ -779,64 +680,32 @@ normalization.
 @param dtype When negative, the output array has the same type as src; otherwise, it has the same
 number of channels as src and the depth =CV_MAT_DEPTH(dtype).
 @param mask Optional operation mask.
-@param norm_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
-@param cvt_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+@param stream Stream for the asynchronous version.
 
 @sa normalize
  */
 CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
-                          int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
-/** @overload
-uses new buffers
-*/
-static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
-                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
-{
-    GpuMat norm_buf;
-    GpuMat cvt_buf;
-    normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
+                          int norm_type, int dtype, InputArray mask = noArray(),
+                          Stream& stream = Stream::Null());
 
 /** @brief Computes an integral image.
 
 @param src Source image. Only CV_8UC1 images are supported for now.
 @param sum Integral image containing 32-bit unsigned integer values packed into CV_32SC1 .
-@param buffer Optional buffer to avoid extra memory allocations. It is resized automatically.
 @param stream Stream for the asynchronous version.
 
 @sa integral
  */
-CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
-static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
-{
-    integral(src, sum, buffer, stream);
-}
-/** @overload
-uses new buffer
-*/
-static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
-{
-    GpuMat buffer;
-    integral(src, sum, buffer, stream);
-}
+CV_EXPORTS void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null());
 
 /** @brief Computes a squared integral image.
 
 @param src Source image. Only CV_8UC1 images are supported for now.
 @param sqsum Squared integral image containing 64-bit unsigned integer values packed into
 CV_64FC1 .
-@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
 @param stream Stream for the asynchronous version.
  */
-CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
-/** @overload
-uses new buffer
-*/
-static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
-{
-    GpuMat buffer;
-    sqrIntegral(src, sqsum, buffer, stream);
-}
+CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null());
 
 //! @} cudaarithm_reduce
 
diff --git a/modules/cudaarithm/perf/perf_reductions.cpp b/modules/cudaarithm/perf/perf_reductions.cpp
index 470df48a3f..78699c0a74 100644
--- a/modules/cudaarithm/perf/perf_reductions.cpp
+++ b/modules/cudaarithm/perf/perf_reductions.cpp
@@ -108,10 +108,9 @@ PERF_TEST_P(Sz_Norm, NormDiff,
     {
         const cv::cuda::GpuMat d_src1(src1);
         const cv::cuda::GpuMat d_src2(src2);
-        cv::cuda::GpuMat d_buf;
         double gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, d_buf, normType);
+        TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, normType);
 
         SANITY_CHECK(gpu_dst);
 
@@ -146,10 +145,9 @@ PERF_TEST_P(Sz_Depth_Cn, Sum,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         cv::Scalar gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src);
 
         SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE);
     }
@@ -183,10 +181,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumAbs,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         cv::Scalar gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src);
 
         SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
     }
@@ -216,10 +213,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumSqr,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         cv::Scalar gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src);
 
         SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
     }
@@ -248,10 +244,9 @@ PERF_TEST_P(Sz_Depth, MinMax,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         double gpu_minVal, gpu_maxVal;
 
-        TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat(), d_buf);
+        TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat());
 
         SANITY_CHECK(gpu_minVal, 1e-10);
         SANITY_CHECK(gpu_maxVal, 1e-10);
@@ -286,11 +281,10 @@ PERF_TEST_P(Sz_Depth, MinMaxLoc,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_valbuf, d_locbuf;
         double gpu_minVal, gpu_maxVal;
         cv::Point gpu_minLoc, gpu_maxLoc;
 
-        TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::cuda::GpuMat(), d_valbuf, d_locbuf);
+        TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc);
 
         SANITY_CHECK(gpu_minVal, 1e-10);
         SANITY_CHECK(gpu_maxVal, 1e-10);
@@ -323,10 +317,9 @@ PERF_TEST_P(Sz_Depth, CountNonZero,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         int gpu_dst = 0;
 
-        TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src, d_buf);
+        TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src);
 
         SANITY_CHECK(gpu_dst);
     }
@@ -414,9 +407,8 @@ PERF_TEST_P(Sz_Depth_NormType, Normalize,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_norm_buf, d_cvt_buf;
 
-        TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat(), d_norm_buf, d_cvt_buf);
+        TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat());
 
         CUDA_SANITY_CHECK(dst, 1e-6);
     }
@@ -445,11 +437,10 @@ PERF_TEST_P(Sz, MeanStdDev,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat d_buf;
         cv::Scalar gpu_mean;
         cv::Scalar gpu_stddev;
 
-        TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf);
+        TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev);
 
         SANITY_CHECK(gpu_mean);
         SANITY_CHECK(gpu_stddev);
@@ -481,9 +472,8 @@ PERF_TEST_P(Sz, Integral,
     {
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;
 
-        TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
+        TEST_CYCLE() cv::cuda::integral(d_src, dst);
 
         CUDA_SANITY_CHECK(dst);
     }
@@ -511,9 +501,9 @@ PERF_TEST_P(Sz, IntegralSqr,
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat dst, buf;
+        cv::cuda::GpuMat dst;
 
-        TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
+        TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst);
 
         CUDA_SANITY_CHECK(dst);
     }
diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp
index 63246abd57..08de4e4288 100644
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@@ -169,9 +169,9 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 #else
     // CUBLAS works with column-major matrices
 
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
-    GpuMat src3 = _src3.getGpuMat();
+    GpuMat src1 = getInputMat(_src1, stream);
+    GpuMat src2 = getInputMat(_src2, stream);
+    GpuMat src3 = getInputMat(_src3, stream);
 
     CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
     CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );
@@ -200,8 +200,7 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
     CV_Assert( src1Size.width == src2Size.height );
     CV_Assert( src3.empty() || src3Size == dstSize );
 
-    _dst.create(dstSize, src1.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, dstSize, src1.type(), stream);
 
     if (beta != 0)
     {
@@ -281,6 +280,8 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
     }
 
     cublasSafeCall( cublasDestroy_v2(handle) );
+
+    syncOutput(dst, _dst, stream);
 #endif
 }
 
@@ -297,7 +298,7 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags,
     (void) stream;
     throw_no_cuda();
 #else
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
 
@@ -314,13 +315,20 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags,
     // We don't support real-to-real transform
     CV_Assert( is_complex_input || is_complex_output );
 
-    GpuMat src_cont = src;
-
     // Make sure here we work with the continuous input,
     // as CUFFT can't handle gaps
-    createContinuous(src.rows, src.cols, src.type(), src_cont);
-    if (src_cont.data != src.data)
+    GpuMat src_cont;
+    if (src.isContinuous())
+    {
+        src_cont = src;
+    }
+    else
+    {
+        BufferPool pool(stream);
+        src_cont.allocator = pool.getAllocator();
+        createContinuous(src.rows, src.cols, src.type(), src_cont);
         src.copyTo(src_cont, stream);
+    }
 
     Size dft_size_opt = dft_size;
     if (is_1d_input && !is_row_dft)
@@ -462,16 +470,15 @@ namespace
 
     void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
     {
-        GpuMat image = _image.getGpuMat();
-        GpuMat templ = _templ.getGpuMat();
+        GpuMat image = getInputMat(_image, _stream);
+        GpuMat templ = getInputMat(_templ, _stream);
 
         CV_Assert( image.type() == CV_32FC1 );
         CV_Assert( templ.type() == CV_32FC1 );
 
         create(image.size(), templ.size());
 
-        _result.create(result_size, CV_32FC1);
-        GpuMat result = _result.getGpuMat();
+        GpuMat result = getOutputMat(_result, result_size, CV_32FC1, _stream);
 
         cudaStream_t stream = StreamAccessor::getStream(_stream);
 
@@ -520,6 +527,8 @@ namespace
 
         cufftSafeCall( cufftDestroy(planR2C) );
         cufftSafeCall( cufftDestroy(planC2R) );
+
+        syncOutput(result, _result, _stream);
     }
 }
 
diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp
index eb71d6a4ec..7dd51f9781 100644
--- a/modules/cudaarithm/src/core.cpp
+++ b/modules/cudaarithm/src/core.cpp
@@ -119,15 +119,16 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
         {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
     CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); // takes size/type itself, so the old _dst.create() call is dropped
 
     funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudaarithm/src/cuda/add_weighted.cu b/modules/cudaarithm/src/cuda/add_weighted.cu
index d5c00f6072..929301076d 100644
--- a/modules/cudaarithm/src/cuda/add_weighted.cu
+++ b/modules/cudaarithm/src/cuda/add_weighted.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -63,7 +66,7 @@ namespace
 
         __device__ __forceinline__ D operator ()(T1 a, T2 b) const
         {
-            return saturate_cast<D>(a * alpha + b * beta + gamma);
+            return cudev::saturate_cast<D>(a * alpha + b * beta + gamma);
         }
     };
 
@@ -555,8 +558,8 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
         }
     };
 
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src1 = getInputMat(_src1, stream);
+    GpuMat src2 = getInputMat(_src2, stream);
 
     int sdepth1 = src1.depth();
     int sdepth2 = src2.depth();
@@ -564,19 +567,18 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
     ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
     const int cn = src1.channels();
 
-    CV_DbgAssert( src2.size() == src1.size() && src2.channels() == cn );
-    CV_DbgAssert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( src2.size() == src1.size() && src2.channels() == cn );
+    CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
 
-    _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src1.size(), CV_MAKE_TYPE(ddepth, cn), stream);
 
-    GpuMat src1_ = src1.reshape(1);
-    GpuMat src2_ = src2.reshape(1);
-    GpuMat dst_ = dst.reshape(1);
+    GpuMat src1_single = src1.reshape(1);
+    GpuMat src2_single = src2.reshape(1);
+    GpuMat dst_single = dst.reshape(1);
 
     if (sdepth1 > sdepth2)
     {
-        src1_.swap(src2_);
+        src1_single.swap(src2_single);
         std::swap(alpha, beta);
         std::swap(sdepth1, sdepth2);
     }
@@ -586,7 +588,9 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src1_, alpha, src2_, beta, gamma, dst_, stream);
+    func(src1_single, alpha, src2_single, beta, gamma, dst_single, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/bitwise_mat.cu b/modules/cudaarithm/src/cuda/bitwise_mat.cu
index b2bf288be7..f151c1a486 100644
--- a/modules/cudaarithm/src/cuda/bitwise_mat.cu
+++ b/modules/cudaarithm/src/cuda/bitwise_mat.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
@@ -60,16 +63,15 @@ void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& m
 
 void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
+    GpuMat mask = getInputMat(_mask, stream);
 
     const int depth = src.depth();
 
     CV_DbgAssert( depth <= CV_32F );
     CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
     if (mask.empty())
     {
@@ -125,6 +127,8 @@ void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask,
             gridTransformUnary(vsrc, vdst, bit_not<uchar>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
         }
     }
+
+    syncOutput(dst, _dst, stream);
 }
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/modules/cudaarithm/src/cuda/copy_make_border.cu b/modules/cudaarithm/src/cuda/copy_make_border.cu
index f7dd91f987..ce9cda36cf 100644
--- a/modules/cudaarithm/src/cuda/copy_make_border.cu
+++ b/modules/cudaarithm/src/cuda/copy_make_border.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -133,7 +136,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
         {    copyMakeBorderImpl<float , 1>  , 0 /*copyMakeBorderImpl<float , 2>*/,     copyMakeBorderImpl<float , 3>  ,     copyMakeBorderImpl<float  ,4>  }
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -141,8 +144,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
     CV_Assert( depth <= CV_32F && cn <= 4 );
     CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
 
-    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.rows + top + bottom, src.cols + left + right, src.type(), stream);
 
     const func_t func = funcs[depth][cn - 1];
 
@@ -150,6 +152,8 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
     func(src, dst, top, left, borderType, value, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/countnonzero.cu b/modules/cudaarithm/src/cuda/countnonzero.cu
index 5de2609093..fb7324660a 100644
--- a/modules/cudaarithm/src/cuda/countnonzero.cu
+++ b/modules/cudaarithm/src/cuda/countnonzero.cu
@@ -50,47 +50,64 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
 {
-    template <typename T>
-    int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
+    template <typename T, typename D> // T: source element type, D: element type of the result matrix
+    void countNonZeroImpl(const GpuMat& _src, GpuMat& _dst, Stream& stream)
     {
         const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
-        GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
+        GpuMat_<D>& dst = (GpuMat_<D>&) _dst; // reinterpret the caller's matrix as GpuMat_<D>
 
-        gridCountNonZero(src, buf);
-
-        int data;
-        buf.download(cv::Mat(1, 1, buf.type(), &data));
-
-        return data;
+        gridCountNonZero(src, dst, stream); // result is left in dst; the host download moved to the caller
     }
 }
 
-int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
+void cv::cuda::countNonZero(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
+    typedef void (*func_t)(const GpuMat& input, GpuMat& output, Stream& s);
     static const func_t funcs[] =
     {
-        countNonZeroImpl<uchar>,
-        countNonZeroImpl<schar>,
-        countNonZeroImpl<ushort>,
-        countNonZeroImpl<short>,
-        countNonZeroImpl<int>,
-        countNonZeroImpl<float>,
-        countNonZeroImpl<double>
+        countNonZeroImpl<uchar, int>,
+        countNonZeroImpl<schar, int>,
+        countNonZeroImpl<ushort, int>,
+        countNonZeroImpl<short, int>,
+        countNonZeroImpl<int, int>,
+        countNonZeroImpl<float, int>,
+        countNonZeroImpl<double, int>,
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
+    CV_Assert( src.depth() <= CV_64F );
     CV_Assert( src.channels() == 1 );
 
-    const func_t func = funcs[src.depth()];
+    GpuMat dst = getOutputMat(_dst, 1, 1, CV_32SC1, stream);
 
-    return func(src, buf);
+    // Dispatch on the source depth; the count is written to the 1x1 CV_32SC1 dst.
+    funcs[src.depth()](src, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+int cv::cuda::countNonZero(InputArray _src)
+{
+    // Host-returning overload: run the OutputArray version on Stream::Null()
+    // and download the single 32-bit counter.
+    Stream& nullStream = Stream::Null();
+    BufferPool bufferPool(nullStream);
+    GpuMat d_count = bufferPool.getBuffer(1, 1, CV_32SC1);
+
+    countNonZero(_src, d_count, nullStream);
+
+    int h_count;
+    d_count.download(Mat(1, 1, CV_32SC1, &h_count));
+    return h_count;
+}
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/integral.cu b/modules/cudaarithm/src/cuda/integral.cu
index db554eb301..4a70ab0de8 100644
--- a/modules/cudaarithm/src/cuda/integral.cu
+++ b/modules/cudaarithm/src/cuda/integral.cu
@@ -50,51 +50,58 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 ////////////////////////////////////////////////////////////////////////
 // integral
 
-void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream)
+void cv::cuda::integral(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_8UC1 );
 
-    GpuMat_<int>& res = (GpuMat_<int>&) buffer;
+    BufferPool pool(stream); // replaces the caller-supplied scratch buffer of the old signature
+    GpuMat_<int> res(src.size(), pool.getAllocator()); // src-sized intermediate, without the zero border
 
     gridIntegral(globPtr<uchar>(src), res, stream);
 
-    _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_32SC1, stream); // one extra row and column, zeroed below
 
     dst.setTo(Scalar::all(0), stream);
 
     GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
     res.copyTo(inner, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // sqrIntegral
 
-void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream)
+void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_8UC1 );
 
-    GpuMat_<double>& res = (GpuMat_<double>&) buf;
+    BufferPool pool(stream); // take the scratch buffer from the stream the work runs on, as cv::cuda::integral does
+    GpuMat_<double> res(pool.getBuffer(src.size(), CV_64FC1)); // src-sized intermediate, without the zero border
 
     gridIntegral(sqr_(cvt_<int>(globPtr<uchar>(src))), res, stream);
 
-    _dst.create(src.rows + 1, src.cols + 1, CV_64FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_64FC1, stream); // one extra row and column, zeroed below
 
     dst.setTo(Scalar::all(0), stream);
 
     GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
     res.copyTo(inner, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/lut.cu b/modules/cudaarithm/src/cuda/lut.cu
index 0b1fe8b0d5..56efb8fa88 100644
--- a/modules/cudaarithm/src/cuda/lut.cu
+++ b/modules/cudaarithm/src/cuda/lut.cu
@@ -50,8 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
 using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -165,7 +167,7 @@ namespace
 
     void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& stream)
     {
-        GpuMat src = _src.getGpuMat();
+        GpuMat src = getInputMat(_src, stream);
 
         const int cn = src.channels();
         const int lut_cn = d_lut.channels();
@@ -173,8 +175,7 @@ namespace
         CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
         CV_Assert( lut_cn == 1 || lut_cn == cn );
 
-        _dst.create(src.size(), src.type());
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
         if (lut_cn == 1)
         {
@@ -196,6 +197,8 @@ namespace
 
             dst3.assign(lut_(src3, tbl), stream);
         }
+
+        syncOutput(dst, _dst, stream);
     }
 }
 
diff --git a/modules/cudaarithm/src/cuda/math.cu b/modules/cudaarithm/src/cuda/math.cu
index 39f822081d..41d762f6a6 100644
--- a/modules/cudaarithm/src/cuda/math.cu
+++ b/modules/cudaarithm/src/cuda/math.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -92,16 +95,15 @@ void cv::cuda::abs(InputArray _src, OutputArray _dst, Stream& stream)
         absMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert( depth <= CV_64F );
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -113,7 +115,7 @@ namespace
     {
         __device__ __forceinline__ T operator ()(T x) const
         {
-            return saturate_cast<T>(x * x);
+            return cudev::saturate_cast<T>(x * x);
         }
     };
 
@@ -138,16 +140,15 @@ void cv::cuda::sqr(InputArray _src, OutputArray _dst, Stream& stream)
         sqrMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert( depth <= CV_64F );
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -176,16 +177,15 @@ void cv::cuda::sqrt(InputArray _src, OutputArray _dst, Stream& stream)
         sqrtMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert( depth <= CV_64F );
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -198,7 +198,7 @@ namespace
         __device__ __forceinline__ T operator ()(T x) const
         {
             exp_func<T> f;
-            return saturate_cast<T>(f(x));
+            return cudev::saturate_cast<T>(f(x));
         }
     };
 
@@ -223,16 +223,15 @@ void cv::cuda::exp(InputArray _src, OutputArray _dst, Stream& stream)
         expMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert( depth <= CV_64F );
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -261,16 +260,15 @@ void cv::cuda::log(InputArray _src, OutputArray _dst, Stream& stream)
         logMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert( depth <= CV_64F );
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -284,7 +282,7 @@ namespace
 
         __device__ __forceinline__ T operator()(T e) const
         {
-            return saturate_cast<T>(__powf((float)e, power));
+            return cudev::saturate_cast<T>(__powf((float)e, power));
         }
     };
     template<typename T> struct PowOp<T, true> : unary_function<T, T>
@@ -293,7 +291,7 @@ namespace
 
         __device__ __forceinline__ T operator()(T e) const
         {
-            T res = saturate_cast<T>(__powf((float)e, power));
+            T res = cudev::saturate_cast<T>(__powf((float)e, power));
 
             if ((e < 0) && (1 & static_cast<int>(power)))
                 res *= -1;
@@ -344,16 +342,15 @@ void cv::cuda::pow(InputArray _src, double power, OutputArray _dst, Stream& stre
         powMat<double>
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    const int depth = src.depth();
+    CV_Assert( src.depth() <= CV_64F );
 
-    CV_DbgAssert(depth <= CV_64F);
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    funcs[src.depth()](src.reshape(1), power, dst.reshape(1), stream);
 
-    funcs[depth](src.reshape(1), power, dst.reshape(1), stream);
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/minmax.cu b/modules/cudaarithm/src/cuda/minmax.cu
index 084bed8706..517427073a 100644
--- a/modules/cudaarithm/src/cuda/minmax.cu
+++ b/modules/cudaarithm/src/cuda/minmax.cu
@@ -50,62 +50,147 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
 {
-    template <typename T>
-    void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal)
+    template <typename T, typename R>
+    void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
     {
-        typedef typename SelectIf<
-                TypesEquals<T, double>::value,
-                double,
-                typename SelectIf<TypesEquals<T, float>::value, float, int>::type
-                >::type work_type;
-
         const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
-        GpuMat_<work_type>& buf = (GpuMat_<work_type>&) _buf;
+        GpuMat_<R>& dst = (GpuMat_<R>&) _dst;
 
         if (mask.empty())
-            gridFindMinMaxVal(src, buf);
+            gridFindMinMaxVal(src, dst, stream);
         else
-            gridFindMinMaxVal(src, buf, globPtr<uchar>(mask));
+            gridFindMinMaxVal(src, dst, globPtr<uchar>(mask), stream);
+    }
 
-        work_type data[2];
-        buf.download(cv::Mat(1, 2, buf.type(), data));
+    // Host-side overload: runs the reduction on Stream::Null(), downloads the
+    // two results and stores them through the optional minVal / maxVal pointers.
+    template <typename T, typename R>
+    void minMaxImpl(const GpuMat& src, const GpuMat& mask, double* minVal, double* maxVal)
+    {
+        BufferPool pool(Stream::Null());
+        GpuMat buf(pool.getBuffer(1, 2, DataType<R>::type));
+
+        minMaxImpl<T, R>(src, mask, buf, Stream::Null());
+
+        R data[2];
+        buf.download(Mat(1, 2, buf.type(), data));
 
         if (minVal)
             *minVal = data[0];
 
         if (maxVal)
             *maxVal = data[1];
     }
 }
 
-void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
+void cv::cuda::findMinMax(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal);
+    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
     static const func_t funcs[] =
     {
-        minMaxImpl<uchar>,
-        minMaxImpl<schar>,
-        minMaxImpl<ushort>,
-        minMaxImpl<short>,
-        minMaxImpl<int>,
-        minMaxImpl<float>,
-        minMaxImpl<double>
+        minMaxImpl<uchar, int>,
+        minMaxImpl<schar, int>,
+        minMaxImpl<ushort, int>,
+        minMaxImpl<short, int>,
+        minMaxImpl<int, int>,
+        minMaxImpl<float, float>,
+        minMaxImpl<double, double>
     };
 
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
 
     CV_Assert( src.channels() == 1 );
-    CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
+
+    const int src_depth = src.depth();
+    CV_Assert( src_depth <= CV_64F ); // funcs[] only has entries for depths CV_8U..CV_64F
+    const int dst_depth = src_depth < CV_32F ? CV_32S : src_depth; // mirrors the R type picked in funcs[]
+    GpuMat dst = getOutputMat(_dst, 1, 2, dst_depth, stream); // 1x2 result: {min, max}
 
     const func_t func = funcs[src.depth()];
+    func(src, mask, dst, stream);
 
-    func(src, mask, buf, minVal, maxVal);
+    syncOutput(dst, _dst, stream);
+}
+
+void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask)
+{
+    // Run findMinMax on Stream::Null() with a host-side result buffer,
+    // wait for it to finish, then convert the two values to double.
+    Stream& nullStream = Stream::Null();
+    HostMem h_minMax;
+
+    findMinMax(_src, h_minMax, _mask, nullStream);
+    nullStream.waitForCompletion();
+
+    double range[2];
+    Mat rangeHeader(1, 2, CV_64FC1, range);
+    h_minMax.createMatHeader().convertTo(rangeHeader, CV_64F);
+
+    // Either output pointer may be null.
+    if (minVal != 0) *minVal = range[0];
+    if (maxVal != 0) *maxVal = range[1];
+}
+
+namespace cv { namespace cuda { namespace internal {
+
+void findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream);
+
+}}}
+
+namespace
+{
+    // Reduces abs_(src) to its maximum (optionally masked) and stores it in _dst.
+    template <typename T, typename R>
+    void findMaxAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
+    {
+        const GpuMat_<T>& typedSrc = (const GpuMat_<T>&) _src;
+        GpuMat_<R>& typedDst = (GpuMat_<R>&) _dst;
+        if (mask.empty())
+            gridFindMaxVal(abs_(typedSrc), typedDst, stream);
+        else
+            gridFindMaxVal(abs_(typedSrc), typedDst, globPtr<uchar>(mask), stream);
+    }
+}
+
+void cv::cuda::internal::findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
+    static const func_t funcs[] =
+    {
+        findMaxAbsImpl<uchar, int>,
+        findMaxAbsImpl<schar, int>,
+        findMaxAbsImpl<ushort, int>,
+        findMaxAbsImpl<short, int>,
+        findMaxAbsImpl<int, int>,
+        findMaxAbsImpl<float, float>,
+        findMaxAbsImpl<double, double>
+    };
+
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
+
+    CV_Assert( src.channels() == 1 );
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
+
+    const int src_depth = src.depth();
+    CV_Assert( src_depth <= CV_64F ); // funcs[] only has entries for depths CV_8U..CV_64F
+    const int dst_depth = src_depth < CV_32F ? CV_32S : src_depth; // mirrors the R type picked in funcs[]
+
+    GpuMat dst = getOutputMat(_dst, 1, 1, dst_depth, stream); // single-element result
+
+    funcs[src_depth](src, mask, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/minmaxloc.cu b/modules/cudaarithm/src/cuda/minmaxloc.cu
index 6f8cc53d6b..b7c5ec872f 100644
--- a/modules/cudaarithm/src/cuda/minmaxloc.cu
+++ b/modules/cudaarithm/src/cuda/minmaxloc.cu
@@ -50,78 +50,110 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
 {
-    template <typename T>
-    void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
+    template <typename T, typename R> // T: source element type, R: element type of the value buffer
+    void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream)
     {
-        typedef typename SelectIf<
-                TypesEquals<T, double>::value,
-                double,
-                typename SelectIf<TypesEquals<T, float>::value, float, int>::type
-                >::type work_type;
-
         const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
-        GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
+        GpuMat_<R>& valBuf = (GpuMat_<R>&) _valBuf; // reference cast: same buffer, typed as R
         GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;
 
         if (mask.empty())
-            gridMinMaxLoc(src, valBuf, locBuf);
+            gridMinMaxLoc(src, valBuf, locBuf, stream); // results are left in valBuf/locBuf; nothing is downloaded here
         else
-            gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));
-
-        cv::Mat_<work_type> h_valBuf;
-        cv::Mat_<int> h_locBuf;
-
-        valBuf.download(h_valBuf);
-        locBuf.download(h_locBuf);
-
-        if (minVal)
-            *minVal = h_valBuf(0, 0);
-
-        if (maxVal)
-            *maxVal = h_valBuf(1, 0);
-
-        if (minLoc)
-        {
-            const int idx = h_locBuf(0, 0);
-            *minLoc = cv::Point(idx % src.cols, idx / src.cols);
-        }
-
-        if (maxLoc)
-        {
-            const int idx = h_locBuf(1, 0);
-            *maxLoc = cv::Point(idx % src.cols, idx / src.cols);
-        }
+            gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask), stream); // masked overload, mask passed as a uchar globPtr
     }
 }
 
-void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
+void cv::cuda::findMinMaxLoc(InputArray _src, OutputArray _minMaxVals, OutputArray _loc, InputArray _mask, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
+    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream);
     static const func_t funcs[] =
     {
-        minMaxLocImpl<uchar>,
-        minMaxLocImpl<schar>,
-        minMaxLocImpl<ushort>,
-        minMaxLocImpl<short>,
-        minMaxLocImpl<int>,
-        minMaxLocImpl<float>,
-        minMaxLocImpl<double>
+        minMaxLocImpl<uchar, int>,
+        minMaxLocImpl<schar, int>,
+        minMaxLocImpl<ushort, int>,
+        minMaxLocImpl<short, int>,
+        minMaxLocImpl<int, int>,
+        minMaxLocImpl<float, float>,
+        minMaxLocImpl<double, double>
     };
 
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
 
     CV_Assert( src.channels() == 1 );
-    CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); // mask is optional; if given: same size, CV_8U
 
-    const func_t func = funcs[src.depth()];
+    const int src_depth = src.depth();
 
-    func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
+    BufferPool pool(stream); // work buffers below use this pool's allocator
+    GpuMat valBuf(pool.getAllocator());
+    GpuMat locBuf(pool.getAllocator());
+
+    const func_t func = funcs[src_depth]; // funcs[] is indexed by source depth
+    func(src, mask, valBuf, locBuf, stream);
+
+    GpuMat minMaxVals = valBuf.colRange(0, 1); // only the first column of each work buffer is handed to the caller
+    GpuMat loc = locBuf.colRange(0, 1);
+
+    if (_minMaxVals.kind() == _InputArray::CUDA_GPU_MAT) // GpuMat destination: copyTo(); any other kind: download()
+    {
+        minMaxVals.copyTo(_minMaxVals, stream);
+    }
+    else
+    {
+        minMaxVals.download(_minMaxVals, stream);
+    }
+
+    if (_loc.kind() == _InputArray::CUDA_GPU_MAT) // same choice for the location output
+    {
+        loc.copyTo(_loc, stream);
+    }
+    else
+    {
+        loc.download(_loc, stream);
+    }
+}
+
+void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask) // blocking wrapper over findMinMaxLoc
+{
+    Stream& stream = Stream::Null();
+
+    HostMem minMaxVals, locVals;
+    findMinMaxLoc(_src, minMaxVals, locVals, _mask, stream);
+
+    stream.waitForCompletion(); // the outputs are read on the host immediately below
+
+    double vals[2]; // [0] = min, [1] = max
+    minMaxVals.createMatHeader().convertTo(Mat(minMaxVals.size(), CV_64FC1, &vals[0]), CV_64F); // Mat header wraps vals[]; values converted to double
+
+    int locs[2]; // [0] = min location, [1] = max location, one int each
+    locVals.createMatHeader().copyTo(Mat(locVals.size(), CV_32SC1, &locs[0]));
+    Size size = _src.size(); // NOTE(review): size.width is a divisor below - confirm an empty _src cannot reach this point
+    cv::Point locs2D[] = { // each int is decoded as a linear index: x = idx % width, y = idx / width
+        cv::Point(locs[0] % size.width, locs[0] / size.width),
+        cv::Point(locs[1] % size.width, locs[1] / size.width),
+    };
+
+    if (minVal) // every output pointer is optional; null pointers are skipped
+        *minVal = vals[0];
+
+    if (maxVal)
+        *maxVal = vals[1];
+
+    if (minLoc)
+        *minLoc = locs2D[0];
+
+    if (maxLoc)
+        *maxLoc = locs2D[1];
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/mul_spectrums.cu b/modules/cudaarithm/src/cuda/mul_spectrums.cu
index b060904816..bd62f99030 100644
--- a/modules/cudaarithm/src/cuda/mul_spectrums.cu
+++ b/modules/cudaarithm/src/cuda/mul_spectrums.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 //////////////////////////////////////////////////////////////////////////////
@@ -120,33 +123,33 @@ void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst
 {
     (void) flags;
 
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src1 = getInputMat(_src1, stream);
+    GpuMat src2 = getInputMat(_src2, stream);
 
     CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
     CV_Assert( src1.size() == src2.size() );
 
-    _dst.create(src1.size(), CV_32FC2);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream);
 
     if (conjB)
         gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul_conj(), stream);
     else
         gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul(), stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
 {
     (void) flags;
 
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src1 = getInputMat(_src1, stream);
+    GpuMat src2 = getInputMat(_src2, stream);
 
     CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
     CV_Assert( src1.size() == src2.size() );
 
-    _dst.create(src1.size(), CV_32FC2);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream);
 
     if (conjB)
     {
@@ -160,6 +163,8 @@ void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputAr
         op.scale = scale;
         gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
     }
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/norm.cu b/modules/cudaarithm/src/cuda/norm.cu
index bda6b45815..baf76a6db3 100644
--- a/modules/cudaarithm/src/cuda/norm.cu
+++ b/modules/cudaarithm/src/cuda/norm.cu
@@ -50,70 +50,140 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
 {
-    double normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
+    void normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) // max |src1 - src2|, inputs viewed as uchar, result as int
     {
         const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
         const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
-        GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
+        GpuMat_<int>& dst = (GpuMat_<int>&) _dst;
 
-        gridFindMinMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
-
-        int data[2];
-        buf.download(cv::Mat(1, 2, buf.type(), data));
-
-        return data[1];
+        gridFindMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), dst, stream); // operands are converted to int before subtracting
     }
 
-    double normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
+    void normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) // sum |src1 - src2|, inputs viewed as uchar, result as int
     {
         const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
         const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
-        GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
+        GpuMat_<int>& dst = (GpuMat_<int>&) _dst;
 
-        gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
-
-        int data;
-        buf.download(cv::Mat(1, 1, buf.type(), &data));
-
-        return data;
+        gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), dst, stream); // operands are converted to int before subtracting
     }
 
-    double normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
+    void normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) // sqrt(sum (src1 - src2)^2), computed and stored as double
     {
         const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
         const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
-        GpuMat_<double>& buf = (GpuMat_<double>&) _buf;
+        GpuMat_<double>& dst = (GpuMat_<double>&) _dst;
 
-        gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf);
+        BufferPool pool(stream);
+        GpuMat_<double> buf(1, 1, pool.getAllocator()); // 1x1 scratch buffer that receives the sum of squares
 
-        double data;
-        buf.download(cv::Mat(1, 1, buf.type(), &data));
-
-        return std::sqrt(data);
+        gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf, stream);
+        gridTransformUnary(buf, dst, sqrt_func<double>(), stream); // dst = sqrt(buf), applied through a transform call on the same stream
     }
 }
 
-double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
+void cv::cuda::calcNormDiff(InputArray _src1, InputArray _src2, OutputArray _dst, int normType, Stream& stream)
 {
-    typedef double (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf);
+    typedef void (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream);
     static const func_t funcs[] =
     {
         0, normDiffInf, normDiffL1, 0, normDiffL2
     };
 
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src1 = getInputMat(_src1, stream);
+    GpuMat src2 = getInputMat(_src2, stream);
 
     CV_Assert( src1.type() == CV_8UC1 );
     CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
     CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
 
-    return funcs[normType](src1, src2, buf);
+    GpuMat dst = getOutputMat(_dst, 1, 1, normType == NORM_L2 ? CV_64FC1 : CV_32SC1, stream); // 1x1 result: CV_64FC1 for NORM_L2, CV_32SC1 otherwise
+
+    const func_t func = funcs[normType]; // indexed directly by normType; the CV_Assert above admits only NORM_INF, NORM_L1, NORM_L2
+    func(src1, src2, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+double cv::cuda::norm(InputArray _src1, InputArray _src2, int normType) // blocking wrapper over calcNormDiff; returns the result as double
+{
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    calcNormDiff(_src1, _src2, dst, normType, stream);
+
+    stream.waitForCompletion(); // dst is read on the host immediately below
+
+    double val;
+    dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F); // Mat header wraps val; the result is converted to double into it
+
+    return val;
+}
+
+namespace cv { namespace cuda { namespace internal {
+// Forward declaration; the definition further down is written with the fully qualified name.
+void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
+
+}}}
+
+namespace
+{
+    template <typename T, typename R> // T: source element type, R: result element type
+    void normL2Impl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
+    {
+        const GpuMat_<T>& src = (const GpuMat_<T>&) _src; // reference casts: same objects, viewed through typed wrappers
+        GpuMat_<R>& dst = (GpuMat_<R>&) _dst;
+
+        BufferPool pool(stream);
+        GpuMat_<double> buf(1, 1, pool.getAllocator()); // 1x1 scratch buffer that receives the sum of squares
+
+        if (mask.empty()) // an empty mask selects the unmasked gridCalcSum overload
+        {
+            gridCalcSum(sqr_(cvt_<double>(src)), buf, stream);
+        }
+        else
+        {
+            gridCalcSum(sqr_(cvt_<double>(src)), buf, globPtr<uchar>(mask), stream);
+        }
+
+        gridTransformUnary(buf, dst, sqrt_func<double>(), stream); // dst = sqrt(buf)
+    }
+}
+
+void cv::cuda::internal::normL2(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) // L2 norm of src into a 1x1 CV_64FC1 _dst
+{
+    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
+    static const func_t funcs[] = // indexed by source depth; every instantiation produces a double result
+    {
+        normL2Impl<uchar, double>,
+        normL2Impl<schar, double>,
+        normL2Impl<ushort, double>,
+        normL2Impl<short, double>,
+        normL2Impl<int, double>,
+        normL2Impl<float, double>,
+        normL2Impl<double, double>
+    };
+
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
+
+    CV_Assert( src.channels() == 1 ); // only single-channel sources are accepted
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); // mask is optional; if given: same size, CV_8U
+
+    GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC1, stream);
+
+    const func_t func = funcs[src.depth()];
+    func(src, mask, dst, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/normalize.cu b/modules/cudaarithm/src/cuda/normalize.cu
new file mode 100644
index 0000000000..efbc94ecce
--- /dev/null
+++ b/modules/cudaarithm/src/cuda/normalize.cu
@@ -0,0 +1,290 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+using namespace cv::cudev;
+
+namespace {
+
+template <typename T, typename R, typename I>
+struct ConvertorMinMax : unary_function<T, R> // per-element functor: maps [smin, smax] linearly onto [dmin, dmax]
+{
+    typedef typename LargerType<T, R>::type larger_type1;
+    typedef typename LargerType<larger_type1, I>::type larger_type2;
+    typedef typename LargerType<larger_type2, float>::type scalar_type; // arithmetic type: LargerType folded over T, R, I and float
+
+    scalar_type dmin, dmax; // bounds of the target range
+    const I* minMaxVals; // [0] = source min, [1] = source max; dereferenced inside the __device__ operator
+
+    __device__ R operator ()(typename TypeTraits<T>::parameter_type src) const
+    {
+        const scalar_type smin = minMaxVals[0];
+        const scalar_type smax = minMaxVals[1];
+
+        const scalar_type scale = (dmax - dmin) * (smax - smin > numeric_limits<scalar_type>::epsilon() ? 1.0 / (smax - smin) : 0.0); // 0 when the source range is not above epsilon
+        const scalar_type shift = dmin - smin * scale;
+
+        return cudev::saturate_cast<R>(scale * src + shift); // equals dmin + (src - smin) * scale, saturated to R
+    }
+};
+
+template <typename T, typename R, typename I> // T: source element, R: result element, I: element type of the min/max buffer
+void normalizeMinMax(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream)
+{
+    const GpuMat_<T>& src = (const GpuMat_<T>&)_src;
+    GpuMat_<R>& dst = (GpuMat_<R>&)_dst;
+
+    BufferPool pool(stream);
+    GpuMat_<I> minMaxVals(1, 2, pool.getAllocator()); // 1x2 scratch buffer for the source min and max
+
+    if (mask.empty()) // step 1: find min/max of src (masked overload when a mask is given)
+    {
+        gridFindMinMaxVal(src, minMaxVals, stream);
+    }
+    else
+    {
+        gridFindMinMaxVal(src, minMaxVals, globPtr<uchar>(mask), stream);
+    }
+
+    ConvertorMinMax<T, R, I> cvt;
+    cvt.dmin = std::min(a, b); // a and b may be given in either order
+    cvt.dmax = std::max(a, b);
+    cvt.minMaxVals = minMaxVals[0]; // pointer taken from the scratch buffer; the functor reads min/max through it
+
+    if (mask.empty()) // step 2: apply the functor to every element (or only under the mask)
+    {
+        gridTransformUnary(src, dst, cvt, stream);
+    }
+    else
+    {
+        dst.setTo(Scalar::all(0), stream); // zero dst before the masked transform
+        gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
+    }
+}
+
+template <typename T, typename R, typename I, bool normL2>
+struct ConvertorNorm : unary_function<T, R> // per-element functor: multiplies src by a / norm
+{
+    typedef typename LargerType<T, R>::type larger_type1;
+    typedef typename LargerType<larger_type1, I>::type larger_type2;
+    typedef typename LargerType<larger_type2, float>::type scalar_type; // arithmetic type: LargerType folded over T, R, I and float
+
+    scalar_type a; // target norm value
+    const I* normVal; // points at the accumulated value; dereferenced inside the __device__ operator
+
+    __device__ R operator ()(typename TypeTraits<T>::parameter_type src) const
+    {
+        sqrt_func<scalar_type> sqrt;
+
+        scalar_type scale = normL2 ? sqrt(*normVal) : *normVal; // with normL2 the stored value is treated as a sum of squares
+        scale = scale > numeric_limits<scalar_type>::epsilon() ? a / scale : 0.0; // 0 when the norm is not above epsilon
+
+        return cudev::saturate_cast<R>(scale * src);
+    }
+};
+
+template <typename T, typename R, typename I> // T: source element, R: result element, I: element type of the norm accumulator
+void normalizeNorm(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream)
+{
+    const GpuMat_<T>& src = (const GpuMat_<T>&)_src;
+    GpuMat_<R>& dst = (GpuMat_<R>&)_dst;
+
+    BufferPool pool(stream);
+    GpuMat_<I> normVal(1, 1, pool.getAllocator()); // 1x1 scratch buffer that receives the reduction result
+
+    if (normType == NORM_L1) // step 1: reduce src into normVal; NORM_L1 sums |x|
+    {
+        if (mask.empty())
+        {
+            gridCalcSum(abs_(cvt_<I>(src)), normVal, stream);
+        }
+        else
+        {
+            gridCalcSum(abs_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
+        }
+    }
+    else if (normType == NORM_L2) // NORM_L2 sums x^2; the square root is taken later inside ConvertorNorm<..., true>
+    {
+        if (mask.empty())
+        {
+            gridCalcSum(sqr_(cvt_<I>(src)), normVal, stream);
+        }
+        else
+        {
+            gridCalcSum(sqr_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
+        }
+    }
+    else // NORM_INF
+    {
+        if (mask.empty())
+        {
+            gridFindMaxVal(abs_(cvt_<I>(src)), normVal, stream);
+        }
+        else
+        {
+            gridFindMaxVal(abs_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
+        }
+    }
+
+    if (normType == NORM_L2) // step 2: scale every element; the bool template argument selects the sqrt variant
+    {
+        ConvertorNorm<T, R, I, true> cvt;
+        cvt.a = a;
+        cvt.normVal = normVal[0]; // pointer taken from the scratch buffer; the functor reads the value through it
+
+        if (mask.empty())
+        {
+            gridTransformUnary(src, dst, cvt, stream);
+        }
+        else
+        {
+            dst.setTo(Scalar::all(0), stream); // zero dst before the masked transform
+            gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
+        }
+    }
+    else
+    {
+        ConvertorNorm<T, R, I, false> cvt;
+        cvt.a = a;
+        cvt.normVal = normVal[0]; // pointer taken from the scratch buffer; the functor reads the value through it
+
+        if (mask.empty())
+        {
+            gridTransformUnary(src, dst, cvt, stream);
+        }
+        else
+        {
+            dst.setTo(Scalar::all(0), stream); // zero dst before the masked transform
+            gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
+        }
+    }
+}
+
+} // namespace
+
+void cv::cuda::normalize(InputArray _src, OutputArray _dst, double a, double b, int normType, int dtype, InputArray _mask, Stream& stream) // single-channel normalize by range (NORM_MINMAX) or by norm (NORM_INF/L1/L2)
+{
+    typedef void (*func_minmax_t)(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream);
+    typedef void (*func_norm_t)(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream);
+
+    static const func_minmax_t funcs_minmax[] = // indexed by source depth; float work type except for double sources
+    {
+        normalizeMinMax<uchar, float, float>,
+        normalizeMinMax<schar, float, float>,
+        normalizeMinMax<ushort, float, float>,
+        normalizeMinMax<short, float, float>,
+        normalizeMinMax<int, float, float>,
+        normalizeMinMax<float, float, float>,
+        normalizeMinMax<double, double, double>
+    };
+
+    static const func_norm_t funcs_norm[] = // indexed by source depth; float work type except for double sources
+    {
+        normalizeNorm<uchar, float, float>,
+        normalizeNorm<schar, float, float>,
+        normalizeNorm<ushort, float, float>,
+        normalizeNorm<short, float, float>,
+        normalizeNorm<int, float, float>,
+        normalizeNorm<float, float, float>,
+        normalizeNorm<double, double, double>
+    };
+
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_MINMAX );
+
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
+
+    CV_Assert( src.channels() == 1 ); // only single-channel sources are accepted
+    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); // mask is optional; if given: same size, CV_8U
+
+    dtype = CV_MAT_DEPTH(dtype < 0 ? (_dst.fixedType() ? _dst.type() : src.type()) : dtype); // negative dtype: take _dst's fixed type, else src's, instead of masking -1 into depth 7
+
+    const int src_depth = src.depth();
+    const int tmp_depth = src_depth <= CV_32F ? CV_32F : src_depth; // work depth: CV_32F for depths up to CV_32F, otherwise the source depth
+
+    GpuMat dst;
+    if (dtype == tmp_depth) // requested depth equals the work depth: compute straight into the output
+    {
+        _dst.create(src.size(), tmp_depth);
+        dst = getOutputMat(_dst, src.size(), tmp_depth, stream);
+    }
+    else // otherwise compute into a pooled temporary and convert at the end
+    {
+        BufferPool pool(stream);
+        dst = pool.getBuffer(src.size(), tmp_depth);
+    }
+
+    if (normType == NORM_MINMAX) // range normalization uses both a and b
+    {
+        const func_minmax_t func = funcs_minmax[src_depth];
+        func(src, dst, a, b, mask, stream);
+    }
+    else // norm-based normalization uses a only; b is not passed on
+    {
+        const func_norm_t func = funcs_norm[src_depth];
+        func(src, dst, a, normType, mask, stream);
+    }
+
+    if (dtype == tmp_depth)
+    {
+        syncOutput(dst, _dst, stream);
+    }
+    else
+    {
+        dst.convertTo(_dst, dtype, stream); // final depth conversion from the temporary into _dst
+    }
+}
+
+#endif
diff --git a/modules/cudaarithm/src/cuda/polar_cart.cu b/modules/cudaarithm/src/cuda/polar_cart.cu
index 200b79c055..0a949b42ed 100644
--- a/modules/cudaarithm/src/cuda/polar_cart.cu
+++ b/modules/cudaarithm/src/cuda/polar_cart.cu
@@ -50,55 +50,59 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 void cv::cuda::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
 {
-    GpuMat x = _x.getGpuMat();
-    GpuMat y = _y.getGpuMat();
+    GpuMat x = getInputMat(_x, stream);
+    GpuMat y = getInputMat(_y, stream);
 
-    CV_DbgAssert( x.depth() == CV_32F );
-    CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
+    CV_Assert( x.depth() == CV_32F ); // inputs must be float; CV_Assert replaces the former CV_DbgAssert
+    CV_Assert( y.type() == x.type() && y.size() == x.size() ); // x and y must agree in type and size
 
-    _dst.create(x.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream); // CV_32FC1 destination with the size of x
 
     GpuMat_<float> xc(x.reshape(1));
     GpuMat_<float> yc(y.reshape(1));
     GpuMat_<float> magc(dst.reshape(1));
 
     gridTransformBinary(xc, yc, magc, magnitude_func<float>(), stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
 {
-    GpuMat x = _x.getGpuMat();
-    GpuMat y = _y.getGpuMat();
+    GpuMat x = getInputMat(_x, stream);
+    GpuMat y = getInputMat(_y, stream);
 
-    CV_DbgAssert( x.depth() == CV_32F );
-    CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
+    CV_Assert( x.depth() == CV_32F ); // inputs must be float; CV_Assert replaces the former CV_DbgAssert
+    CV_Assert( y.type() == x.type() && y.size() == x.size() ); // x and y must agree in type and size
 
-    _dst.create(x.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream); // CV_32FC1 destination with the size of x
 
     GpuMat_<float> xc(x.reshape(1));
     GpuMat_<float> yc(y.reshape(1));
     GpuMat_<float> magc(dst.reshape(1));
 
     gridTransformBinary(xc, yc, magc, magnitude_sqr_func<float>(), stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream)
 {
-    GpuMat x = _x.getGpuMat();
-    GpuMat y = _y.getGpuMat();
+    GpuMat x = getInputMat(_x, stream);
+    GpuMat y = getInputMat(_y, stream);
 
-    CV_DbgAssert( x.depth() == CV_32F );
-    CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
+    CV_Assert( x.depth() == CV_32F );
+    CV_Assert( y.type() == x.type() && y.size() == x.size() );
 
-    _dst.create(x.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream);
 
     GpuMat_<float> xc(x.reshape(1));
     GpuMat_<float> yc(y.reshape(1));
@@ -108,21 +112,20 @@ void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleI
         gridTransformBinary(xc, yc, anglec, direction_func<float, true>(), stream);
     else
         gridTransformBinary(xc, yc, anglec, direction_func<float, false>(), stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream)
 {
-    GpuMat x = _x.getGpuMat();
-    GpuMat y = _y.getGpuMat();
+    GpuMat x = getInputMat(_x, stream);
+    GpuMat y = getInputMat(_y, stream);
 
-    CV_DbgAssert( x.depth() == CV_32F );
-    CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
+    CV_Assert( x.depth() == CV_32F );
+    CV_Assert( y.type() == x.type() && y.size() == x.size() );
 
-    _mag.create(x.size(), CV_32FC1);
-    GpuMat mag = _mag.getGpuMat();
-
-    _angle.create(x.size(), CV_32FC1);
-    GpuMat angle = _angle.getGpuMat();
+    GpuMat mag = getOutputMat(_mag, x.size(), CV_32FC1, stream);
+    GpuMat angle = getOutputMat(_angle, x.size(), CV_32FC1, stream);
 
     GpuMat_<float> xc(x.reshape(1));
     GpuMat_<float> yc(y.reshape(1));
@@ -147,6 +150,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
                                binaryTupleAdapter<0, 1>(direction_func<float, false>())),
                            stream);
     }
+
+    syncOutput(mag, _mag, stream);
+    syncOutput(angle, _angle, stream);
 }
 
 namespace
@@ -173,17 +179,14 @@ namespace
 
 void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& _stream)
 {
-    GpuMat mag = _mag.getGpuMat();
-    GpuMat angle = _angle.getGpuMat();
+    GpuMat mag = getInputMat(_mag, _stream);
+    GpuMat angle = getInputMat(_angle, _stream);
 
-    CV_DbgAssert( angle.depth() == CV_32F );
-    CV_DbgAssert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );
+    CV_Assert( angle.depth() == CV_32F );
+    CV_Assert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );
 
-    _x.create(angle.size(), CV_32FC1);
-    GpuMat x = _x.getGpuMat();
-
-    _y.create(angle.size(), CV_32FC1);
-    GpuMat y = _y.getGpuMat();
+    GpuMat x = getOutputMat(_x, angle.size(), CV_32FC1, _stream);
+    GpuMat y = getOutputMat(_y, angle.size(), CV_32FC1, _stream);
 
     GpuMat_<float> xc(x.reshape(1));
     GpuMat_<float> yc(y.reshape(1));
@@ -204,6 +207,9 @@ void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, O
 
     CV_CUDEV_SAFE_CALL( cudaGetLastError() );
 
+    syncOutput(x, _x, _stream);
+    syncOutput(y, _y, _stream);
+
     if (stream == 0)
         CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
 }
diff --git a/modules/cudaarithm/src/cuda/reduce.cu b/modules/cudaarithm/src/cuda/reduce.cu
index 2cb2dacc73..5fb90287a9 100644
--- a/modules/cudaarithm/src/cuda/reduce.cu
+++ b/modules/cudaarithm/src/cuda/reduce.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -125,7 +128,7 @@ namespace
 
 void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.channels() <= 4 );
     CV_Assert( dim == 0 || dim == 1 );
@@ -134,8 +137,7 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp,
     if (dtype < 0)
         dtype = src.depth();
 
-    _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, 1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()), stream);
 
     if (dim == 0)
     {
@@ -292,6 +294,8 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp,
 
         func(src, dst, reduceOp, stream);
     }
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/split_merge.cu b/modules/cudaarithm/src/cuda/split_merge.cu
index 13d6a349fb..5b3af10775 100644
--- a/modules/cudaarithm/src/cuda/split_merge.cu
+++ b/modules/cudaarithm/src/cuda/split_merge.cu
@@ -50,7 +50,10 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 ////////////////////////////////////////////////////////////////////////
@@ -92,20 +95,18 @@ namespace
 
     void mergeImpl(const GpuMat* src, size_t n, cv::OutputArray _dst, Stream& stream)
     {
-        CV_DbgAssert( src != 0 );
-        CV_DbgAssert( n > 0 && n <= 4 );
+        CV_Assert( src != 0 );
+        CV_Assert( n > 0 && n <= 4 );
 
         const int depth = src[0].depth();
         const cv::Size size = src[0].size();
 
-#ifdef _DEBUG
         for (size_t i = 0; i < n; ++i)
         {
             CV_Assert( src[i].size() == size );
             CV_Assert( src[i].depth() == depth );
             CV_Assert( src[i].channels() == 1 );
         }
-#endif
 
         if (n == 1)
         {
@@ -123,8 +124,7 @@ namespace
 
             const int channels = static_cast<int>(n);
 
-            _dst.create(size, CV_MAKE_TYPE(depth, channels));
-            GpuMat dst = _dst.getGpuMat();
+            GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(depth, channels), stream);
 
             const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];
 
@@ -132,6 +132,8 @@ namespace
                 CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
 
             func(src, dst, stream);
+
+            syncOutput(dst, _dst, stream);
         }
     }
 }
@@ -203,12 +205,12 @@ namespace
             {SplitFunc<4, uchar>::call, SplitFunc<4, ushort>::call, SplitFunc<4, int>::call, 0, SplitFunc<4, double>::call}
         };
 
-        CV_DbgAssert( dst != 0 );
+        CV_Assert( dst != 0 );
 
         const int depth = src.depth();
         const int channels = src.channels();
 
-        CV_DbgAssert( channels <= 4 );
+        CV_Assert( channels <= 4 );
 
         if (channels == 0)
             return;
@@ -233,13 +235,13 @@ namespace
 
 void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
     splitImpl(src, dst, stream);
 }
 
 void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
     dst.resize(src.channels());
     if (src.channels() > 0)
         splitImpl(src, &dst[0], stream);
diff --git a/modules/cudaarithm/src/cuda/sum.cu b/modules/cudaarithm/src/cuda/sum.cu
index cced9c56e8..0160449039 100644
--- a/modules/cudaarithm/src/cuda/sum.cu
+++ b/modules/cudaarithm/src/cuda/sum.cu
@@ -50,126 +50,153 @@
 
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
 {
     template <typename T, typename R, int cn>
-    cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    void sumImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
     {
         typedef typename MakeVec<T, cn>::type src_type;
         typedef typename MakeVec<R, cn>::type res_type;
 
         const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
-        GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
+        GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
 
         if (mask.empty())
-            gridCalcSum(src, buf);
+            gridCalcSum(src, dst, stream);
         else
-            gridCalcSum(src, buf, globPtr<uchar>(mask));
-
-        cv::Scalar_<R> res;
-        cv::Mat res_mat(buf.size(), buf.type(), res.val);
-        buf.download(res_mat);
-
-        return res;
+            gridCalcSum(src, dst, globPtr<uchar>(mask), stream);
     }
 
     template <typename T, typename R, int cn>
-    cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    void sumAbsImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
     {
         typedef typename MakeVec<T, cn>::type src_type;
         typedef typename MakeVec<R, cn>::type res_type;
 
         const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
-        GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
+        GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
 
         if (mask.empty())
-            gridCalcSum(abs_(cvt_<res_type>(src)), buf);
+            gridCalcSum(abs_(cvt_<res_type>(src)), dst, stream);
         else
-            gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
-
-        cv::Scalar_<R> res;
-        cv::Mat res_mat(buf.size(), buf.type(), res.val);
-        buf.download(res_mat);
-
-        return res;
+            gridCalcSum(abs_(cvt_<res_type>(src)), dst, globPtr<uchar>(mask), stream);
     }
 
     template <typename T, typename R, int cn>
-    cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
+    void sumSqrImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
     {
         typedef typename MakeVec<T, cn>::type src_type;
         typedef typename MakeVec<R, cn>::type res_type;
 
         const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
-        GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
+        GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
 
         if (mask.empty())
-            gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
+            gridCalcSum(sqr_(cvt_<res_type>(src)), dst, stream);
         else
-            gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
-
-        cv::Scalar_<R> res;
-        cv::Mat res_mat(buf.size(), buf.type(), res.val);
-        buf.download(res_mat);
-
-        return res;
+            gridCalcSum(sqr_(cvt_<res_type>(src)), dst, globPtr<uchar>(mask), stream);
     }
 }
 
-cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
+void cv::cuda::calcSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
 {
-    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
     static const func_t funcs[7][4] =
     {
-        {sumImpl<uchar , uint  , 1>, sumImpl<uchar , uint  , 2>, sumImpl<uchar , uint  , 3>, sumImpl<uchar , uint  , 4>},
-        {sumImpl<schar , int   , 1>, sumImpl<schar , int   , 2>, sumImpl<schar , int   , 3>, sumImpl<schar , int   , 4>},
-        {sumImpl<ushort, uint  , 1>, sumImpl<ushort, uint  , 2>, sumImpl<ushort, uint  , 3>, sumImpl<ushort, uint  , 4>},
-        {sumImpl<short , int   , 1>, sumImpl<short , int   , 2>, sumImpl<short , int   , 3>, sumImpl<short , int   , 4>},
-        {sumImpl<int   , int   , 1>, sumImpl<int   , int   , 2>, sumImpl<int   , int   , 3>, sumImpl<int   , int   , 4>},
-        {sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
+        {sumImpl<uchar , double, 1>, sumImpl<uchar , double, 2>, sumImpl<uchar , double, 3>, sumImpl<uchar , double, 4>},
+        {sumImpl<schar , double, 1>, sumImpl<schar , double, 2>, sumImpl<schar , double, 3>, sumImpl<schar , double, 4>},
+        {sumImpl<ushort, double, 1>, sumImpl<ushort, double, 2>, sumImpl<ushort, double, 3>, sumImpl<ushort, double, 4>},
+        {sumImpl<short , double, 1>, sumImpl<short , double, 2>, sumImpl<short , double, 3>, sumImpl<short , double, 4>},
+        {sumImpl<int   , double, 1>, sumImpl<int   , double, 2>, sumImpl<int   , double, 3>, sumImpl<int   , double, 4>},
+        {sumImpl<float , double, 1>, sumImpl<float , double, 2>, sumImpl<float , double, 3>, sumImpl<float , double, 4>},
         {sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
     };
 
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
 
-    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    const func_t func = funcs[src.depth()][src.channels() - 1];
+    const int src_depth = src.depth();
+    const int channels = src.channels();
 
-    return func(src, mask, buf);
+    GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
+
+    const func_t func = funcs[src_depth][channels - 1];
+    func(src, dst, mask, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
-cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
+cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask)
 {
-    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    calcSum(_src, dst, _mask, stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
+
+    return val;
+}
+
+void cv::cuda::calcAbsSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
     static const func_t funcs[7][4] =
     {
-        {sumAbsImpl<uchar , uint  , 1>, sumAbsImpl<uchar , uint  , 2>, sumAbsImpl<uchar , uint  , 3>, sumAbsImpl<uchar , uint  , 4>},
-        {sumAbsImpl<schar , int   , 1>, sumAbsImpl<schar , int   , 2>, sumAbsImpl<schar , int   , 3>, sumAbsImpl<schar , int   , 4>},
-        {sumAbsImpl<ushort, uint  , 1>, sumAbsImpl<ushort, uint  , 2>, sumAbsImpl<ushort, uint  , 3>, sumAbsImpl<ushort, uint  , 4>},
-        {sumAbsImpl<short , int   , 1>, sumAbsImpl<short , int   , 2>, sumAbsImpl<short , int   , 3>, sumAbsImpl<short , int   , 4>},
-        {sumAbsImpl<int   , int   , 1>, sumAbsImpl<int   , int   , 2>, sumAbsImpl<int   , int   , 3>, sumAbsImpl<int   , int   , 4>},
-        {sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
+        {sumAbsImpl<uchar , double, 1>, sumAbsImpl<uchar , double, 2>, sumAbsImpl<uchar , double, 3>, sumAbsImpl<uchar , double, 4>},
+        {sumAbsImpl<schar , double, 1>, sumAbsImpl<schar , double, 2>, sumAbsImpl<schar , double, 3>, sumAbsImpl<schar , double, 4>},
+        {sumAbsImpl<ushort, double, 1>, sumAbsImpl<ushort, double, 2>, sumAbsImpl<ushort, double, 3>, sumAbsImpl<ushort, double, 4>},
+        {sumAbsImpl<short , double, 1>, sumAbsImpl<short , double, 2>, sumAbsImpl<short , double, 3>, sumAbsImpl<short , double, 4>},
+        {sumAbsImpl<int   , double, 1>, sumAbsImpl<int   , double, 2>, sumAbsImpl<int   , double, 3>, sumAbsImpl<int   , double, 4>},
+        {sumAbsImpl<float , double, 1>, sumAbsImpl<float , double, 2>, sumAbsImpl<float , double, 3>, sumAbsImpl<float , double, 4>},
         {sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
     };
 
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
 
-    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    const func_t func = funcs[src.depth()][src.channels() - 1];
+    const int src_depth = src.depth();
+    const int channels = src.channels();
 
-    return func(src, mask, buf);
+    GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
+
+    const func_t func = funcs[src_depth][channels - 1];
+    func(src, dst, mask, stream);
+
+    syncOutput(dst, _dst, stream);
 }
 
-cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
+cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask)
 {
-    typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    calcAbsSum(_src, dst, _mask, stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
+
+    return val;
+}
+
+void cv::cuda::calcSqrSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
     static const func_t funcs[7][4] =
     {
         {sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
@@ -181,14 +208,35 @@ cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
         {sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
     };
 
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+    const GpuMat src = getInputMat(_src, stream);
+    const GpuMat mask = getInputMat(_mask, stream);
 
-    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    const func_t func = funcs[src.depth()][src.channels() - 1];
+    const int src_depth = src.depth();
+    const int channels = src.channels();
 
-    return func(src, mask, buf);
+    GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
+
+    const func_t func = funcs[src_depth][channels - 1];
+    func(src, dst, mask, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask)
+{
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    calcSqrSum(_src, dst, _mask, stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
+
+    return val;
 }
 
 #endif
diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 21665cbe73..a5b8f07ce3 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -52,6 +52,8 @@
 #include "opencv2/cudev.hpp"
 #include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 namespace
@@ -95,15 +97,14 @@ namespace
 
 double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     const int depth = src.depth();
 
-    CV_DbgAssert( src.channels() == 1 && depth <= CV_64F );
-    CV_DbgAssert( type <= 4 /*THRESH_TOZERO_INV*/ );
+    CV_Assert( src.channels() == 1 && depth <= CV_64F );
+    CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ );
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
     if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/)
     {
@@ -142,6 +143,8 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
         funcs[depth](src, dst, thresh, maxVal, type, stream);
     }
 
+    syncOutput(dst, _dst, stream);
+
     return thresh;
 }
 
diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu
index aa85004425..bfe50bd34f 100644
--- a/modules/cudaarithm/src/cuda/transpose.cu
+++ b/modules/cudaarithm/src/cuda/transpose.cu
@@ -52,18 +52,19 @@
 #include "opencv2/cudev.hpp"
 #include "opencv2/core/private.cuda.hpp"
 
+using namespace cv;
+using namespace cv::cuda;
 using namespace cv::cudev;
 
 void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     const size_t elemSize = src.elemSize();
 
     CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
 
-    _dst.create( src.cols, src.rows, src.type() );
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
 
     if (elemSize == 1)
     {
@@ -87,6 +88,8 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
     {
         gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
     }
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp
index 795d7ffaa7..f88119502d 100644
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -107,11 +107,11 @@ namespace
 
         GpuMat src1;
         if (!isScalar1)
-            src1 = _src1.getGpuMat();
+            src1 = getInputMat(_src1, stream);
 
         GpuMat src2;
         if (!isScalar2)
-            src2 = _src2.getGpuMat();
+            src2 = getInputMat(_src2, stream);
 
         Mat scalar;
         if (isScalar1)
@@ -126,7 +126,7 @@ namespace
             scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
         }
 
-        GpuMat mask = _mask.getGpuMat();
+        GpuMat mask = getInputMat(_mask, stream);
 
         const int sdepth = src1.empty() ? src2.depth() : src1.depth();
         const int cn = src1.empty() ? src2.channels() : src1.channels();
@@ -147,8 +147,7 @@ namespace
                 CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double");
         }
 
-        _dst.create(size, CV_MAKE_TYPE(ddepth, cn));
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(ddepth, cn), stream);
 
         if (isScalar1)
             mat_scalar_func(src2, val, true, dst, mask, scale, stream, op);
@@ -156,6 +155,8 @@ namespace
             mat_scalar_func(src1, val, false, dst, mask, scale, stream, op);
         else
             mat_mat_func(src1, src2, dst, mask, scale, stream, op);
+
+        syncOutput(dst, _dst, stream);
     }
 }
 
@@ -196,27 +197,29 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do
 {
     if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1)
     {
-        GpuMat src1 = _src1.getGpuMat();
-        GpuMat src2 = _src2.getGpuMat();
+        GpuMat src1 = getInputMat(_src1, stream);
+        GpuMat src2 = getInputMat(_src2, stream);
 
         CV_Assert( src1.size() == src2.size() );
 
-        _dst.create(src1.size(), src1.type());
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
 
         mulMat_8uc4_32f(src1, src2, dst, stream);
+
+        syncOutput(dst, _dst, stream);
     }
     else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
     {
-        GpuMat src1 = _src1.getGpuMat();
-        GpuMat src2 = _src2.getGpuMat();
+        GpuMat src1 = getInputMat(_src1, stream);
+        GpuMat src2 = getInputMat(_src2, stream);
 
         CV_Assert( src1.size() == src2.size() );
 
-        _dst.create(src1.size(), src1.type());
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
 
         mulMat_16sc4_32f(src1, src2, dst, stream);
+
+        syncOutput(dst, _dst, stream);
     }
     else
     {
@@ -237,27 +240,29 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub
 {
     if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1)
     {
-        GpuMat src1 = _src1.getGpuMat();
-        GpuMat src2 = _src2.getGpuMat();
+        GpuMat src1 = getInputMat(_src1, stream);
+        GpuMat src2 = getInputMat(_src2, stream);
 
         CV_Assert( src1.size() == src2.size() );
 
-        _dst.create(src1.size(), src1.type());
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
 
         divMat_8uc4_32f(src1, src2, dst, stream);
+
+        syncOutput(dst, _dst, stream);
     }
     else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
     {
-        GpuMat src1 = _src1.getGpuMat();
-        GpuMat src2 = _src2.getGpuMat();
+        GpuMat src1 = getInputMat(_src1, stream);
+        GpuMat src2 = getInputMat(_src2, stream);
 
         CV_Assert( src1.size() == src2.size() );
 
-        _dst.create(src1.size(), src1.type());
-        GpuMat dst = _dst.getGpuMat();
+        GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
 
         divMat_16sc4_32f(src1, src2, dst, stream);
+
+        syncOutput(dst, _dst, stream);
     }
     else
     {
@@ -389,15 +394,16 @@ void cv::cuda::rshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
         {NppShift<CV_32S, 1, nppiRShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiRShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiRShiftC_32s_C4R>::call},
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.depth() < CV_32F );
     CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
     funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream& stream)
@@ -412,15 +418,16 @@ void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
         {NppShift<CV_32S, 1, nppiLShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R>::call},
     };
 
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S );
     CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
 
-    _dst.create(src.size(), src.type());
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
 
     funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
+
+    syncOutput(dst, _dst, stream);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -475,22 +482,24 @@ namespace
 
 void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    _dst.create(src.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
 
     npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
+
+    syncOutput(dst, _dst, stream);
 }
 
 void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    GpuMat src = getInputMat(_src, stream);
 
-    _dst.create(src.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
 
     npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
+
+    syncOutput(dst, _dst, stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp
index c1e2af4ed3..8d0add4537 100644
--- a/modules/cudaarithm/src/reductions.cpp
+++ b/modules/cudaarithm/src/reductions.cpp
@@ -47,110 +47,106 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-double cv::cuda::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::cuda::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }
+double cv::cuda::norm(InputArray, int, InputArray) { throw_no_cuda(); return 0.0; }
+void cv::cuda::calcNorm(InputArray, OutputArray, int, InputArray, Stream&) { throw_no_cuda(); }
+double cv::cuda::norm(InputArray, InputArray, int) { throw_no_cuda(); return 0.0; }
+void cv::cuda::calcNormDiff(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-Scalar cv::cuda::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::cuda::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::cuda::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::cuda::sum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
+void cv::cuda::calcSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+Scalar cv::cuda::absSum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
+void cv::cuda::calcAbsSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+Scalar cv::cuda::sqrSum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
+void cv::cuda::calcSqrSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::cuda::minMax(InputArray, double*, double*, InputArray) { throw_no_cuda(); }
+void cv::cuda::findMinMax(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray) { throw_no_cuda(); }
+void cv::cuda::findMinMaxLoc(InputArray, OutputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
 
-int cv::cuda::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }
+int cv::cuda::countNonZero(InputArray) { throw_no_cuda(); return 0; }
+void cv::cuda::countNonZero(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::cuda::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
+void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&) { throw_no_cuda(); }
+void cv::cuda::meanStdDev(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 void cv::cuda::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::cuda::integral(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::sqrIntegral(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
 #else
 
-namespace
-{
-    class DeviceBuffer
-    {
-    public:
-        explicit DeviceBuffer(int count_ = 1) : count(count_)
-        {
-            cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) );
-        }
-        ~DeviceBuffer()
-        {
-            cudaSafeCall( cudaFree(pdev) );
-        }
-
-        operator double*() {return pdev;}
-
-        void download(double* hptr)
-        {
-            double hbuf;
-            cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) );
-            *hptr = hbuf;
-        }
-        void download(double** hptrs)
-        {
-            AutoBuffer<double, 2 * sizeof(double)> hbuf(count);
-            cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) );
-            for (int i = 0; i < count; ++i)
-                *hptrs[i] = hbuf[i];
-        }
-
-    private:
-        double* pdev;
-        int count;
-    };
-}
-
 ////////////////////////////////////////////////////////////////////////
 // norm
 
-double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
-{
-    GpuMat src = _src.getGpuMat();
-    GpuMat mask = _mask.getGpuMat();
+namespace cv { namespace cuda { namespace internal {
 
+void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
+
+void findMaxAbs(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
+
+}}}
+
+void cv::cuda::calcNorm(InputArray _src, OutputArray dst, int normType, InputArray mask, Stream& stream)
+{
     CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );
+
+    GpuMat src = getInputMat(_src, stream);
 
     GpuMat src_single_channel = src.reshape(1);
 
     if (normType == NORM_L1)
-        return cuda::absSum(src_single_channel, mask, buf)[0];
+    {
+        calcAbsSum(src_single_channel, dst, mask, stream);
+    }
+    else if (normType == NORM_L2)
+    {
+        internal::normL2(src_single_channel, dst, mask, stream);
+    }
+    else // NORM_INF
+    {
+        internal::findMaxAbs(src_single_channel, dst, mask, stream);
+    }
+}
 
-    if (normType == NORM_L2)
-        return std::sqrt(cuda::sqrSum(src_single_channel, mask, buf)[0]);
+double cv::cuda::norm(InputArray _src, int normType, InputArray _mask)
+{
+    Stream& stream = Stream::Null();
 
-    // NORM_INF
-    double min_val, max_val;
-    cuda::minMax(src_single_channel, &min_val, &max_val, mask, buf);
-    return std::max(std::abs(min_val), std::abs(max_val));
+    HostMem dst;
+    calcNorm(_src, dst, normType, _mask, stream);
+
+    stream.waitForCompletion();
+
+    double val;
+    dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F);
+
+    return val;
 }
 
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev
 
-void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
+void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
 {
-    GpuMat src = _src.getGpuMat();
+    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
+        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
+
+    const GpuMat src = getInputMat(_src, stream);
 
     CV_Assert( src.type() == CV_8UC1 );
 
-    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
-        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
+    GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
 
     NppiSize sz;
     sz.width  = src.cols;
     sz.height = src.rows;
 
-    DeviceBuffer dbuf(2);
-
     int bufSize;
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
@@ -158,14 +154,30 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat&
     nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
 #endif
 
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
+    BufferPool pool(stream);
+    GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1);
 
-    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
+    NppStreamHandler h(StreamAccessor::getStream(stream));
 
-    cudaSafeCall( cudaDeviceSynchronize() );
+    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );
 
-    double* ptrs[2] = {mean.val, stddev.val};
-    dbuf.download(ptrs);
+    syncOutput(dst, _dst, stream);
+}
+
+void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
+{
+    Stream& stream = Stream::Null();
+
+    HostMem dst;
+    meanStdDev(_src, dst, stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    dst.createMatHeader().copyTo(Mat(1, 2, CV_64FC1, &vals[0]));
+
+    mean = Scalar(vals[0]);
+    stddev = Scalar(vals[1]);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -173,13 +185,12 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat&
 
 void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
 {
-    GpuMat src = _src.getGpuMat();
-    GpuMat sqr = _sqr.getGpuMat();
+    GpuMat src = getInputMat(_src, _stream);
+    GpuMat sqr = getInputMat(_sqr, _stream);
 
     CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
 
-    _dst.create(src.size(), CV_32FC1);
-    GpuMat dst = _dst.getGpuMat();
+    GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, _stream);
 
     NppiSize sz;
     sz.width = src.cols;
@@ -200,45 +211,8 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
-}
 
-////////////////////////////////////////////////////////////////////////
-// normalize
-
-void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
-    GpuMat src = _src.getGpuMat();
-
-    double scale = 1, shift = 0;
-
-    if (norm_type == NORM_MINMAX)
-    {
-        double smin = 0, smax = 0;
-        double dmin = std::min(a, b), dmax = std::max(a, b);
-        cuda::minMax(src, &smin, &smax, mask, norm_buf);
-        scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
-        shift = dmin - smin * scale;
-    }
-    else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
-    {
-        scale = cuda::norm(src, norm_type, mask, norm_buf);
-        scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
-        shift = 0;
-    }
-    else
-    {
-        CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
-    }
-
-    if (mask.empty())
-    {
-        src.convertTo(dst, dtype, scale, shift);
-    }
-    else
-    {
-        src.convertTo(cvt_buf, dtype, scale, shift);
-        cvt_buf.copyTo(dst, mask);
-    }
+    syncOutput(dst, _dst, _stream);
 }
 
 #endif
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index 4a43d9d306..a4a16ea89f 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -1329,7 +1329,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy)
         try
         {
             cv::cuda::GpuMat dst;
-            cv::cuda::divide(scale, loadMat(mat), dst, depth.second);
+            cv::cuda::divide(scale, loadMat(mat), dst, 1.0, depth.second);
         }
         catch (const cv::Exception& e)
         {
@@ -1339,7 +1339,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy)
     else
     {
         cv::cuda::GpuMat dst = createMat(size, depth.second, useRoi);
-        cv::cuda::divide(scale, loadMat(mat, useRoi), dst, depth.second);
+        cv::cuda::divide(scale, loadMat(mat, useRoi), dst, 1.0, depth.second);
 
         cv::Mat dst_gold;
         cv::divide(scale, mat, dst_gold, depth.second);
diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp
index e3c54055df..a95d007b81 100644
--- a/modules/cudaarithm/test/test_reductions.cpp
+++ b/modules/cudaarithm/test/test_reductions.cpp
@@ -74,8 +74,27 @@ CUDA_TEST_P(Norm, Accuracy)
     cv::Mat src = randomMat(size, depth);
     cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
 
-    cv::cuda::GpuMat d_buf;
-    double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf);
+    double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi));
+
+    double val_gold = cv::norm(src, normCode, mask);
+
+    EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
+}
+
+CUDA_TEST_P(Norm, Async)
+{
+    cv::Mat src = randomMat(size, depth);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::calcNorm(loadMat(src, useRoi), dst, normCode, loadMat(mask, useRoi), stream);
+
+    stream.waitForCompletion();
+
+    double val;
+    dst.createMatHeader().convertTo(cv::Mat(1, 1, CV_64FC1, &val), CV_64F);
 
     double val_gold = cv::norm(src, normCode, mask);
 
@@ -127,6 +146,27 @@ CUDA_TEST_P(NormDiff, Accuracy)
     EXPECT_NEAR(val_gold, val, 0.0);
 }
 
+CUDA_TEST_P(NormDiff, Async)
+{
+    cv::Mat src1 = randomMat(size, CV_8UC1);
+    cv::Mat src2 = randomMat(size, CV_8UC1);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::calcNormDiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, normCode, stream);
+
+    stream.waitForCompletion();
+
+    double val;
+    const cv::Mat val_mat(1, 1, CV_64FC1, &val);
+    dst.createMatHeader().convertTo(val_mat, CV_64F);
+
+    double val_gold = cv::norm(src1, src2, normCode);
+
+    EXPECT_NEAR(val_gold, val, 0.0);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, NormDiff, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -247,6 +287,24 @@ CUDA_TEST_P(Sum, Simple)
     EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
 }
 
+CUDA_TEST_P(Sum, Simple_Async)
+{
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::calcSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
+    dst.createMatHeader().convertTo(val_mat, CV_64F);
+
+    cv::Scalar val_gold = cv::sum(src);
+
+    EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
+}
+
 CUDA_TEST_P(Sum, Abs)
 {
     cv::Scalar val = cv::cuda::absSum(loadMat(src, useRoi));
@@ -256,6 +314,24 @@ CUDA_TEST_P(Sum, Abs)
     EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
 }
 
+CUDA_TEST_P(Sum, Abs_Async)
+{
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::calcAbsSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
+    dst.createMatHeader().convertTo(val_mat, CV_64F);
+
+    cv::Scalar val_gold = absSumGold(src);
+
+    EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
+}
+
 CUDA_TEST_P(Sum, Sqr)
 {
     cv::Scalar val = cv::cuda::sqrSum(loadMat(src, useRoi));
@@ -265,6 +341,24 @@ CUDA_TEST_P(Sum, Sqr)
     EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
 }
 
+CUDA_TEST_P(Sum, Sqr_Async)
+{
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::calcSqrSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
+
+    stream.waitForCompletion();
+
+    cv::Scalar val;
+    cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
+    dst.createMatHeader().convertTo(val_mat, CV_64F);
+
+    cv::Scalar val_gold = sqrSumGold(src);
+
+    EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Sum, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -321,6 +415,28 @@ CUDA_TEST_P(MinMax, WithoutMask)
     }
 }
 
+CUDA_TEST_P(MinMax, Async)
+{
+    cv::Mat src = randomMat(size, depth);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::findMinMax(loadMat(src, useRoi), dst, cv::noArray(), stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    const cv::Mat vals_mat(1, 2, CV_64FC1, &vals[0]);
+    dst.createMatHeader().convertTo(vals_mat, CV_64F);
+
+    double minVal_gold, maxVal_gold;
+    minMaxLocGold(src, &minVal_gold, &maxVal_gold);
+
+    EXPECT_DOUBLE_EQ(minVal_gold, vals[0]);
+    EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]);
+}
+
 CUDA_TEST_P(MinMax, WithMask)
 {
     cv::Mat src = randomMat(size, depth);
@@ -471,6 +587,41 @@ CUDA_TEST_P(MinMaxLoc, WithoutMask)
     }
 }
 
+CUDA_TEST_P(MinMaxLoc, Async)
+{
+    cv::Mat src = randomMat(size, depth);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem minMaxVals, locVals;
+    cv::cuda::findMinMaxLoc(loadMat(src, useRoi), minMaxVals, locVals, cv::noArray(), stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    const cv::Mat vals_mat(2, 1, CV_64FC1, &vals[0]);
+    minMaxVals.createMatHeader().convertTo(vals_mat, CV_64F);
+
+    int locs[2];
+    const cv::Mat locs_mat(2, 1, CV_32SC1, &locs[0]);
+    locVals.createMatHeader().copyTo(locs_mat);
+
+    cv::Point locs2D[] = {
+        cv::Point(locs[0] % src.cols, locs[0] / src.cols),
+        cv::Point(locs[1] % src.cols, locs[1] / src.cols),
+    };
+
+    double minVal_gold, maxVal_gold;
+    cv::Point minLoc_gold, maxLoc_gold;
+    minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);
+
+    EXPECT_DOUBLE_EQ(minVal_gold, vals[0]);
+    EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]);
+
+    expectEqual(src, minLoc_gold, locs2D[0]);
+    expectEqual(src, maxLoc_gold, locs2D[1]);
+}
+
 CUDA_TEST_P(MinMaxLoc, WithMask)
 {
     cv::Mat src = randomMat(size, depth);
@@ -564,6 +715,7 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
     int depth;
     bool useRoi;
 
+    cv::Mat src;
 
     virtual void SetUp()
     {
@@ -573,15 +725,14 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
         useRoi = GET_PARAM(3);
 
         cv::cuda::setDevice(devInfo.deviceID());
+
+        cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5);
+        srcBase.convertTo(src, depth);
     }
 };
 
 CUDA_TEST_P(CountNonZero, Accuracy)
 {
-    cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5);
-    cv::Mat src;
-    srcBase.convertTo(src, depth);
-
     if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
     {
         try
@@ -603,6 +754,24 @@ CUDA_TEST_P(CountNonZero, Accuracy)
     }
 }
 
+CUDA_TEST_P(CountNonZero, Async)
+{
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::countNonZero(loadMat(src, useRoi), dst, stream);
+
+    stream.waitForCompletion();
+
+    int val;
+    const cv::Mat val_mat(1, 1, CV_32SC1, &val);
+    dst.createMatHeader().copyTo(val_mat);
+
+    int val_gold = cv::countNonZero(src);
+
+    ASSERT_EQ(val_gold, val);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, CountNonZero, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -750,7 +919,7 @@ CUDA_TEST_P(Normalize, WithMask)
     dst_gold.setTo(cv::Scalar::all(0));
     cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
+    EXPECT_MAT_NEAR(dst_gold, dst, type < CV_32F ? 1.0 : 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Normalize, testing::Combine(
@@ -811,6 +980,28 @@ CUDA_TEST_P(MeanStdDev, Accuracy)
     }
 }
 
+CUDA_TEST_P(MeanStdDev, Async)
+{
+    cv::Mat src = randomMat(size, CV_8UC1);
+
+    cv::cuda::Stream stream;
+
+    cv::cuda::HostMem dst;
+    cv::cuda::meanStdDev(loadMat(src, useRoi), dst, stream);
+
+    stream.waitForCompletion();
+
+    double vals[2];
+    dst.createMatHeader().copyTo(cv::Mat(1, 2, CV_64FC1, &vals[0]));
+
+    cv::Scalar mean_gold;
+    cv::Scalar stddev_gold;
+    cv::meanStdDev(src, mean_gold, stddev_gold);
+
+    EXPECT_SCALAR_NEAR(mean_gold, cv::Scalar(vals[0]), 1e-5);
+    EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
diff --git a/modules/cudabgsegm/src/fgd.cpp b/modules/cudabgsegm/src/fgd.cpp
index 68f03a3e16..237f1c05fa 100644
--- a/modules/cudabgsegm/src/fgd.cpp
+++ b/modules/cudabgsegm/src/fgd.cpp
@@ -266,7 +266,7 @@ namespace
 {
     int bgfgClassification(const GpuMat& prevFrame, const GpuMat& curFrame,
                            const GpuMat& Ftd, const GpuMat& Fbd,
-                           GpuMat& foreground, GpuMat& countBuf,
+                           GpuMat& foreground,
                            const FGDParams& params, int out_cn)
     {
         typedef void (*func_t)(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
@@ -298,7 +298,7 @@ namespace
                                                                              deltaC, deltaCC, params.alpha2,
                                                                              params.N1c, params.N1cc, 0);
 
-        int count = cuda::countNonZero(foreground, countBuf);
+        int count = cuda::countNonZero(foreground);
 
         cuda::multiply(foreground, Scalar::all(255), foreground);
 
@@ -605,8 +605,6 @@ namespace
         GpuMat hist_;
         GpuMat histBuf_;
 
-        GpuMat countBuf_;
-
         GpuMat buf_;
         GpuMat filterBrd_;
 
@@ -649,7 +647,7 @@ namespace
         changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
         changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);
 
-        int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, 4);
+        int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, params_, 4);
 
 #ifdef HAVE_OPENCV_CUDAFILTERS
         if (params_.perform_morphing > 0)
diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
index f61d2dfd00..1d7f4e4e43 100644
--- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
+++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
@@ -48,6 +48,7 @@
 #endif
 
 #include "opencv2/core/cuda.hpp"
+#include "opencv2/features2d.hpp"
 #include "opencv2/cudafilters.hpp"
 
 /**
@@ -62,262 +63,396 @@ namespace cv { namespace cuda {
 //! @addtogroup cudafeatures2d
 //! @{
 
-/** @brief Brute-force descriptor matcher.
+//
+// DescriptorMatcher
+//
 
-For each descriptor in the first set, this matcher finds the closest descriptor in the second set
-by trying each one. This descriptor matcher supports masking permissible matches between descriptor
-sets.
+/** @brief Abstract base class for matching keypoint descriptors.
 
-The class BFMatcher_CUDA has an interface similar to the class DescriptorMatcher. It has two groups
-of match methods: for matching descriptors of one image with another image or with an image set.
-Also, all functions have an alternative to save results either to the GPU memory or to the CPU
-memory.
-
-@sa DescriptorMatcher, BFMatcher
+It has two groups of match methods: for matching descriptors of an image with another image or with
+an image set.
  */
-class CV_EXPORTS BFMatcher_CUDA
+class CV_EXPORTS DescriptorMatcher : public cv::Algorithm
 {
 public:
-    explicit BFMatcher_CUDA(int norm = cv::NORM_L2);
+    //
+    // Factories
+    //
 
-    //! Add descriptors to train descriptor collection
-    void add(const std::vector<GpuMat>& descCollection);
+    /** @brief Brute-force descriptor matcher.
 
-    //! Get train descriptors collection
-    const std::vector<GpuMat>& getTrainDescriptors() const;
+    For each descriptor in the first set, this matcher finds the closest descriptor in the second set
+    by trying each one. This descriptor matcher supports masking permissible matches of descriptor
+    sets.
 
-    //! Clear train descriptors collection
-    void clear();
+    @param normType One of NORM_L1, NORM_L2, NORM_HAMMING. L1 and L2 norms are
+    preferable choices for SIFT and SURF descriptors; NORM_HAMMING should be used with ORB, BRISK and
+    BRIEF.
+     */
+    static Ptr<DescriptorMatcher> createBFMatcher(int normType = cv::NORM_L2);
 
-    //! Return true if there are not train descriptors in collection
-    bool empty() const;
+    //
+    // Utility
+    //
 
-    //! Return true if the matcher supports mask in match methods
-    bool isMaskSupported() const;
+    /** @brief Returns true if the descriptor matcher supports masking permissible matches.
+     */
+    virtual bool isMaskSupported() const = 0;
 
-    //! Find one best match for each query descriptor
-    void matchSingle(const GpuMat& query, const GpuMat& train,
-        GpuMat& trainIdx, GpuMat& distance,
-        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
+    //
+    // Descriptor collection
+    //
 
-    //! Download trainIdx and distance and convert it to CPU vector with DMatch
-    static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
-    //! Convert trainIdx and distance to vector with DMatch
-    static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
+    /** @brief Adds descriptors to train a descriptor collection.
 
-    //! Find one best match for each query descriptor
-    void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
+    If the collection is not empty, the new descriptors are added to existing train descriptors.
 
-    //! Make gpu collection of trains and masks in suitable format for matchCollection function
-    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+    @param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same
+    train image.
+     */
+    virtual void add(const std::vector<GpuMat>& descriptors) = 0;
 
-    //! Find one best match from train collection for each query descriptor
-    void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
-        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-        const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());
+    /** @brief Returns a constant reference to the train descriptor collection.
+     */
+    virtual const std::vector<GpuMat>& getTrainDescriptors() const = 0;
 
-    //! Download trainIdx, imgIdx and distance and convert it to vector with DMatch
-    static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
-    //! Convert trainIdx, imgIdx and distance to vector with DMatch
-    static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
+    /** @brief Clears the train descriptor collection.
+     */
+    virtual void clear() = 0;
 
-    //! Find one best match from train collection for each query descriptor.
-    void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+    /** @brief Returns true if there are no train descriptors in the collection.
+     */
+    virtual bool empty() const = 0;
 
-    //! Find k best matches for each query descriptor (in increasing order of distances)
-    void knnMatchSingle(const GpuMat& query, const GpuMat& train,
-        GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
-        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
+    /** @brief Trains a descriptor matcher.
 
-    //! Download trainIdx and distance and convert it to vector with DMatch
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    //! Convert trainIdx and distance to vector with DMatch
-    static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+    Trains a descriptor matcher (for example, the flann index). In all methods to match, the method
+    train() is run every time before matching.
+     */
+    virtual void train() = 0;
 
-    //! Find k best matches for each query descriptor (in increasing order of distances).
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& query, const GpuMat& train,
-        std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
-        bool compactResult = false);
+    //
+    // 1 to 1 match
+    //
 
-    //! Find k best matches from train collection for each query descriptor (in increasing order of distances)
-    void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
-        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-        const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
+    /** @brief Finds the best match for each descriptor from a query set (blocking version).
 
-    //! Download trainIdx and distance and convert it to vector with DMatch
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    //! @see BFMatcher_CUDA::knnMatchDownload
-    static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    //! Convert trainIdx and distance to vector with DMatch
-    //! @see BFMatcher_CUDA::knnMatchConvert
-    static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches. If a query descriptor is masked out in mask, no match is added for this
+    descriptor. So, matches size may be smaller than the query descriptors count.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
 
-    //! Find k best matches  for each query descriptor (in increasing order of distances).
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
-        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
+    In the first variant of this method, the train descriptors are passed as an input argument. In the
+    second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
+    used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
+    matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
+    mask.at\<uchar\>(i,j) is non-zero.
+     */
+    virtual void match(InputArray queryDescriptors, InputArray trainDescriptors,
+                       std::vector<DMatch>& matches,
+                       InputArray mask = noArray()) = 0;
 
-    //! Find best matches for each query descriptor which have distance less than maxDistance.
-    //! nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
-    //! carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
-    //! because it didn't have enough memory.
-    //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
-    //! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-    //! Matches doesn't sorted.
-    void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
-        GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
-        const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
+    /** @overload
+     */
+    virtual void match(InputArray queryDescriptors,
+                       std::vector<DMatch>& matches,
+                       const std::vector<GpuMat>& masks = std::vector<GpuMat>()) = 0;
 
-    //! Download trainIdx, nMatches and distance and convert it to vector with DMatch.
-    //! matches will be sorted in increasing order of distances.
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    //! Convert trainIdx, nMatches and distance to vector with DMatch.
-    static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+    /** @brief Finds the best match for each descriptor from a query set (asynchronous version).
 
-    //! Find best matches for each query descriptor which have distance less than maxDistance
-    //! in increasing order of distances).
-    void radiusMatch(const GpuMat& query, const GpuMat& train,
-        std::vector< std::vector<DMatch> >& matches, float maxDistance,
-        const GpuMat& mask = GpuMat(), bool compactResult = false);
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches array stored in GPU memory. Internal representation is not defined.
+    Use DescriptorMatcher::matchConvert method to retrieve results in standard representation.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param stream CUDA stream.
 
-    //! Find best matches for each query descriptor which have distance less than maxDistance.
-    //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
-    //! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-    //! Matches doesn't sorted.
-    void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
-        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());
+    In the first variant of this method, the train descriptors are passed as an input argument. In the
+    second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
+    used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
+    matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
+    mask.at\<uchar\>(i,j) is non-zero.
+     */
+    virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
+                            OutputArray matches,
+                            InputArray mask = noArray(),
+                            Stream& stream = Stream::Null()) = 0;
 
-    //! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
-    //! matches will be sorted in increasing order of distances.
-    //! compactResult is used when mask is not empty. If compactResult is false matches
-    //! vector will have the same size as queryDescriptors rows. If compactResult is true
-    //! matches vector will not contain matches for fully masked out query descriptors.
-    static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    //! Convert trainIdx, nMatches and distance to vector with DMatch.
-    static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
-        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+    /** @overload
+     */
+    virtual void matchAsync(InputArray queryDescriptors,
+                            OutputArray matches,
+                            const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                            Stream& stream = Stream::Null()) = 0;
 
-    //! Find best matches from train collection for each query descriptor which have distance less than
-    //! maxDistance (in increasing order of distances).
-    void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
-        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
+    /** @brief Converts matches array from internal representation to standard matches vector.
 
-    int norm;
+    The method is supposed to be used with DescriptorMatcher::matchAsync to get final result.
+    Call this method only after DescriptorMatcher::matchAsync is completed (i.e. after synchronization).
 
-private:
-    std::vector<GpuMat> trainDescCollection;
+    @param gpu_matches Matches, returned from DescriptorMatcher::matchAsync.
+    @param matches Vector of DMatch objects.
+     */
+    virtual void matchConvert(InputArray gpu_matches,
+                              std::vector<DMatch>& matches) = 0;
+
+    //
+    // knn match
+    //
+
+    /** @brief Finds the k best matches for each descriptor from a query set (blocking version).
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+
+    These extended variants of DescriptorMatcher::match methods find several best matches for each query
+    descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match
+    for the details about query and train descriptors.
+     */
+    virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors,
+                          std::vector<std::vector<DMatch> >& matches,
+                          int k,
+                          InputArray mask = noArray(),
+                          bool compactResult = false) = 0;
+
+    /** @overload
+     */
+    virtual void knnMatch(InputArray queryDescriptors,
+                          std::vector<std::vector<DMatch> >& matches,
+                          int k,
+                          const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                          bool compactResult = false) = 0;
+
+    /** @brief Finds the k best matches for each descriptor from a query set (asynchronous version).
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches array stored in GPU memory. Internal representation is not defined.
+    Use DescriptorMatcher::knnMatchConvert method to retrieve results in standard representation.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param stream CUDA stream.
+
+    These extended variants of DescriptorMatcher::matchAsync methods find several best matches for each query
+    descriptor. The matches are returned in order of increasing distance. See DescriptorMatcher::matchAsync
+    for the details about query and train descriptors.
+     */
+    virtual void knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
+                               OutputArray matches,
+                               int k,
+                               InputArray mask = noArray(),
+                               Stream& stream = Stream::Null()) = 0;
+
+    /** @overload
+     */
+    virtual void knnMatchAsync(InputArray queryDescriptors,
+                               OutputArray matches,
+                               int k,
+                               const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                               Stream& stream = Stream::Null()) = 0;
+
+    /** @brief Converts matches array from internal representation to standard matches vector.
+
+    The method is supposed to be used with DescriptorMatcher::knnMatchAsync to get final result.
+    Call this method only after DescriptorMatcher::knnMatchAsync is completed (i.e. after synchronization).
+
+    @param gpu_matches Matches, returned from DescriptorMatcher::knnMatchAsync.
+    @param matches Vector of DMatch objects.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+     */
+    virtual void knnMatchConvert(InputArray gpu_matches,
+                                 std::vector< std::vector<DMatch> >& matches,
+                                 bool compactResult = false) = 0;
+
+    //
+    // radius match
+    //
+
+    /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (blocking version).
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Found matches.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance here means
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in pixels).
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+
+    For each query descriptor, the methods find the training descriptors whose distance to the query
+    descriptor is less than or equal to maxDistance. Found matches are returned in order of
+    increasing distance.
+     */
+    virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors,
+                             std::vector<std::vector<DMatch> >& matches,
+                             float maxDistance,
+                             InputArray mask = noArray(),
+                             bool compactResult = false) = 0;
+
+    /** @overload
+     */
+    virtual void radiusMatch(InputArray queryDescriptors,
+                             std::vector<std::vector<DMatch> >& matches,
+                             float maxDistance,
+                             const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                             bool compactResult = false) = 0;
+
+    /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (asynchronous version).
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches array stored in GPU memory. Internal representation is not defined.
+    Use DescriptorMatcher::radiusMatchConvert method to retrieve results in standard representation.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance here means
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in pixels).
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param stream CUDA stream.
+
+    For each query descriptor, the methods find the training descriptors whose distance to the query
+    descriptor is less than or equal to maxDistance. Found matches are returned in order of
+    increasing distance.
+     */
+    virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
+                                  OutputArray matches,
+                                  float maxDistance,
+                                  InputArray mask = noArray(),
+                                  Stream& stream = Stream::Null()) = 0;
+
+    /** @overload
+     */
+    virtual void radiusMatchAsync(InputArray queryDescriptors,
+                                  OutputArray matches,
+                                  float maxDistance,
+                                  const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                                  Stream& stream = Stream::Null()) = 0;
+
+    /** @brief Converts matches array from internal representation to standard matches vector.
+
+    The method is supposed to be used with DescriptorMatcher::radiusMatchAsync to get final result.
+    Call this method only after DescriptorMatcher::radiusMatchAsync is completed (i.e. after synchronization).
+
+    @param gpu_matches Matches, returned from DescriptorMatcher::radiusMatchAsync.
+    @param matches Vector of DMatch objects.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+     */
+    virtual void radiusMatchConvert(InputArray gpu_matches,
+                                    std::vector< std::vector<DMatch> >& matches,
+                                    bool compactResult = false) = 0;
 };
 
-/** @brief Class used for corner detection using the FAST algorithm. :
+//
+// Feature2DAsync
+//
+
+/** @brief Abstract base class for CUDA asynchronous 2D image feature detectors and descriptor extractors.
  */
-class CV_EXPORTS FAST_CUDA
+class CV_EXPORTS Feature2DAsync
+{
+public:
+    virtual ~Feature2DAsync();
+
+    /** @brief Detects keypoints in an image.
+
+    @param image Image.
+    @param keypoints The detected keypoints.
+    @param mask Mask specifying where to look for keypoints (optional). It must be an 8-bit integer
+    matrix with non-zero values in the region of interest.
+    @param stream CUDA stream.
+     */
+    virtual void detectAsync(InputArray image,
+                             OutputArray keypoints,
+                             InputArray mask = noArray(),
+                             Stream& stream = Stream::Null());
+
+    /** @brief Computes the descriptors for a set of keypoints detected in an image.
+
+    @param image Image.
+    @param keypoints Input collection of keypoints.
+    @param descriptors Computed descriptors. Row j is the descriptor for j-th keypoint.
+    @param stream CUDA stream.
+     */
+    virtual void computeAsync(InputArray image,
+                              OutputArray keypoints,
+                              OutputArray descriptors,
+                              Stream& stream = Stream::Null());
+
+    /** Detects keypoints and computes the descriptors. */
+    virtual void detectAndComputeAsync(InputArray image,
+                                       InputArray mask,
+                                       OutputArray keypoints,
+                                       OutputArray descriptors,
+                                       bool useProvidedKeypoints = false,
+                                       Stream& stream = Stream::Null());
+
+    /** Converts keypoints array from internal representation to standard vector. */
+    virtual void convert(InputArray gpu_keypoints,
+                         std::vector<KeyPoint>& keypoints) = 0;
+};
+
+//
+// FastFeatureDetector
+//
+
+/** @brief Wrapping class for feature detection using the FAST method.
+ */
+class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync
 {
 public:
     enum
     {
         LOCATION_ROW = 0,
         RESPONSE_ROW,
-        ROWS_COUNT
+        ROWS_COUNT,
+
+        FEATURE_SIZE = 7
     };
 
-    //! all features have same size
-    static const int FEATURE_SIZE = 7;
+    static Ptr<FastFeatureDetector> create(int threshold=10,
+                                           bool nonmaxSuppression=true,
+                                           int type=FastFeatureDetector::TYPE_9_16,
+                                           int max_npoints = 5000);
 
-    /** @brief Constructor.
-
-    @param threshold Threshold on difference between intensity of the central pixel and pixels on a
-    circle around this pixel.
-    @param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners
-    (keypoints).
-    @param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \*
-    image_width \* image_height).
-     */
-    explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05);
-
-    /** @brief Finds the keypoints using FAST detector.
-
-    @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
-    supported.
-    @param mask Optional input mask that marks the regions where we should detect features.
-    @param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU
-    memory:
-    -   keypoints.ptr\<Vec2s\>(LOCATION_ROW)[i] will contain location of i'th point
-    -   keypoints.ptr\<float\>(RESPONSE_ROW)[i] will contain response of i'th point (if non-maximum
-    suppression is applied)
-     */
-    void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
-    /** @overload */
-    void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
-
-    /** @brief Download keypoints from GPU to CPU memory.
-    */
-    static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
-
-    /** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
-    */
-    static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
-
-    /** @brief Releases inner buffer memory.
-    */
-    void release();
-
-    bool nonmaxSuppression;
-
-    int threshold;
-
-    //! max keypoints = keypointsRatio * img.size().area()
-    double keypointsRatio;
-
-    /** @brief Find keypoints and compute it's response if nonmaxSuppression is true.
-
-    @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
-    supported.
-    @param mask Optional input mask that marks the regions where we should detect features.
-
-    The function returns count of detected keypoints.
-     */
-    int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
-
-    /** @brief Gets final array of keypoints.
-
-    @param keypoints The output vector of keypoints.
-
-    The function performs non-max suppression if needed and returns final count of keypoints.
-     */
-    int getKeyPoints(GpuMat& keypoints);
-
-private:
-    GpuMat kpLoc_;
-    int count_;
-
-    GpuMat score_;
-
-    GpuMat d_keypoints_;
+    virtual void setMaxNumPoints(int max_npoints) = 0;
+    virtual int getMaxNumPoints() const = 0;
 };
 
-/** @brief Class for extracting ORB features and descriptors from an image. :
+//
+// ORB
+//
+
+/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor
+ *
+ * @sa cv::ORB
  */
-class CV_EXPORTS ORB_CUDA
+class CV_EXPORTS ORB : public cv::ORB, public Feature2DAsync
 {
 public:
     enum
@@ -331,113 +466,20 @@ public:
         ROWS_COUNT
     };
 
-    enum
-    {
-        DEFAULT_FAST_THRESHOLD = 20
-    };
-
-    /** @brief Constructor.
-
-    @param nFeatures The number of desired features.
-    @param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to
-    the next.
-    @param nLevels The number of levels in the scale pyramid.
-    @param edgeThreshold How far from the boundary the points should be.
-    @param firstLevel The level at which the image is given. If 1, that means we will also look at the
-    image scaleFactor times bigger.
-    @param WTA_K
-    @param scoreType
-    @param patchSize
-     */
-    explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31,
-                     int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31);
-
-    /** @overload */
-    void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
-    /** @overload */
-    void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
-
-    /** @brief Detects keypoints and computes descriptors for them.
-
-    @param image Input 8-bit grayscale image.
-    @param mask Optional input mask that marks the regions where we should detect features.
-    @param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory.
-    For GPU memory:
-    -   keypoints.ptr\<float\>(X_ROW)[i] contains x coordinate of the i'th feature.
-    -   keypoints.ptr\<float\>(Y_ROW)[i] contains y coordinate of the i'th feature.
-    -   keypoints.ptr\<float\>(RESPONSE_ROW)[i] contains the response of the i'th feature.
-    -   keypoints.ptr\<float\>(ANGLE_ROW)[i] contains orientation of the i'th feature.
-    -   keypoints.ptr\<float\>(OCTAVE_ROW)[i] contains the octave of the i'th feature.
-    -   keypoints.ptr\<float\>(SIZE_ROW)[i] contains the size of the i'th feature.
-    @param descriptors Computed descriptors. if blurForDescriptor is true, image will be blurred
-    before descriptors calculation.
-     */
-    void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
-    /** @overload */
-    void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
-
-    /** @brief Download keypoints from GPU to CPU memory.
-    */
-    static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
-    /** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
-    */
-    static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
-
-    //! returns the descriptor size in bytes
-    inline int descriptorSize() const { return kBytes; }
-
-    inline void setFastParams(int threshold, bool nonmaxSuppression = true)
-    {
-        fastDetector_.threshold = threshold;
-        fastDetector_.nonmaxSuppression = nonmaxSuppression;
-    }
-
-    /** @brief Releases inner buffer memory.
-    */
-    void release();
+    static Ptr<ORB> create(int nfeatures=500,
+                           float scaleFactor=1.2f,
+                           int nlevels=8,
+                           int edgeThreshold=31,
+                           int firstLevel=0,
+                           int WTA_K=2,
+                           int scoreType=ORB::HARRIS_SCORE,
+                           int patchSize=31,
+                           int fastThreshold=20,
+                           bool blurForDescriptor=false);
 
     //! if true, image will be blurred before descriptors calculation
-    bool blurForDescriptor;
-
-private:
-    enum { kBytes = 32 };
-
-    void buildScalePyramids(const GpuMat& image, const GpuMat& mask);
-
-    void computeKeyPointsPyramid();
-
-    void computeDescriptors(GpuMat& descriptors);
-
-    void mergeKeyPoints(GpuMat& keypoints);
-
-    int nFeatures_;
-    float scaleFactor_;
-    int nLevels_;
-    int edgeThreshold_;
-    int firstLevel_;
-    int WTA_K_;
-    int scoreType_;
-    int patchSize_;
-
-    //! The number of desired features per scale
-    std::vector<size_t> n_features_per_level_;
-
-    //! Points to compute BRIEF descriptors from
-    GpuMat pattern_;
-
-    std::vector<GpuMat> imagePyr_;
-    std::vector<GpuMat> maskPyr_;
-
-    GpuMat buf_;
-
-    std::vector<GpuMat> keyPointsPyr_;
-    std::vector<int> keyPointsCount_;
-
-    FAST_CUDA fastDetector_;
-
-    Ptr<cuda::Filter> blurFilter;
-
-    GpuMat d_keypoints_;
+    virtual void setBlurForDescriptor(bool blurForDescriptor) = 0;
+    virtual bool getBlurForDescriptor() const = 0;
 };
 
 //! @}
diff --git a/modules/cudafeatures2d/perf/perf_features2d.cpp b/modules/cudafeatures2d/perf/perf_features2d.cpp
index 26eb434f44..9d81348164 100644
--- a/modules/cudafeatures2d/perf/perf_features2d.cpp
+++ b/modules/cudafeatures2d/perf/perf_features2d.cpp
@@ -64,15 +64,18 @@ PERF_TEST_P(Image_Threshold_NonMaxSuppression, FAST,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::FAST_CUDA d_fast(threshold, nonMaxSuppersion, 0.5);
+        cv::Ptr<cv::cuda::FastFeatureDetector> d_fast =
+                cv::cuda::FastFeatureDetector::create(threshold, nonMaxSuppersion,
+                                                      cv::FastFeatureDetector::TYPE_9_16,
+                                                      0.5 * img.size().area());
 
         const cv::cuda::GpuMat d_img(img);
         cv::cuda::GpuMat d_keypoints;
 
-        TEST_CYCLE() d_fast(d_img, cv::cuda::GpuMat(), d_keypoints);
+        TEST_CYCLE() d_fast->detectAsync(d_img, d_keypoints);
 
         std::vector<cv::KeyPoint> gpu_keypoints;
-        d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);
+        d_fast->convert(d_keypoints, gpu_keypoints);
 
         sortKeyPoints(gpu_keypoints);
 
@@ -106,15 +109,15 @@ PERF_TEST_P(Image_NFeatures, ORB,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::ORB_CUDA d_orb(nFeatures);
+        cv::Ptr<cv::cuda::ORB> d_orb = cv::cuda::ORB::create(nFeatures);
 
         const cv::cuda::GpuMat d_img(img);
         cv::cuda::GpuMat d_keypoints, d_descriptors;
 
-        TEST_CYCLE() d_orb(d_img, cv::cuda::GpuMat(), d_keypoints, d_descriptors);
+        TEST_CYCLE() d_orb->detectAndComputeAsync(d_img, cv::noArray(), d_keypoints, d_descriptors);
 
         std::vector<cv::KeyPoint> gpu_keypoints;
-        d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);
+        d_orb->convert(d_keypoints, gpu_keypoints);
 
         cv::Mat gpu_descriptors(d_descriptors);
 
@@ -164,16 +167,16 @@ PERF_TEST_P(DescSize_Norm, BFMatch,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::BFMatcher_CUDA d_matcher(normType);
+        cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
 
         const cv::cuda::GpuMat d_query(query);
         const cv::cuda::GpuMat d_train(train);
-        cv::cuda::GpuMat d_trainIdx, d_distance;
+        cv::cuda::GpuMat d_matches;
 
-        TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        TEST_CYCLE() d_matcher->matchAsync(d_query, d_train, d_matches);
 
         std::vector<cv::DMatch> gpu_matches;
-        d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);
+        d_matcher->matchConvert(d_matches, gpu_matches);
 
         SANITY_CHECK_MATCHES(gpu_matches);
     }
@@ -223,16 +226,16 @@ PERF_TEST_P(DescSize_K_Norm, BFKnnMatch,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::BFMatcher_CUDA d_matcher(normType);
+        cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
 
         const cv::cuda::GpuMat d_query(query);
         const cv::cuda::GpuMat d_train(train);
-        cv::cuda::GpuMat d_trainIdx, d_distance, d_allDist;
+        cv::cuda::GpuMat d_matches;
 
-        TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
+        TEST_CYCLE() d_matcher->knnMatchAsync(d_query, d_train, d_matches, k);
 
         std::vector< std::vector<cv::DMatch> > matchesTbl;
-        d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);
+        d_matcher->knnMatchConvert(d_matches, matchesTbl);
 
         std::vector<cv::DMatch> gpu_matches;
         toOneRowMatches(matchesTbl, gpu_matches);
@@ -277,16 +280,16 @@ PERF_TEST_P(DescSize_Norm, BFRadiusMatch,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::BFMatcher_CUDA d_matcher(normType);
+        cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
 
         const cv::cuda::GpuMat d_query(query);
         const cv::cuda::GpuMat d_train(train);
-        cv::cuda::GpuMat d_trainIdx, d_nMatches, d_distance;
+        cv::cuda::GpuMat d_matches;
 
-        TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);
+        TEST_CYCLE() d_matcher->radiusMatchAsync(d_query, d_train, d_matches, maxDistance);
 
         std::vector< std::vector<cv::DMatch> > matchesTbl;
-        d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);
+        d_matcher->radiusMatchConvert(d_matches, matchesTbl);
 
         std::vector<cv::DMatch> gpu_matches;
         toOneRowMatches(matchesTbl, gpu_matches);
diff --git a/modules/cudafeatures2d/src/brute_force_matcher.cpp b/modules/cudafeatures2d/src/brute_force_matcher.cpp
index 5de0b06e32..a00537c8eb 100644
--- a/modules/cudafeatures2d/src/brute_force_matcher.cpp
+++ b/modules/cudafeatures2d/src/brute_force_matcher.cpp
@@ -47,37 +47,7 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::cuda::BFMatcher_CUDA::BFMatcher_CUDA(int) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::add(const std::vector<GpuMat>&) { throw_no_cuda(); }
-const std::vector<GpuMat>& cv::cuda::BFMatcher_CUDA::getTrainDescriptors() const { throw_no_cuda(); return trainDescCollection; }
-void cv::cuda::BFMatcher_CUDA::clear() { throw_no_cuda(); }
-bool cv::cuda::BFMatcher_CUDA::empty() const { throw_no_cuda(); return true; }
-bool cv::cuda::BFMatcher_CUDA::isMaskSupported() const { throw_no_cuda(); return true; }
-void cv::cuda::BFMatcher_CUDA::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::match(const GpuMat&, const GpuMat&, std::vector<DMatch>&, const GpuMat&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::makeGpuCollection(GpuMat&, GpuMat&, const std::vector<GpuMat>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatch2Convert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const std::vector<GpuMat>&, Stream&) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_no_cuda(); }
-void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_no_cuda(); }
+Ptr<cv::cuda::DescriptorMatcher> cv::cuda::DescriptorMatcher::createBFMatcher(int) { throw_no_cuda(); return Ptr<cv::cuda::DescriptorMatcher>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -155,857 +125,953 @@ namespace cv { namespace cuda { namespace device
     }
 }}}
 
-////////////////////////////////////////////////////////////////////
-// Train collection
-
-cv::cuda::BFMatcher_CUDA::BFMatcher_CUDA(int norm_) : norm(norm_)
-{
-}
-
-void cv::cuda::BFMatcher_CUDA::add(const std::vector<GpuMat>& descCollection)
-{
-    trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
-}
-
-const std::vector<GpuMat>& cv::cuda::BFMatcher_CUDA::getTrainDescriptors() const
-{
-    return trainDescCollection;
-}
-
-void cv::cuda::BFMatcher_CUDA::clear()
-{
-    trainDescCollection.clear();
-}
-
-bool cv::cuda::BFMatcher_CUDA::empty() const
-{
-    return trainDescCollection.empty();
-}
-
-bool cv::cuda::BFMatcher_CUDA::isMaskSupported() const
-{
-    return true;
-}
-
-////////////////////////////////////////////////////////////////////
-// Match
-
-void cv::cuda::BFMatcher_CUDA::matchSingle(const GpuMat& query, const GpuMat& train,
-    GpuMat& trainIdx, GpuMat& distance,
-    const GpuMat& mask, Stream& stream)
-{
-    if (query.empty() || train.empty())
-        return;
-
-    using namespace cv::cuda::device::bf_match;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
-                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-    };
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.cols == query.cols && train.type() == query.type());
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    const int nQuery = query.rows;
-
-    ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32F, distance);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-
-    matchConvert(trainIdxCPU, distanceCPU, matches);
-}
-
-void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* trainIdx_ptr = trainIdx.ptr<int>();
-    const float* distance_ptr =  distance.ptr<float>();
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
-    {
-        int train_idx = *trainIdx_ptr;
-
-        if (train_idx == -1)
-            continue;
-
-        float distance_local = *distance_ptr;
-
-        DMatch m(queryIdx, train_idx, 0, distance_local);
-
-        matches.push_back(m);
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::match(const GpuMat& query, const GpuMat& train,
-    std::vector<DMatch>& matches, const GpuMat& mask)
-{
-    GpuMat trainIdx, distance;
-    matchSingle(query, train, trainIdx, distance, mask);
-    matchDownload(trainIdx, distance, matches);
-}
-
-void cv::cuda::BFMatcher_CUDA::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
-    const std::vector<GpuMat>& masks)
-{
-    if (empty())
-        return;
-
-    if (masks.empty())
-    {
-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb)));
-
-        PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();
-
-        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
-            *trainCollectionCPU_ptr = trainDescCollection[i];
-
-        trainCollection.upload(trainCollectionCPU);
-        maskCollection.release();
-    }
-    else
-    {
-        CV_Assert(masks.size() == trainDescCollection.size());
-
-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb)));
-        Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepb)));
-
-        PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();
-        PtrStepb* maskCollectionCPU_ptr = maskCollectionCPU.ptr<PtrStepb>();
-
-        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
-        {
-            const GpuMat& train = trainDescCollection[i];
-            const GpuMat& mask = masks[i];
-
-            CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows));
-
-            *trainCollectionCPU_ptr = train;
-            *maskCollectionCPU_ptr = mask;
-        }
-
-        trainCollection.upload(trainCollectionCPU);
-        maskCollection.upload(maskCollectionCPU);
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
-    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-    const GpuMat& masks, Stream& stream)
-{
-    if (query.empty() || trainCollection.empty())
-        return;
-
-    using namespace cv::cuda::device::bf_match;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
-                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-    };
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    const int nQuery = query.rows;
-
-    ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32F, distance);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-
-    matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
-}
-
-void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.cols == trainIdx.cols);
-    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* trainIdx_ptr = trainIdx.ptr<int>();
-    const int* imgIdx_ptr = imgIdx.ptr<int>();
-    const float* distance_ptr =  distance.ptr<float>();
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-    {
-        int _trainIdx = *trainIdx_ptr;
-
-        if (_trainIdx == -1)
-            continue;
-
-        int _imgIdx = *imgIdx_ptr;
-
-        float _distance = *distance_ptr;
-
-        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
-
-        matches.push_back(m);
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks)
-{
-    GpuMat trainCollection;
-    GpuMat maskCollection;
-
-    makeGpuCollection(trainCollection, maskCollection, masks);
-
-    GpuMat trainIdx, imgIdx, distance;
-
-    matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
-    matchDownload(trainIdx, imgIdx, distance, matches);
-}
-
-////////////////////////////////////////////////////////////////////
-// KnnMatch
-
-void cv::cuda::BFMatcher_CUDA::knnMatchSingle(const GpuMat& query, const GpuMat& train,
-    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
-    const GpuMat& mask, Stream& stream)
-{
-    if (query.empty() || train.empty())
-        return;
-
-    using namespace cv::cuda::device::bf_knnmatch;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
-                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-    };
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
-    if (k == 2)
-    {
-        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
-        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-    }
-    else
-    {
-        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
-        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
-        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
-    }
-
-    trainIdx.setTo(Scalar::all(-1), stream);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-
-    knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC2 || trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC2 || distance.type() == CV_32FC1);
-    CV_Assert(distance.size() == trainIdx.size());
-    CV_Assert(trainIdx.isContinuous() && distance.isContinuous());
-
-    const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows;
-    const int k = trainIdx.type() == CV_32SC2 ? 2 :trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* trainIdx_ptr = trainIdx.ptr<int>();
-    const float* distance_ptr = distance.ptr<float>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(k);
-
-        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
-        {
-            int _trainIdx = *trainIdx_ptr;
-
-            if (_trainIdx != -1)
-            {
-                float _distance = *distance_ptr;
-
-                DMatch m(queryIdx, _trainIdx, 0, _distance);
-
-                curMatches.push_back(m);
-            }
-        }
-
-        if (compactResult && curMatches.empty())
-            matches.pop_back();
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat& query, const GpuMat& train,
-    std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)
-{
-    GpuMat trainIdx, distance, allDist;
-    knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask);
-    knnMatchDownload(trainIdx, distance, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
-    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-    const GpuMat& maskCollection, Stream& stream)
-{
-    if (query.empty() || trainCollection.empty())
-        return;
-
-    using namespace cv::cuda::device::bf_knnmatch;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
-                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
-        match2L1_gpu<unsigned short>, match2L1_gpu<short>,
-        match2L1_gpu<int>, match2L1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
-        0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
-        0/*match2L2_gpu<int>*/, match2L2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
-        match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
-        match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
-    };
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    const int nQuery = query.rows;
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-
-    trainIdx.setTo(Scalar::all(-1), stream);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-
-    knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC2);
-    CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols);
-    CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* trainIdx_ptr = trainIdx.ptr<int>();
-    const int* imgIdx_ptr = imgIdx.ptr<int>();
-    const float* distance_ptr = distance.ptr<float>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(2);
-
-        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-        {
-            int _trainIdx = *trainIdx_ptr;
-
-            if (_trainIdx != -1)
-            {
-                int _imgIdx = *imgIdx_ptr;
-
-                float _distance = *distance_ptr;
-
-                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
-
-                curMatches.push_back(m);
-            }
-        }
-
-        if (compactResult && curMatches.empty())
-            matches.pop_back();
-    }
-}
-
 namespace
 {
-    struct ImgIdxSetter
+    static void makeGpuCollection(const std::vector<GpuMat>& trainDescCollection, // builds one PtrStepSzb per train matrix (plus one PtrStepb per mask) and uploads them
+                                  const std::vector<GpuMat>& masks,
+                                  GpuMat& trainCollection,
+                                  GpuMat& maskCollection)
     {
-        explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
-        int imgIdx;
-    };
-}
-
-void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
-    const std::vector<GpuMat>& masks, bool compactResult)
-{
-    if (k == 2)
-    {
-        GpuMat trainCollection;
-        GpuMat maskCollection;
-
-        makeGpuCollection(trainCollection, maskCollection, masks);
-
-        GpuMat trainIdx, imgIdx, distance;
-
-        knnMatch2Collection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
-        knnMatch2Download(trainIdx, imgIdx, distance, matches);
-    }
-    else
-    {
-        if (query.empty() || empty())
+        if (trainDescCollection.empty()) // nothing to pack: both output matrices are left untouched
             return;
 
-        std::vector< std::vector<DMatch> > curMatches;
-        std::vector<DMatch> temp;
-        temp.reserve(2 * k);
-
-        matches.resize(query.rows);
-        for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&std::vector<DMatch>::reserve), k));
-
-        for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx)
+        if (masks.empty()) // no masks supplied: only the train collection is built
         {
-            knnMatch(query, trainDescCollection[imgIdx], curMatches, k, masks.empty() ? GpuMat() : masks[imgIdx]);
+            Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb))); // single row, one PtrStepSzb-sized element per train matrix
 
-            for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx)
+            PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();
+
+            for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
+                *trainCollectionCPU_ptr = trainDescCollection[i];
+
+            trainCollection.upload(trainCollectionCPU);
+            maskCollection.release(); // maskCollection ends up empty on this path
+        }
+        else
+        {
+            CV_Assert( masks.size() == trainDescCollection.size() ); // exactly one mask entry per train matrix
+
+            Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb)));
+            Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStepb)));
+
+            PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr<PtrStepSzb>();
+            PtrStepb* maskCollectionCPU_ptr = maskCollectionCPU.ptr<PtrStepb>();
+
+            for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
             {
-                std::vector<DMatch>& localMatch = curMatches[queryIdx];
-                std::vector<DMatch>& globalMatch = matches[queryIdx];
+                const GpuMat& train = trainDescCollection[i];
+                const GpuMat& mask = masks[i];
 
-                for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
+                CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows) ); // an empty mask is accepted; otherwise CV_8UC1 with one column per train row
 
-                temp.clear();
-                merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
+                *trainCollectionCPU_ptr = train;
+                *maskCollectionCPU_ptr = mask;
+            }
 
-                globalMatch.clear();
-                const size_t count = std::min((size_t)k, temp.size());
-                copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));
+            trainCollection.upload(trainCollectionCPU);
+            maskCollection.upload(maskCollectionCPU);
+        }
+    }
+
+    class BFMatcher_Impl : public cv::cuda::DescriptorMatcher // DescriptorMatcher implementation parameterised by a norm type
+    {
+    public:
+        explicit BFMatcher_Impl(int norm) : norm_(norm)
+        {
+            CV_Assert( norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING ); // only these three norms are accepted
+        }
+
+        virtual bool isMaskSupported() const { return true; }
+
+        virtual void add(const std::vector<GpuMat>& descriptors) // appends to the stored train collection
+        {
+            trainDescCollection_.insert(trainDescCollection_.end(), descriptors.begin(), descriptors.end());
+        }
+
+        virtual const std::vector<GpuMat>& getTrainDescriptors() const
+        {
+            return trainDescCollection_;
+        }
+
+        virtual void clear() // drops every stored train descriptor matrix
+        {
+            trainDescCollection_.clear();
+        }
+
+        virtual bool empty() const // true when no train descriptors are stored
+        {
+            return trainDescCollection_.empty();
+        }
+
+        virtual void train() // no-op in this implementation
+        {
+        }
+
+        virtual void match(InputArray queryDescriptors, InputArray trainDescriptors, // match against an explicitly passed train matrix
+                           std::vector<DMatch>& matches,
+                           InputArray mask = noArray());
+
+        virtual void match(InputArray queryDescriptors, // match against the stored train collection
+                           std::vector<DMatch>& matches,
+                           const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+
+        virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors, // result is written to the `matches` output array
+                                OutputArray matches,
+                                InputArray mask = noArray(),
+                                Stream& stream = Stream::Null());
+
+        virtual void matchAsync(InputArray queryDescriptors, // stored-collection variant of the overload above
+                                OutputArray matches,
+                                const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                                Stream& stream = Stream::Null());
+
+        virtual void matchConvert(InputArray gpu_matches, // converts a matchAsync result into DMatch records
+                                  std::vector<DMatch>& matches);
+
+        virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors, // k-nearest variant against an explicitly passed train matrix
+                              std::vector<std::vector<DMatch> >& matches,
+                              int k,
+                              InputArray mask = noArray(),
+                              bool compactResult = false);
+
+        virtual void knnMatch(InputArray queryDescriptors, // k-nearest variant against the stored train collection
+                              std::vector<std::vector<DMatch> >& matches,
+                              int k,
+                              const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                              bool compactResult = false);
+
+        virtual void knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
+                                   OutputArray matches,
+                                   int k,
+                                   InputArray mask = noArray(),
+                                   Stream& stream = Stream::Null());
+
+        virtual void knnMatchAsync(InputArray queryDescriptors,
+                                   OutputArray matches,
+                                   int k,
+                                   const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                                   Stream& stream = Stream::Null());
+
+        virtual void knnMatchConvert(InputArray gpu_matches,
+                                     std::vector< std::vector<DMatch> >& matches,
+                                     bool compactResult = false);
+
+        virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors, // distance-threshold variant against an explicitly passed train matrix
+                                 std::vector<std::vector<DMatch> >& matches,
+                                 float maxDistance,
+                                 InputArray mask = noArray(),
+                                 bool compactResult = false);
+
+        virtual void radiusMatch(InputArray queryDescriptors, // distance-threshold variant against the stored train collection
+                                 std::vector<std::vector<DMatch> >& matches,
+                                 float maxDistance,
+                                 const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                                 bool compactResult = false);
+
+        virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
+                                      OutputArray matches,
+                                      float maxDistance,
+                                      InputArray mask = noArray(),
+                                      Stream& stream = Stream::Null());
+
+        virtual void radiusMatchAsync(InputArray queryDescriptors,
+                                      OutputArray matches,
+                                      float maxDistance,
+                                      const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
+                                      Stream& stream = Stream::Null());
+
+        virtual void radiusMatchConvert(InputArray gpu_matches,
+                                        std::vector< std::vector<DMatch> >& matches,
+                                        bool compactResult = false);
+
+    private:
+        int norm_; // NORM_L1, NORM_L2 or NORM_HAMMING, validated in the constructor
+        std::vector<GpuMat> trainDescCollection_; // train descriptor matrices accumulated through add()
+    };
+
+    //
+    // 1 to 1 match
+    //
+
+    void BFMatcher_Impl::match(InputArray _queryDescriptors, InputArray _trainDescriptors, // query vs. one explicitly passed train matrix
+                               std::vector<DMatch>& matches,
+                               InputArray _mask)
+    {
+        GpuMat d_matches; // receives the packed result written by matchAsync
+        matchAsync(_queryDescriptors, _trainDescriptors, d_matches, _mask); // stream argument is left at its declared default
+        matchConvert(d_matches, matches); // turns the packed result into DMatch records
+    }
+
+    void BFMatcher_Impl::match(InputArray _queryDescriptors, // query vs. the stored train collection
+                               std::vector<DMatch>& matches,
+                               const std::vector<GpuMat>& masks)
+    {
+        GpuMat d_matches; // receives the packed result written by matchAsync
+        matchAsync(_queryDescriptors, d_matches, masks); // stream argument is left at its declared default
+        matchConvert(d_matches, matches); // turns the packed result into DMatch records
+    }
+
+    void BFMatcher_Impl::matchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors, // writes a 2 x nQuery result for one explicitly passed train matrix
+                                    OutputArray _matches,
+                                    InputArray _mask,
+                                    Stream& stream)
+    {
+        using namespace cv::cuda::device::bf_match;
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+        const GpuMat train = _trainDescriptors.getGpuMat();
+        const GpuMat mask = _mask.getGpuMat();
+
+        if (query.empty() || train.empty()) // nothing to match: the output is released and no work is dispatched
+        {
+            _matches.release();
+            return;
+        }
+
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+        CV_Assert( train.cols == query.cols && train.type() == query.type() ); // same descriptor length and element type on both sides
+        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) ); // mask, when given, is nQuery x nTrain
+
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, // common signature of the entries in the tables below
+                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] = // each table is indexed by query.depth(); a 0 entry means that depth is rejected below
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        };
+
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming; // anything that is not L1 or L2 falls through to Hamming
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+
+        _matches.create(2, nQuery, CV_32SC1); // row 0 holds train indices, row 1 holds distances
+        GpuMat matches = _matches.getGpuMat();
+
+        GpuMat trainIdx(1, nQuery, CV_32SC1, matches.ptr(0)); // header over row 0 of the output, no separate allocation
+        GpuMat distance(1, nQuery, CV_32FC1, matches.ptr(1)); // row 1 of the CV_32SC1 buffer viewed as 32-bit floats
+
+        func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); // forwards the raw cudaStream_t of the caller-supplied stream
+    }
+
+    void BFMatcher_Impl::matchAsync(InputArray _queryDescriptors, // writes a 3 x nQuery result for the stored train collection
+                                    OutputArray _matches,
+                                    const std::vector<GpuMat>& masks,
+                                    Stream& stream)
+    {
+        using namespace cv::cuda::device::bf_match;
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+
+        if (query.empty() || trainDescCollection_.empty()) // nothing to match: the output is released and no work is dispatched
+        {
+            _matches.release();
+            return;
+        }
+
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+
+        GpuMat trainCollection, maskCollection;
+        makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection); // maskCollection stays empty when no masks were passed
+
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, // common signature of the entries in the tables below
+                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] = // each table is indexed by query.depth(); a 0 entry means that depth is rejected below
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        };
+
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming; // anything that is not L1 or L2 falls through to Hamming
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+
+        _matches.create(3, nQuery, CV_32SC1); // row 0: train indices, row 1: image indices, row 2: distances
+        GpuMat matches = _matches.getGpuMat();
+
+        GpuMat trainIdx(1, nQuery, CV_32SC1, matches.ptr(0)); // headers over the rows of the output, no separate allocations
+        GpuMat imgIdx(1, nQuery, CV_32SC1, matches.ptr(1));
+        GpuMat distance(1, nQuery, CV_32FC1, matches.ptr(2)); // row 2 of the CV_32SC1 buffer viewed as 32-bit floats
+
+        func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); // forwards the raw cudaStream_t of the caller-supplied stream
+    }
+
+    void BFMatcher_Impl::matchConvert(InputArray _gpu_matches, // converts a packed 2- or 3-row match matrix into DMatch records
+                                      std::vector<DMatch>& matches)
+    {
+        Mat gpu_matches; // host copy of the packed result, despite the name
+        if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_matches.getGpuMat().download(gpu_matches);
+        }
+        else
+        {
+            gpu_matches = _gpu_matches.getMat(); // any other input kind is read as a host Mat
+        }
+
+        if (gpu_matches.empty()) // empty input yields an empty match list
+        {
+            matches.clear();
+            return;
+        }
+
+        CV_Assert( (gpu_matches.type() == CV_32SC1) && (gpu_matches.rows == 2 || gpu_matches.rows == 3) ); // the two layouts created by the matchAsync overloads
+
+        const int nQuery = gpu_matches.cols; // one column per query descriptor
+
+        matches.clear();
+        matches.reserve(nQuery);
+
+        const int* trainIdxPtr = NULL;
+        const int* imgIdxPtr = NULL; // stays NULL for the 2-row layout
+        const float* distancePtr = NULL;
+
+        if (gpu_matches.rows == 2)
+        {
+            trainIdxPtr = gpu_matches.ptr<int>(0);
+            distancePtr =  gpu_matches.ptr<float>(1); // distance row is read as float although the matrix type is CV_32SC1
+        }
+        else
+        {
+            trainIdxPtr = gpu_matches.ptr<int>(0);
+            imgIdxPtr =  gpu_matches.ptr<int>(1);
+            distancePtr =  gpu_matches.ptr<float>(2); // distance row is read as float although the matrix type is CV_32SC1
+        }
+
+        for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+        {
+            const int trainIdx = trainIdxPtr[queryIdx];
+            if (trainIdx == -1) // a train index of -1 is skipped, so the output may have fewer than nQuery entries
+                continue;
+
+            const int imgIdx = imgIdxPtr ? imgIdxPtr[queryIdx] : 0; // image index is 0 when the layout has no imgIdx row
+            const float distance = distancePtr[queryIdx];
+
+            DMatch m(queryIdx, trainIdx, imgIdx, distance);
+
+            matches.push_back(m);
+        }
+    }
+
+    //
+    // knn match
+    //
+
+    void BFMatcher_Impl::knnMatch(InputArray _queryDescriptors, InputArray _trainDescriptors, // k-nearest matches against one explicitly passed train matrix
+                                  std::vector<std::vector<DMatch> >& matches,
+                                  int k,
+                                  InputArray _mask,
+                                  bool compactResult)
+    {
+        GpuMat d_matches; // receives the packed result written by knnMatchAsync
+        knnMatchAsync(_queryDescriptors, _trainDescriptors, d_matches, k, _mask); // stream argument is left at its declared default
+        knnMatchConvert(d_matches, matches, compactResult); // compactResult is forwarded unchanged
+    }
+
+    void BFMatcher_Impl::knnMatch(InputArray _queryDescriptors, // k best matches per query over the stored train collection
+                                  std::vector<std::vector<DMatch> >& matches,
+                                  int k,
+                                  const std::vector<GpuMat>& masks,
+                                  bool compactResult)
+    {
+        if (k == 2) // k == 2 is handled by the collection overload of knnMatchAsync
+        {
+            GpuMat d_matches;
+            knnMatchAsync(_queryDescriptors, d_matches, k, masks);
+            knnMatchConvert(d_matches, matches, compactResult);
+        }
+        else // any other k: match each train image separately and merge on the host
+        {
+            const GpuMat query = _queryDescriptors.getGpuMat();
+
+            if (query.empty() || trainDescCollection_.empty())
+            {
+                matches.clear();
+                return;
+            }
+
+            CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+            CV_Assert( masks.empty() || masks.size() == trainDescCollection_.size() ); // masks[imgIdx] is read for every train image below
+            std::vector< std::vector<DMatch> > curMatches;
+            std::vector<DMatch> temp;
+            temp.reserve(2 * k);
+
+            matches.assign(static_cast<size_t>(query.rows), std::vector<DMatch>()); // start from empty rows so stale caller content is never merged in
+            for (size_t i = 0; i < matches.size(); ++i)
+                matches[i].reserve(k);
+
+            for (size_t imgIdx = 0; imgIdx < trainDescCollection_.size(); ++imgIdx)
+            {
+                knnMatch(query, trainDescCollection_[imgIdx], curMatches, k, masks.empty() ? GpuMat() : masks[imgIdx]); // single-train overload, compactResult left at false so curMatches has one row per query
+
+                for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx)
+                {
+                    std::vector<DMatch>& localMatch = curMatches[queryIdx];
+                    std::vector<DMatch>& globalMatch = matches[queryIdx];
+
+                    for (size_t i = 0; i < localMatch.size(); ++i)
+                        localMatch[i].imgIdx = static_cast<int>(imgIdx); // DMatch::imgIdx is int; make the narrowing explicit
+
+                    temp.clear();
+                    std::merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), std::back_inserter(temp)); // both inputs are expected to be ordered by DMatch::operator<
+
+                    globalMatch.clear();
+                    const size_t count = std::min(static_cast<size_t>(k), temp.size()); // keep at most k merged candidates
+                    std::copy(temp.begin(), temp.begin() + count, std::back_inserter(globalMatch));
+                }
+            }
+
+            if (compactResult) // drop queries that ended up with no match at all
+            {
+                std::vector< std::vector<DMatch> >::iterator new_end = std::remove_if(matches.begin(), matches.end(), std::mem_fun_ref(&std::vector<DMatch>::empty));
+                matches.erase(new_end, matches.end());
             }
         }
+    }
 
-        if (compactResult)
+    void BFMatcher_Impl::knnMatchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors,
+                                       OutputArray _matches,
+                                       int k,
+                                       InputArray _mask,
+                                       Stream& stream)
+    {   // Launch a k-NN match of query vs. one train set using `stream`; the result is packed into _matches.
+        using namespace cv::cuda::device::bf_knnmatch;
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+        const GpuMat train = _trainDescriptors.getGpuMat();
+        const GpuMat mask = _mask.getGpuMat();
+
+        if (query.empty() || train.empty())
         {
-            std::vector< std::vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&std::vector<DMatch>::empty));
-            matches.erase(new_end, matches.end());
+            _matches.release();
+            return;
+        }
+        // Inputs: single-channel descriptors with depth below CV_64F, same width and type; mask is optional CV_8UC1.
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+        CV_Assert( train.cols == query.cols && train.type() == query.type() );
+        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) );
+        // Launcher signature; the tables below are indexed by query.depth() and a 0 entry means "no kernel".
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+                                 const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] =
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        };
+        // Any norm_ other than NORM_L1 / NORM_L2 selects the Hamming table.
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming;
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+        const int nTrain = train.rows;
+        // trainIdx / distance are headers over rows of _matches (no copy); allDist is only allocated for k != 2.
+        GpuMat trainIdx, distance, allDist;
+        if (k == 2)
+        {
+            _matches.create(2, nQuery, CV_32SC2); // row 0 = train indices, row 1 = distances (viewed as CV_32FC2)
+            GpuMat matches = _matches.getGpuMat();
+
+            trainIdx = GpuMat(1, nQuery, CV_32SC2, matches.ptr(0));
+            distance = GpuMat(1, nQuery, CV_32FC2, matches.ptr(1));
+        }
+        else
+        {
+            _matches.create(2 * nQuery, k, CV_32SC1); // rows [0, nQuery) = train indices, rows [nQuery, 2*nQuery) = distances
+            GpuMat matches = _matches.getGpuMat();
+
+            trainIdx = GpuMat(nQuery, k, CV_32SC1, matches.ptr(0), matches.step);
+            distance = GpuMat(nQuery, k, CV_32FC1, matches.ptr(nQuery), matches.step);
+
+            BufferPool pool(stream);
+            allDist = pool.getBuffer(nQuery, nTrain, CV_32FC1); // nQuery x nTrain float scratch buffer for the kernel
+        }
+
+        trainIdx.setTo(Scalar::all(-1), stream); // -1 marks an unused slot; knnMatchConvert skips those
+
+        func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
+    }
+
+    void BFMatcher_Impl::knnMatchAsync(InputArray _queryDescriptors,
+                                       OutputArray _matches,
+                                       int k,
+                                       const std::vector<GpuMat>& masks,
+                                       Stream& stream)
+    {   // Launch a 2-NN match of query vs. the stored train collection using `stream`; result is packed into _matches.
+        using namespace cv::cuda::device::bf_knnmatch;
+
+        if (k != 2)
+        {
+            CV_Error(Error::StsNotImplemented, "only k=2 mode is supported for now");
+        }
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+
+        if (query.empty() || trainDescCollection_.empty())
+        {
+            _matches.release();
+            return;
+        }
+
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+        // Pack the train set and mask headers into device-side collections (makeGpuCollection is defined elsewhere).
+        GpuMat trainCollection, maskCollection;
+        makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection);
+        // Launcher signature; the tables below are indexed by query.depth() and a 0 entry means "no kernel".
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+                                 const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] =
+        {
+            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
+            match2L1_gpu<unsigned short>, match2L1_gpu<short>,
+            match2L1_gpu<int>, match2L1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
+            0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
+            0/*match2L2_gpu<int>*/, match2L2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
+            match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
+            match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
+        };
+        // Any norm_ other than NORM_L1 / NORM_L2 selects the Hamming table.
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming;
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+        // Output layout: 3 x nQuery, CV_32SC2 -> row 0 = train indices, row 1 = image indices, row 2 = distances.
+        _matches.create(3, nQuery, CV_32SC2);
+        GpuMat matches = _matches.getGpuMat();
+
+        GpuMat trainIdx(1, nQuery, CV_32SC2, matches.ptr(0));
+        GpuMat imgIdx(1, nQuery, CV_32SC2, matches.ptr(1));
+        GpuMat distance(1, nQuery, CV_32FC2, matches.ptr(2)); // float view over the same storage
+
+        trainIdx.setTo(Scalar::all(-1), stream); // -1 marks an unused slot; knnMatchConvert skips those
+
+        func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
+    }
+
+    void BFMatcher_Impl::knnMatchConvert(InputArray _gpu_matches,
+                                         std::vector< std::vector<DMatch> >& matches,
+                                         bool compactResult)
+    {   // Unpack the matrix written by knnMatchAsync into per-query DMatch lists; empty lists are dropped if compactResult.
+        Mat gpu_matches;
+        if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_matches.getGpuMat().download(gpu_matches);
+        }
+        else
+        {
+            gpu_matches = _gpu_matches.getMat();
+        }
+
+        if (gpu_matches.empty())
+        {
+            matches.clear();
+            return;
+        }
+
+        CV_Assert( ((gpu_matches.type() == CV_32SC2) && (gpu_matches.rows == 2 || gpu_matches.rows == 3)) ||
+                   (gpu_matches.type() == CV_32SC1) );
+        // CV_32SC2: k == 2 layouts (2 rows = single train set, 3 rows = collection with imgIdx); CV_32SC1: generic k.
+        int nQuery = -1, k = -1;
+
+        const int* trainIdxPtr = NULL;
+        const int* imgIdxPtr = NULL;
+        const float* distancePtr = NULL;
+
+        if (gpu_matches.type() == CV_32SC2)
+        {
+            nQuery = gpu_matches.cols;
+            k = 2;
+
+            if (gpu_matches.rows == 2)
+            {
+                trainIdxPtr = gpu_matches.ptr<int>(0);
+                distancePtr =  gpu_matches.ptr<float>(1);
+            }
+            else
+            {
+                trainIdxPtr = gpu_matches.ptr<int>(0);
+                imgIdxPtr =  gpu_matches.ptr<int>(1);
+                distancePtr =  gpu_matches.ptr<float>(2);
+            }
+        }
+        else
+        {
+            nQuery = gpu_matches.rows / 2;
+            k = gpu_matches.cols;
+
+            trainIdxPtr = gpu_matches.ptr<int>(0);
+            distancePtr =  gpu_matches.ptr<float>(nQuery);
+        }
+
+        matches.clear();
+        matches.reserve(nQuery);
+        // The three pointers walk the slots linearly: k consecutive slots per query, in query order.
+        for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+        {
+            matches.push_back(std::vector<DMatch>());
+            std::vector<DMatch>& curMatches = matches.back();
+            curMatches.reserve(k);
+
+            for (int i = 0; i < k; ++i)
+            {
+                const int trainIdx = *trainIdxPtr;
+                if (trainIdx != -1) // -1 marks a slot the kernel left unused
+                {
+                    const int imgIdx = imgIdxPtr ? *imgIdxPtr : 0;
+                    const float distance = *distancePtr;
+
+                    DMatch m(queryIdx, trainIdx, imgIdx, distance);
+                    curMatches.push_back(m);
+                }
+                // Always advance, even past an unused slot; skipping this misaligned every later slot and query.
+                ++trainIdxPtr;
+                ++distancePtr;
+                if (imgIdxPtr)
+                    ++imgIdxPtr;
+            }
+
+            if (compactResult && curMatches.empty())
+            {
+                matches.pop_back();
+            }
+        }
+    }
+
+    //
+    // radius match
+    //
+
+    void BFMatcher_Impl::radiusMatch(InputArray _queryDescriptors, InputArray _trainDescriptors,
+                                     std::vector<std::vector<DMatch> >& matches,
+                                     float maxDistance,
+                                     InputArray _mask,
+                                     bool compactResult)
+    {   // Radius match against one explicit train set: run the GPU stage, then unpack the result on the host.
+        GpuMat packedMatches; // receives the packed matrix written by radiusMatchAsync
+        radiusMatchAsync(_queryDescriptors, _trainDescriptors, packedMatches, maxDistance, _mask);
+        radiusMatchConvert(packedMatches, matches, compactResult); // downloads and fills `matches`
+    }
+
+    void BFMatcher_Impl::radiusMatch(InputArray _queryDescriptors,
+                                     std::vector<std::vector<DMatch> >& matches,
+                                     float maxDistance,
+                                     const std::vector<GpuMat>& masks,
+                                     bool compactResult)
+    {   // Radius match against the stored train collection: run the GPU stage, then unpack the result on the host.
+        GpuMat packedMatches; // receives the packed matrix written by radiusMatchAsync
+        radiusMatchAsync(_queryDescriptors, packedMatches, maxDistance, masks);
+        radiusMatchConvert(packedMatches, matches, compactResult); // downloads and fills `matches`
+    }
+
+    void BFMatcher_Impl::radiusMatchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors,
+                                          OutputArray _matches,
+                                          float maxDistance,
+                                          InputArray _mask,
+                                          Stream& stream)
+    {   // Launch a radius match of query vs. one train set using `stream`; the result is packed into _matches.
+        using namespace cv::cuda::device::bf_radius_match;
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+        const GpuMat train = _trainDescriptors.getGpuMat();
+        const GpuMat mask = _mask.getGpuMat();
+
+        if (query.empty() || train.empty())
+        {
+            _matches.release();
+            return;
+        }
+        // Inputs: single-channel descriptors with depth below CV_64F, same width and type; mask is optional CV_8UC1.
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+        CV_Assert( train.cols == query.cols && train.type() == query.type() );
+        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) );
+        // Launcher signature; the tables below are indexed by query.depth() and a 0 entry means "no kernel".
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
+                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] =
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        };
+        // Any norm_ other than NORM_L1 / NORM_L2 selects the Hamming table.
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming;
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+        const int nTrain = train.rows;
+        // Slots per query; presumably at least nQuery so the 1 x nQuery counter row fits in one row - TODO confirm.
+        const int cols = std::max((nTrain / 100), nQuery);
+        // Layout: rows [0, nQuery) = train indices, rows [nQuery, 2*nQuery) = distances, row 2*nQuery = match counts.
+        _matches.create(2 * nQuery + 1, cols, CV_32SC1);
+        GpuMat matches = _matches.getGpuMat();
+
+        GpuMat trainIdx(nQuery, cols, CV_32SC1, matches.ptr(0), matches.step);
+        GpuMat distance(nQuery, cols, CV_32FC1, matches.ptr(nQuery), matches.step); // float view over the same storage
+        GpuMat nMatches(1, nQuery, CV_32SC1, matches.ptr(2 * nQuery));
+
+        nMatches.setTo(Scalar::all(0), stream); // counters start at zero before the kernel runs
+
+        func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
+    }
+
+    void BFMatcher_Impl::radiusMatchAsync(InputArray _queryDescriptors,
+                                          OutputArray _matches,
+                                          float maxDistance,
+                                          const std::vector<GpuMat>& masks,
+                                          Stream& stream)
+    {   // Launch a radius match of query vs. the stored train collection using `stream`; result is packed into _matches.
+        using namespace cv::cuda::device::bf_radius_match;
+
+        const GpuMat query = _queryDescriptors.getGpuMat();
+
+        if (query.empty() || trainDescCollection_.empty())
+        {
+            _matches.release();
+            return;
+        }
+
+        CV_Assert( query.channels() == 1 && query.depth() < CV_64F );
+        // NOTE(review): trainCollection / maskCollection are built here but not passed to func below - confirm whether needed.
+        GpuMat trainCollection, maskCollection;
+        makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection);
+        // Launcher signature; the tables below are indexed by query.depth() and a 0 entry means "no kernel".
+        typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
+                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
+                                 cudaStream_t stream);
+
+        static const caller_t callersL1[] =
+        {
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
+        };
+        static const caller_t callersL2[] =
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        };
+        static const caller_t callersHamming[] =
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
+        };
+        // Any norm_ other than NORM_L1 / NORM_L2 selects the Hamming table.
+        const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming;
+
+        const caller_t func = callers[query.depth()];
+        if (func == 0)
+        {
+            CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm");
+        }
+
+        const int nQuery = query.rows;
+        // Layout (nQuery columns): nQuery rows each of train indices, image indices, distances, then one row of match counts.
+        _matches.create(3 * nQuery + 1, nQuery, CV_32FC1);
+        GpuMat matches = _matches.getGpuMat();
+
+        GpuMat trainIdx(nQuery, nQuery, CV_32SC1, matches.ptr(0), matches.step); // int view over the float storage
+        GpuMat imgIdx(nQuery, nQuery, CV_32SC1, matches.ptr(nQuery), matches.step);
+        GpuMat distance(nQuery, nQuery, CV_32FC1, matches.ptr(2 * nQuery), matches.step);
+        GpuMat nMatches(1, nQuery, CV_32SC1, matches.ptr(3 * nQuery));
+
+        nMatches.setTo(Scalar::all(0), stream); // counters start at zero before the kernel runs
+        // Host-side arrays of per-image headers, handed to the launcher by pointer (null mask pointer when no masks).
+        std::vector<PtrStepSzb> trains_(trainDescCollection_.begin(), trainDescCollection_.end());
+        std::vector<PtrStepSzb> masks_(masks.begin(), masks.end());
+
+        func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
+            trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
+    }
+
+    void BFMatcher_Impl::radiusMatchConvert(InputArray _gpu_matches,
+                                            std::vector< std::vector<DMatch> >& matches,
+                                            bool compactResult)
+    {   // Unpack the matrix written by radiusMatchAsync into sorted per-query DMatch lists.
+        Mat gpu_matches;
+        if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_matches.getGpuMat().download(gpu_matches);
+        }
+        else
+        {
+            gpu_matches = _gpu_matches.getMat();
+        }
+
+        if (gpu_matches.empty())
+        {
+            matches.clear();
+            return;
+        }
+        // The element type selects the layout: CV_32SC1 has 2*nQuery+1 rows, CV_32FC1 has 3*nQuery+1 rows (adds imgIdx).
+        CV_Assert( gpu_matches.type() == CV_32SC1 || gpu_matches.type() == CV_32FC1 );
+
+        int nQuery = -1;
+
+        const int* trainIdxPtr = NULL;
+        const int* imgIdxPtr = NULL;
+        const float* distancePtr = NULL;
+        const int* nMatchesPtr = NULL;
+
+        if (gpu_matches.type() == CV_32SC1)
+        {
+            nQuery = (gpu_matches.rows - 1) / 2;
+
+            trainIdxPtr = gpu_matches.ptr<int>(0);
+            distancePtr =  gpu_matches.ptr<float>(nQuery);
+            nMatchesPtr = gpu_matches.ptr<int>(2 * nQuery);
+        }
+        else
+        {
+            nQuery = (gpu_matches.rows - 1) / 3;
+
+            trainIdxPtr = gpu_matches.ptr<int>(0);
+            imgIdxPtr = gpu_matches.ptr<int>(nQuery);
+            distancePtr =  gpu_matches.ptr<float>(2 * nQuery);
+            nMatchesPtr = gpu_matches.ptr<int>(3 * nQuery);
+        }
+
+        matches.clear();
+        matches.reserve(nQuery);
+        // One matrix row per query; the row pointers advance by gpu_matches.cols at the end of each iteration.
+        for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+        {
+            const int nMatched = std::min(nMatchesPtr[queryIdx], gpu_matches.cols); // count is clamped to the row capacity
+
+            if (nMatched == 0)
+            {
+                if (!compactResult)
+                {
+                    matches.push_back(std::vector<DMatch>());
+                }
+            }
+            else
+            {
+                matches.push_back(std::vector<DMatch>(nMatched));
+                std::vector<DMatch>& curMatches = matches.back();
+
+                for (int i = 0; i < nMatched; ++i)
+                {
+                    const int trainIdx = trainIdxPtr[i];
+
+                    const int imgIdx = imgIdxPtr ? imgIdxPtr[i] : 0; // single-train layout has no imgIdx rows
+                    const float distance = distancePtr[i];
+
+                    DMatch m(queryIdx, trainIdx, imgIdx, distance);
+
+                    curMatches[i] = m;
+                }
+
+                std::sort(curMatches.begin(), curMatches.end());
+            }
+            // NOTE(review): stepping by cols assumes rows are stored back to back (continuous Mat) - confirm for caller-supplied Mats.
+            trainIdxPtr += gpu_matches.cols;
+            distancePtr += gpu_matches.cols;
+            if (imgIdxPtr)
+                imgIdxPtr += gpu_matches.cols;
         }
     }
 }
 
-////////////////////////////////////////////////////////////////////
-// RadiusMatch
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
-    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
-    const GpuMat& mask, Stream& stream)
+Ptr<cv::cuda::DescriptorMatcher> cv::cuda::DescriptorMatcher::createBFMatcher(int norm)
 {
-    if (query.empty() || train.empty())
-        return;
-
-    using namespace cv::cuda::device::bf_radius_match;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
-                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-    };
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
-    if (trainIdx.empty())
-    {
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
-    }
-
-    nMatches.setTo(Scalar::all(0), stream);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-    Mat nMatchesCPU(nMatches);
-
-    radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);
-
-    const int nQuery = trainIdx.rows;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* nMatches_ptr = nMatches.ptr<int>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
-        const float* distance_ptr = distance.ptr<float>(queryIdx);
-
-        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
-
-        if (nMatched == 0)
-        {
-            if (!compactResult)
-                matches.push_back(std::vector<DMatch>());
-            continue;
-        }
-
-        matches.push_back(std::vector<DMatch>(nMatched));
-        std::vector<DMatch>& curMatches = matches.back();
-
-        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
-        {
-            int _trainIdx = *trainIdx_ptr;
-
-            float _distance = *distance_ptr;
-
-            DMatch m(queryIdx, _trainIdx, 0, _distance);
-
-            curMatches[i] = m;
-        }
-
-        sort(curMatches.begin(), curMatches.end());
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat& query, const GpuMat& train,
-    std::vector< std::vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)
-{
-    GpuMat trainIdx, distance, nMatches;
-    radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask);
-    radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
-    float maxDistance, const std::vector<GpuMat>& masks, Stream& stream)
-{
-    if (query.empty() || empty())
-        return;
-
-    using namespace cv::cuda::device::bf_radius_match;
-
-    typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
-                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-                             cudaStream_t stream);
-
-    static const caller_t callersL1[] =
-    {
-        matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-        matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-        matchL1_gpu<int>, matchL1_gpu<float>
-    };
-    static const caller_t callersL2[] =
-    {
-        0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-        0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-        0/*matchL2_gpu<int>*/, matchL2_gpu<float>
-    };
-    static const caller_t callersHamming[] =
-    {
-        matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-        matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-        matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-    };
-
-    const int nQuery = query.rows;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));
-    CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING);
-
-    const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming;
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
-    if (trainIdx.empty())
-    {
-        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
-        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
-    }
-
-    nMatches.setTo(Scalar::all(0), stream);
-
-    caller_t func = callers[query.depth()];
-    CV_Assert(func != 0);
-
-    std::vector<PtrStepSzb> trains_(trainDescCollection.begin(), trainDescCollection.end());
-    std::vector<PtrStepSzb> masks_(masks.begin(), masks.end());
-
-    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
-        trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-    Mat nMatchesCPU(nMatches);
-
-    radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size());
-    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);
-
-    const int nQuery = trainIdx.rows;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int* nMatches_ptr = nMatches.ptr<int>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
-        const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
-        const float* distance_ptr = distance.ptr<float>(queryIdx);
-
-        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
-
-        if (nMatched == 0)
-        {
-            if (!compactResult)
-                matches.push_back(std::vector<DMatch>());
-            continue;
-        }
-
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatched);
-
-        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-        {
-            int _trainIdx = *trainIdx_ptr;
-            int _imgIdx = *imgIdx_ptr;
-            float _distance = *distance_ptr;
-
-            DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
-
-            curMatches.push_back(m);
-        }
-
-        sort(curMatches.begin(), curMatches.end());
-    }
-}
-
-void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches,
-    float maxDistance, const std::vector<GpuMat>& masks, bool compactResult)
-{
-    GpuMat trainIdx, imgIdx, distance, nMatches;
-    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
-    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
+    return makePtr<BFMatcher_Impl>(norm);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudafeatures2d/src/cuda/fast.cu b/modules/cudafeatures2d/src/cuda/fast.cu
index 7aa888ac3f..72235d4e50 100644
--- a/modules/cudafeatures2d/src/cuda/fast.cu
+++ b/modules/cudafeatures2d/src/cuda/fast.cu
@@ -279,7 +279,7 @@ namespace cv { namespace cuda { namespace device
             #endif
         }
 
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
         {
             void* counter_ptr;
             cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@@ -290,29 +290,29 @@ namespace cv { namespace cuda { namespace device
             grid.x = divUp(img.cols - 6, block.x);
             grid.y = divUp(img.rows - 6, block.y);
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
 
             if (score.data)
             {
                 if (mask.data)
-                    calcKeypoints<true><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
                 else
-                    calcKeypoints<true><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
             }
             else
             {
                 if (mask.data)
-                    calcKeypoints<false><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
                 else
-                    calcKeypoints<false><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
+                    calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
             }
 
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             unsigned int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return count;
         }
@@ -356,7 +356,7 @@ namespace cv { namespace cuda { namespace device
             #endif
         }
 
-        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
+        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
         {
             void* counter_ptr;
             cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@@ -366,15 +366,15 @@ namespace cv { namespace cuda { namespace device
             dim3 grid;
             grid.x = divUp(count, block.x);
 
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
 
-            nonmaxSuppression<<<grid, block>>>(kpLoc, count, score, loc, response);
+            nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
             cudaSafeCall( cudaGetLastError() );
 
-            cudaSafeCall( cudaDeviceSynchronize() );
-
             unsigned int new_count;
-            cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
+
+            cudaSafeCall( cudaStreamSynchronize(stream) );
 
             return new_count;
         }
diff --git a/modules/cudafeatures2d/src/fast.cpp b/modules/cudafeatures2d/src/fast.cpp
index aa77aa87bd..2095ef7cf6 100644
--- a/modules/cudafeatures2d/src/fast.cpp
+++ b/modules/cudafeatures2d/src/fast.cpp
@@ -47,124 +47,162 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::cuda::FAST_CUDA::FAST_CUDA(int, bool, double) { throw_no_cuda(); }
-void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::FAST_CUDA::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::FAST_CUDA::release() { throw_no_cuda(); }
-int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
+Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr<cv::cuda::FastFeatureDetector>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
-cv::cuda::FAST_CUDA::FAST_CUDA(int _threshold, bool _nonmaxSuppression, double _keypointsRatio) :
-    nonmaxSuppression(_nonmaxSuppression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
-{
-}
-
-void cv::cuda::FAST_CUDA::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
-{
-    if (image.empty())
-        return;
-
-    (*this)(image, mask, d_keypoints_);
-    downloadKeypoints(d_keypoints_, keypoints);
-}
-
-void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (d_keypoints.empty())
-        return;
-
-    Mat h_keypoints(d_keypoints);
-    convertKeypoints(h_keypoints, keypoints);
-}
-
-void cv::cuda::FAST_CUDA::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (h_keypoints.empty())
-        return;
-
-    CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
-
-    int npoints = h_keypoints.cols;
-
-    keypoints.resize(npoints);
-
-    const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
-    const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
-
-    for (int i = 0; i < npoints; ++i)
-    {
-        KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
-        keypoints[i] = kp;
-    }
-}
-
-void cv::cuda::FAST_CUDA::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
-{
-    calcKeyPointsLocation(img, mask);
-    keypoints.cols = getKeyPoints(keypoints);
-}
-
 namespace cv { namespace cuda { namespace device
 {
     namespace fast
     {
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
-        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
+        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
+        int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
     }
 }}}
 
-int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
+namespace
 {
-    using namespace cv::cuda::device::fast;
-
-    CV_Assert(img.type() == CV_8UC1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
-
-    int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
-
-    ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
-
-    if (nonmaxSuppression)
+    class FAST_Impl : public cv::cuda::FastFeatureDetector
+    {
+    public:
+        FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints);
+
+        virtual void detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask);
+        virtual void detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream);
+
+        virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
+
+        virtual void setThreshold(int threshold) { threshold_ = threshold; }
+        virtual int getThreshold() const { return threshold_; }
+
+        virtual void setNonmaxSuppression(bool f) { nonmaxSuppression_ = f; }
+        virtual bool getNonmaxSuppression() const { return nonmaxSuppression_; }
+
+        virtual void setMaxNumPoints(int max_npoints) { max_npoints_ = max_npoints; }
+        virtual int getMaxNumPoints() const { return max_npoints_; }
+
+        virtual void setType(int type) { CV_Assert( type == TYPE_9_16 ); }
+        virtual int getType() const { return TYPE_9_16; }
+
+    private:
+        int threshold_;
+        bool nonmaxSuppression_;
+        int max_npoints_;
+    };
+
+    FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
+        threshold_(threshold), nonmaxSuppression_(nonmaxSuppression), max_npoints_(max_npoints)
     {
-        ensureSizeIsEnough(img.size(), CV_32SC1, score_);
-        score_.setTo(Scalar::all(0));
     }
 
-    count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSuppression ? score_ : PtrStepSzi(), threshold);
-    count_ = std::min(count_, maxKeypoints);
+    void FAST_Impl::detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask) // blocking wrapper around detectAsync()
+    {
+        if (_image.empty())
+        {
+            keypoints.clear();
+            return;
+        }
 
-    return count_;
+        BufferPool pool(Stream::Null());
+        GpuMat d_keypoints = pool.getBuffer(ROWS_COUNT, max_npoints_, CV_32FC1); // same type detectAsync() requests via ensureSizeIsEnough, so the buffer is reused rather than re-created
+
+        detectAsync(_image, d_keypoints, _mask, Stream::Null()); // runs on the null stream
+        convert(d_keypoints, keypoints); // converts the GPU keypoint matrix into host KeyPoint objects
+    }
+
+    void FAST_Impl::detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream)
+    {
+        using namespace cv::cuda::device::fast;
+
+        const GpuMat img = _image.getGpuMat();
+        const GpuMat mask = _mask.getGpuMat();
+
+        CV_Assert( img.type() == CV_8UC1 );
+        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()) );
+
+        BufferPool pool(stream);
+
+        GpuMat kpLoc = pool.getBuffer(1, max_npoints_, CV_16SC2);
+
+        GpuMat score;
+        if (nonmaxSuppression_)
+        {
+            score = pool.getBuffer(img.size(), CV_32SC1);
+            score.setTo(Scalar::all(0), stream);
+        }
+
+        int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
+        count = std::min(count, max_npoints_);
+
+        if (count == 0)
+        {
+            _keypoints.release();
+            return;
+        }
+
+        ensureSizeIsEnough(ROWS_COUNT, count, CV_32FC1, _keypoints);
+        GpuMat& keypoints = _keypoints.getGpuMatRef();
+
+        if (nonmaxSuppression_)
+        {
+            count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
+            if (count == 0)
+            {
+                keypoints.release();
+            }
+            else
+            {
+                keypoints.cols = count;
+            }
+        }
+        else
+        {
+            GpuMat locRow(1, count, kpLoc.type(), keypoints.ptr(0));
+            kpLoc.colRange(0, count).copyTo(locRow, stream);
+            keypoints.row(1).setTo(Scalar::all(0), stream);
+        }
+    }
+
+    void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
+    {
+        if (_gpu_keypoints.empty())
+        {
+            keypoints.clear();
+            return;
+        }
+
+        Mat h_keypoints;
+        if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_keypoints.getGpuMat().download(h_keypoints);
+        }
+        else
+        {
+            h_keypoints = _gpu_keypoints.getMat();
+        }
+
+        CV_Assert( h_keypoints.rows == ROWS_COUNT );
+        CV_Assert( h_keypoints.elemSize() == 4 );
+
+        const int npoints = h_keypoints.cols;
+
+        keypoints.resize(npoints);
+
+        const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
+        const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
+
+        for (int i = 0; i < npoints; ++i)
+        {
+            KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
+            keypoints[i] = kp;
+        }
+    }
 }
 
-int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat& keypoints)
+Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int threshold, bool nonmaxSuppression, int type, int max_npoints)
 {
-    using namespace cv::cuda::device::fast;
-
-    if (count_ == 0)
-        return 0;
-
-    ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
-
-    if (nonmaxSuppression)
-        return nonmaxSuppression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
-
-    GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
-    kpLoc_.colRange(0, count_).copyTo(locRow);
-    keypoints.row(1).setTo(Scalar::all(0));
-
-    return count_;
-}
-
-void cv::cuda::FAST_CUDA::release()
-{
-    kpLoc_.release();
-    score_.release();
-
-    d_keypoints_.release();
+    CV_Assert( type == TYPE_9_16 );
+    return makePtr<FAST_Impl>(threshold, nonmaxSuppression, max_npoints);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudafeatures2d/src/feature2d_async.cpp b/modules/cudafeatures2d/src/feature2d_async.cpp
new file mode 100644
index 0000000000..202a725376
--- /dev/null
+++ b/modules/cudafeatures2d/src/feature2d_async.cpp
@@ -0,0 +1,85 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+cv::cuda::Feature2DAsync::~Feature2DAsync()
+{
+}
+
+void cv::cuda::Feature2DAsync::detectAsync(InputArray image,
+                                           OutputArray keypoints,
+                                           InputArray mask,
+                                           Stream& stream)
+{
+    if (image.empty())
+    {
+        keypoints.clear();
+        return;
+    }
+
+    detectAndComputeAsync(image, mask, keypoints, noArray(), false, stream);
+}
+
+void cv::cuda::Feature2DAsync::computeAsync(InputArray image,
+                                            OutputArray keypoints,
+                                            OutputArray descriptors,
+                                            Stream& stream)
+{
+    if (image.empty())
+    {
+        descriptors.release();
+        return;
+    }
+
+    detectAndComputeAsync(image, noArray(), keypoints, descriptors, true, stream);
+}
+
+void cv::cuda::Feature2DAsync::detectAndComputeAsync(InputArray /*image*/,
+                                                     InputArray /*mask*/,
+                                                     OutputArray /*keypoints*/,
+                                                     OutputArray /*descriptors*/,
+                                                     bool /*useProvidedKeypoints*/,
+                                                     Stream& /*stream*/)
+{
+    CV_Error(Error::StsNotImplemented, "");
+}
diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp
index 8d8afe8f04..6bfdd5ac47 100644
--- a/modules/cudafeatures2d/src/orb.cpp
+++ b/modules/cudafeatures2d/src/orb.cpp
@@ -47,18 +47,7 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::cuda::ORB_CUDA::ORB_CUDA(int, float, int, int, int, int, int, int) : fastDetector_(20) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::release() { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat&) { throw_no_cuda(); }
-void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat&) { throw_no_cuda(); }
+Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int, float, int, int, int, int, int, int, int, bool) { throw_no_cuda(); return Ptr<cv::cuda::ORB>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -346,7 +335,100 @@ namespace
         -1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/
     };
 
-    void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
+    class ORB_Impl : public cv::cuda::ORB
+    {
+    public:
+        ORB_Impl(int nfeatures,
+                 float scaleFactor,
+                 int nlevels,
+                 int edgeThreshold,
+                 int firstLevel,
+                 int WTA_K,
+                 int scoreType,
+                 int patchSize,
+                 int fastThreshold,
+                 bool blurForDescriptor);
+
+        virtual void detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints);
+        virtual void detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream);
+
+        virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
+
+        virtual int descriptorSize() const { return kBytes; }
+        virtual int descriptorType() const { return CV_8U; }
+        virtual int defaultNorm() const { return NORM_HAMMING; }
+
+        virtual void setMaxFeatures(int maxFeatures) { nFeatures_ = maxFeatures; }
+        virtual int getMaxFeatures() const { return nFeatures_; }
+
+        virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
+        virtual double getScaleFactor() const { return scaleFactor_; }
+
+        virtual void setNLevels(int nlevels) { nLevels_ = nlevels; }
+        virtual int getNLevels() const { return nLevels_; }
+
+        virtual void setEdgeThreshold(int edgeThreshold) { edgeThreshold_ = edgeThreshold; }
+        virtual int getEdgeThreshold() const { return edgeThreshold_; }
+
+        virtual void setFirstLevel(int firstLevel) { firstLevel_ = firstLevel; }
+        virtual int getFirstLevel() const { return firstLevel_; }
+
+        virtual void setWTA_K(int wta_k) { WTA_K_ = wta_k; }
+        virtual int getWTA_K() const { return WTA_K_; }
+
+        virtual void setScoreType(int scoreType) { scoreType_ = scoreType; }
+        virtual int getScoreType() const { return scoreType_; }
+
+        virtual void setPatchSize(int patchSize) { patchSize_ = patchSize; }
+        virtual int getPatchSize() const { return patchSize_; }
+
+        virtual void setFastThreshold(int fastThreshold) { fastThreshold_ = fastThreshold; }
+        virtual int getFastThreshold() const { return fastThreshold_; }
+
+        virtual void setBlurForDescriptor(bool blurForDescriptor) { blurForDescriptor_ = blurForDescriptor; }
+        virtual bool getBlurForDescriptor() const { return blurForDescriptor_; }
+
+    private:
+        int nFeatures_;
+        float scaleFactor_;
+        int nLevels_;
+        int edgeThreshold_;
+        int firstLevel_;
+        int WTA_K_;
+        int scoreType_;
+        int patchSize_;
+        int fastThreshold_;
+        bool blurForDescriptor_;
+
+    private:
+        void buildScalePyramids(InputArray _image, InputArray _mask);
+        void computeKeyPointsPyramid();
+        void computeDescriptors(OutputArray _descriptors);
+        void mergeKeyPoints(OutputArray _keypoints);
+
+    private:
+        Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
+
+        //! The number of desired features per scale
+        std::vector<size_t> n_features_per_level_;
+
+        //! Points to compute BRIEF descriptors from
+        GpuMat pattern_;
+
+        std::vector<GpuMat> imagePyr_;
+        std::vector<GpuMat> maskPyr_;
+
+        GpuMat buf_;
+
+        std::vector<GpuMat> keyPointsPyr_;
+        std::vector<int> keyPointsCount_;
+
+        Ptr<cuda::Filter> blurFilter_;
+
+        GpuMat d_keypoints_;
+    };
+
+    static void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
     {
         RNG rng(0x12345678);
 
@@ -381,7 +463,7 @@ namespace
         }
     }
 
-    void makeRandomPattern(int patchSize, Point* pattern, int npoints)
+    static void makeRandomPattern(int patchSize, Point* pattern, int npoints)
     {
         // we always start with a fixed seed,
         // to make patterns the same on each run
@@ -393,155 +475,189 @@ namespace
             pattern[i].y = rng.uniform(-patchSize / 2, patchSize / 2 + 1);
         }
     }
-}
 
-cv::cuda::ORB_CUDA::ORB_CUDA(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) :
-    nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K),
-    scoreType_(scoreType), patchSize_(patchSize),
-    fastDetector_(DEFAULT_FAST_THRESHOLD)
-{
-    CV_Assert(patchSize_ >= 2);
-
-    // fill the extractors and descriptors for the corresponding scales
-    float factor = 1.0f / scaleFactor_;
-    float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));
-
-    n_features_per_level_.resize(nLevels_);
-    size_t sum_n_features = 0;
-    for (int level = 0; level < nLevels_ - 1; ++level)
+    ORB_Impl::ORB_Impl(int nFeatures,
+                       float scaleFactor,
+                       int nLevels,
+                       int edgeThreshold,
+                       int firstLevel,
+                       int WTA_K,
+                       int scoreType,
+                       int patchSize,
+                       int fastThreshold,
+                       bool blurForDescriptor) :
+        nFeatures_(nFeatures),
+        scaleFactor_(scaleFactor),
+        nLevels_(nLevels),
+        edgeThreshold_(edgeThreshold),
+        firstLevel_(firstLevel),
+        WTA_K_(WTA_K),
+        scoreType_(scoreType),
+        patchSize_(patchSize),
+        fastThreshold_(fastThreshold),
+        blurForDescriptor_(blurForDescriptor)
     {
-        n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
-        sum_n_features += n_features_per_level_[level];
-        n_desired_features_per_scale *= factor;
-    }
-    n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;
+        CV_Assert( patchSize_ >= 2 );
+        CV_Assert( WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4 );
 
-    // pre-compute the end of a row in a circular patch
-    int half_patch_size = patchSize_ / 2;
-    std::vector<int> u_max(half_patch_size + 2);
-    for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
-        u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
+        fastDetector_ = cuda::FastFeatureDetector::create(fastThreshold_);
 
-    // Make sure we are symmetric
-    for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
-    {
-        while (u_max[v_0] == u_max[v_0 + 1])
-            ++v_0;
-        u_max[v] = v_0;
-        ++v_0;
-    }
-    CV_Assert(u_max.size() < 32);
-    cv::cuda::device::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));
+        // fill the extractors and descriptors for the corresponding scales
+        float factor = 1.0f / scaleFactor_;
+        float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));
 
-    // Calc pattern
-    const int npoints = 512;
-    Point pattern_buf[npoints];
-    const Point* pattern0 = (const Point*)bit_pattern_31_;
-    if (patchSize_ != 31)
-    {
-        pattern0 = pattern_buf;
-        makeRandomPattern(patchSize_, pattern_buf, npoints);
-    }
-
-    CV_Assert(WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4);
-
-    Mat h_pattern;
-
-    if (WTA_K_ == 2)
-    {
-        h_pattern.create(2, npoints, CV_32SC1);
-
-        int* pattern_x_ptr = h_pattern.ptr<int>(0);
-        int* pattern_y_ptr = h_pattern.ptr<int>(1);
-
-        for (int i = 0; i < npoints; ++i)
+        n_features_per_level_.resize(nLevels_);
+        size_t sum_n_features = 0;
+        for (int level = 0; level < nLevels_ - 1; ++level)
         {
-            pattern_x_ptr[i] = pattern0[i].x;
-            pattern_y_ptr[i] = pattern0[i].y;
+            n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
+            sum_n_features += n_features_per_level_[level];
+            n_desired_features_per_scale *= factor;
         }
-    }
-    else
-    {
-        int ntuples = descriptorSize() * 4;
-        initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
-    }
+        n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;
 
-    pattern_.upload(h_pattern);
-
-    blurFilter = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
-
-    blurForDescriptor = false;
-}
-
-namespace
-{
-    inline float getScale(float scaleFactor, int firstLevel, int level)
-    {
-        return pow(scaleFactor, level - firstLevel);
-    }
-}
-
-void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat& image, const GpuMat& mask)
-{
-    CV_Assert(image.type() == CV_8UC1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
-
-    imagePyr_.resize(nLevels_);
-    maskPyr_.resize(nLevels_);
-
-    for (int level = 0; level < nLevels_; ++level)
-    {
-        float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
-
-        Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
-
-        ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
-        ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
-        maskPyr_[level].setTo(Scalar::all(255));
-
-        // Compute the resized image
-        if (level != firstLevel_)
+        // pre-compute the end of a row in a circular patch
+        int half_patch_size = patchSize_ / 2;
+        std::vector<int> u_max(half_patch_size + 2);
+        for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
         {
-            if (level < firstLevel_)
-            {
-                cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+            u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
+        }
 
-                if (!mask.empty())
-                    cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
-            }
-            else
-            {
-                cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+        // Make sure we are symmetric
+        for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
+        {
+            while (u_max[v_0] == u_max[v_0 + 1])
+                ++v_0;
+            u_max[v] = v_0;
+            ++v_0;
+        }
+        CV_Assert( u_max.size() < 32 );
+        cv::cuda::device::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));
 
-                if (!mask.empty())
-                {
-                    cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
-                    cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
-                }
+        // Calc pattern
+        const int npoints = 512;
+        Point pattern_buf[npoints];
+        const Point* pattern0 = (const Point*)bit_pattern_31_;
+        if (patchSize_ != 31)
+        {
+            pattern0 = pattern_buf;
+            makeRandomPattern(patchSize_, pattern_buf, npoints);
+        }
+
+        Mat h_pattern;
+        if (WTA_K_ == 2)
+        {
+            h_pattern.create(2, npoints, CV_32SC1);
+
+            int* pattern_x_ptr = h_pattern.ptr<int>(0);
+            int* pattern_y_ptr = h_pattern.ptr<int>(1);
+
+            for (int i = 0; i < npoints; ++i)
+            {
+                pattern_x_ptr[i] = pattern0[i].x;
+                pattern_y_ptr[i] = pattern0[i].y;
             }
         }
         else
         {
-            image.copyTo(imagePyr_[level]);
-
-            if (!mask.empty())
-                mask.copyTo(maskPyr_[level]);
+            int ntuples = descriptorSize() * 4;
+            initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
         }
 
-        // Filter keypoints by image border
-        ensureSizeIsEnough(sz, CV_8UC1, buf_);
-        buf_.setTo(Scalar::all(0));
-        Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
-        buf_(inner).setTo(Scalar::all(255));
+        pattern_.upload(h_pattern);
 
-        cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
+        blurFilter_ = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
     }
-}
 
-namespace
-{
-    //takes keypoints and culls them by the response
-    void cull(GpuMat& keypoints, int& count, int n_points)
+    void ORB_Impl::detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints)
+    {
+        CV_Assert( useProvidedKeypoints == false );
+
+        detectAndComputeAsync(_image, _mask, d_keypoints_, _descriptors, false, Stream::Null());
+        convert(d_keypoints_, keypoints);
+    }
+
+    void ORB_Impl::detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream)
+    {
+        CV_Assert( useProvidedKeypoints == false );
+
+        buildScalePyramids(_image, _mask);
+        computeKeyPointsPyramid();
+        if (_descriptors.needed())
+        {
+            computeDescriptors(_descriptors);
+        }
+        mergeKeyPoints(_keypoints);
+    }
+
+    static float getScale(float scaleFactor, int firstLevel, int level)
+    {
+        return pow(scaleFactor, level - firstLevel);
+    }
+
+    void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask)
+    {
+        const GpuMat image = _image.getGpuMat();
+        const GpuMat mask = _mask.getGpuMat();
+
+        CV_Assert( image.type() == CV_8UC1 );
+        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
+
+        imagePyr_.resize(nLevels_);
+        maskPyr_.resize(nLevels_);
+
+        for (int level = 0; level < nLevels_; ++level)
+        {
+            float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
+
+            Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
+
+            ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
+            ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
+            maskPyr_[level].setTo(Scalar::all(255));
+
+            // Compute the resized image
+            if (level != firstLevel_)
+            {
+                if (level < firstLevel_)
+                {
+                    cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+
+                    if (!mask.empty())
+                        cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
+                }
+                else
+                {
+                    cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
+
+                    if (!mask.empty())
+                    {
+                        cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
+                        cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
+                    }
+                }
+            }
+            else
+            {
+                image.copyTo(imagePyr_[level]);
+
+                if (!mask.empty())
+                    mask.copyTo(maskPyr_[level]);
+            }
+
+            // Filter keypoints by image border
+            ensureSizeIsEnough(sz, CV_8UC1, buf_);
+            buf_.setTo(Scalar::all(0));
+            Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
+            buf_(inner).setTo(Scalar::all(255));
+
+            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
+        }
+    }
+
+    // takes keypoints and culls them by the response
+    static void cull(GpuMat& keypoints, int& count, int n_points)
     {
         using namespace cv::cuda::device::orb;
 
@@ -554,222 +670,199 @@ namespace
                 return;
             }
 
-            count = cull_gpu(keypoints.ptr<int>(FAST_CUDA::LOCATION_ROW), keypoints.ptr<float>(FAST_CUDA::RESPONSE_ROW), count, n_points);
+            count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
         }
     }
-}
 
-void cv::cuda::ORB_CUDA::computeKeyPointsPyramid()
-{
-    using namespace cv::cuda::device::orb;
-
-    int half_patch_size = patchSize_ / 2;
-
-    keyPointsPyr_.resize(nLevels_);
-    keyPointsCount_.resize(nLevels_);
-
-    for (int level = 0; level < nLevels_; ++level)
+    void ORB_Impl::computeKeyPointsPyramid() // per pyramid level: detect FAST keypoints, cull by response, compute orientation
     {
-        keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]);
+        using namespace cv::cuda::device::orb;
 
-        if (keyPointsCount_[level] == 0)
-            continue;
+        int half_patch_size = patchSize_ / 2;
 
-        ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
+        keyPointsPyr_.resize(nLevels_);
+        keyPointsCount_.resize(nLevels_);
 
-        GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
-        keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
+        fastDetector_->setThreshold(fastThreshold_);
 
-        if (keyPointsCount_[level] == 0)
-            continue;
-
-        int n_features = static_cast<int>(n_features_per_level_[level]);
-
-        if (scoreType_ == ORB::HARRIS_SCORE)
+        for (int level = 0; level < nLevels_; ++level)
         {
-            // Keep more points than necessary as FAST does not give amazing corners
-            cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
+            fastDetector_->setMaxNumPoints(static_cast<int>(0.05 * imagePyr_[level].size().area())); // cap at 5% of the level's pixel count; explicit cast avoids an implicit double->int narrowing
 
-            // Compute the Harris cornerness (better scoring than FAST)
-            HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
+            GpuMat fastKpRange;
+            fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());
+
+            keyPointsCount_[level] = fastKpRange.cols; // one column per detected keypoint
+
+            if (keyPointsCount_[level] == 0)
+                continue;
+
+            ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
+            fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); // rows 0-1 take the detector output; row 2 is passed to IC_Angle_gpu below
+
+            const int n_features = static_cast<int>(n_features_per_level_[level]);
+
+            if (scoreType_ == ORB::HARRIS_SCORE)
+            {
+                // Keep more points than necessary as FAST does not give amazing corners
+                cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
+
+                // Compute the Harris cornerness (better scoring than FAST)
+                HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
+            }
+
+            // Cull to the final desired level, using the new Harris scores or the original FAST scores.
+            cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
+
+            // Compute orientation
+            IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
         }
-
-        //cull to the final desired level, using the new Harris scores or the original FAST scores.
-        cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
-
-        // Compute orientation
-        IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
-    }
-}
-
-void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat& descriptors)
-{
-    using namespace cv::cuda::device::orb;
-
-    int nAllkeypoints = 0;
-
-    for (int level = 0; level < nLevels_; ++level)
-        nAllkeypoints += keyPointsCount_[level];
-
-    if (nAllkeypoints == 0)
-    {
-        descriptors.release();
-        return;
     }
 
-    ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, descriptors);
-
-    int offset = 0;
-
-    for (int level = 0; level < nLevels_; ++level)
+    void ORB_Impl::computeDescriptors(OutputArray _descriptors) // fills _descriptors with one descriptorSize()-column CV_8UC1 row per keypoint, levels concatenated in order
     {
-        if (keyPointsCount_[level] == 0)
-            continue;
+        using namespace cv::cuda::device::orb;
 
-        GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
+        int nAllkeypoints = 0;
 
-        if (blurForDescriptor)
+        for (int level = 0; level < nLevels_; ++level)
+            nAllkeypoints += keyPointsCount_[level];
+
+        if (nAllkeypoints == 0)
         {
-            // preprocess the resized image
-            ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_);
+            _descriptors.release(); // no keypoints on any level: hand back an empty output
+            return;
         }
 
-        computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
-            keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
+        ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, _descriptors);
+        GpuMat descriptors = _descriptors.getGpuMat(); // per-level row ranges are sliced from this below
 
-        offset += keyPointsCount_[level];
+        int offset = 0;
+
+        for (int level = 0; level < nLevels_; ++level)
+        {
+            if (keyPointsCount_[level] == 0)
+                continue;
+
+            GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]); // rows [offset, offset + count) receive this level's descriptors
+
+            if (blurForDescriptor_)
+            {
+                // preprocess the resized image
+                ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
+                blurFilter_->apply(imagePyr_[level], buf_);
+            }
+
+            computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), // input is the blurred copy when blurring is enabled, else the pyramid image itself
+                keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
+
+            offset += keyPointsCount_[level]; // advance to the first row of the next level
+        }
     }
-}
 
-void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat& keypoints)
-{
-    using namespace cv::cuda::device::orb;
-
-    int nAllkeypoints = 0;
-
-    for (int level = 0; level < nLevels_; ++level)
-        nAllkeypoints += keyPointsCount_[level];
-
-    if (nAllkeypoints == 0)
+    void ORB_Impl::mergeKeyPoints(OutputArray _keypoints) // packs the per-level keypoints into a single ROWS_COUNT x N CV_32FC1 matrix
     {
-        keypoints.release();
-        return;
+        using namespace cv::cuda::device::orb;
+
+        int nAllkeypoints = 0;
+
+        for (int level = 0; level < nLevels_; ++level)
+            nAllkeypoints += keyPointsCount_[level];
+
+        if (nAllkeypoints == 0)
+        {
+            _keypoints.release(); // no keypoints on any level: hand back an empty output
+            return;
+        }
+
+        ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, _keypoints);
+        GpuMat& keypoints = _keypoints.getGpuMatRef();
+
+        int offset = 0;
+
+        for (int level = 0; level < nLevels_; ++level)
+        {
+            if (keyPointsCount_[level] == 0)
+                continue;
+
+            float sf = getScale(scaleFactor_, firstLevel_, level);
+
+            GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]); // columns [offset, offset + count) belong to this level
+
+            float locScale = level != firstLevel_ ? sf : 1.0f; // scale handed to mergeLocation_gpu: 1 for firstLevel_, sf otherwise
+
+            mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
+
+            GpuMat range = keyPointsRange.rowRange(2, 4);
+            keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range); // pyramid rows 1-2 go to output rows 2-3 (presumably RESPONSE_ROW/ANGLE_ROW - confirm against the enum)
+
+            keyPointsRange.row(4).setTo(Scalar::all(level)); // row 4: pyramid level (presumably OCTAVE_ROW - confirm)
+            keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); // row 5: patch size scaled by sf (presumably SIZE_ROW - confirm)
+
+            offset += keyPointsCount_[level]; // advance to the first column of the next level
+        }
     }
 
-    ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, keypoints);
-
-    int offset = 0;
-
-    for (int level = 0; level < nLevels_; ++level)
+    void ORB_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints) // unpacks a ROWS_COUNT x N CV_32FC1 keypoint matrix (device or host) into KeyPoint objects
     {
-        if (keyPointsCount_[level] == 0)
-            continue;
+        if (_gpu_keypoints.empty())
+        {
+            keypoints.clear(); // empty input yields an empty vector rather than an assertion failure
+            return;
+        }
 
-        float sf = getScale(scaleFactor_, firstLevel_, level);
+        Mat h_keypoints;
+        if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_keypoints.getGpuMat().download(h_keypoints); // device input: copy to host first
+        }
+        else
+        {
+            h_keypoints = _gpu_keypoints.getMat(); // any other input kind is read through getMat()
+        }
 
-        GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]);
+        CV_Assert( h_keypoints.rows == ROWS_COUNT );
+        CV_Assert( h_keypoints.type() == CV_32FC1 );
 
-        float locScale = level != firstLevel_ ? sf : 1.0f;
+        const int npoints = h_keypoints.cols; // one column per keypoint
 
-        mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
+        keypoints.resize(npoints);
 
-        GpuMat range = keyPointsRange.rowRange(2, 4);
-        keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
+        const float* x_ptr = h_keypoints.ptr<float>(X_ROW);
+        const float* y_ptr = h_keypoints.ptr<float>(Y_ROW);
+        const float* response_ptr = h_keypoints.ptr<float>(RESPONSE_ROW);
+        const float* angle_ptr = h_keypoints.ptr<float>(ANGLE_ROW);
+        const float* octave_ptr = h_keypoints.ptr<float>(OCTAVE_ROW);
+        const float* size_ptr = h_keypoints.ptr<float>(SIZE_ROW);
 
-        keyPointsRange.row(4).setTo(Scalar::all(level));
-        keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
+        for (int i = 0; i < npoints; ++i)
+        {
+            KeyPoint kp;
 
-        offset += keyPointsCount_[level];
+            kp.pt.x = x_ptr[i];
+            kp.pt.y = y_ptr[i];
+            kp.response = response_ptr[i];
+            kp.angle = angle_ptr[i];
+            kp.octave = static_cast<int>(octave_ptr[i]); // octave is stored as float in the matrix; convert back to int
+            kp.size = size_ptr[i];
+
+            keypoints[i] = kp;
+        }
     }
 }
 
-void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat &d_keypoints, std::vector<KeyPoint>& keypoints)
+Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int nfeatures,
+                                         float scaleFactor,
+                                         int nlevels,
+                                         int edgeThreshold,
+                                         int firstLevel,
+                                         int WTA_K,
+                                         int scoreType,
+                                         int patchSize,
+                                         int fastThreshold,
+                                         bool blurForDescriptor)
 {
-    if (d_keypoints.empty())
-    {
-        keypoints.clear();
-        return;
-    }
-
-    Mat h_keypoints(d_keypoints);
-
-    convertKeyPoints(h_keypoints, keypoints);
-}
-
-void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat &d_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (d_keypoints.empty())
-    {
-        keypoints.clear();
-        return;
-    }
-
-    CV_Assert(d_keypoints.type() == CV_32FC1 && d_keypoints.rows == ROWS_COUNT);
-
-    const float* x_ptr = d_keypoints.ptr<float>(X_ROW);
-    const float* y_ptr = d_keypoints.ptr<float>(Y_ROW);
-    const float* response_ptr = d_keypoints.ptr<float>(RESPONSE_ROW);
-    const float* angle_ptr = d_keypoints.ptr<float>(ANGLE_ROW);
-    const float* octave_ptr = d_keypoints.ptr<float>(OCTAVE_ROW);
-    const float* size_ptr = d_keypoints.ptr<float>(SIZE_ROW);
-
-    keypoints.resize(d_keypoints.cols);
-
-    for (int i = 0; i < d_keypoints.cols; ++i)
-    {
-        KeyPoint kp;
-
-        kp.pt.x = x_ptr[i];
-        kp.pt.y = y_ptr[i];
-        kp.response = response_ptr[i];
-        kp.angle = angle_ptr[i];
-        kp.octave = static_cast<int>(octave_ptr[i]);
-        kp.size = size_ptr[i];
-
-        keypoints[i] = kp;
-    }
-}
-
-void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
-{
-    buildScalePyramids(image, mask);
-    computeKeyPointsPyramid();
-    mergeKeyPoints(keypoints);
-}
-
-void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors)
-{
-    buildScalePyramids(image, mask);
-    computeKeyPointsPyramid();
-    computeDescriptors(descriptors);
-    mergeKeyPoints(keypoints);
-}
-
-void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
-{
-    (*this)(image, mask, d_keypoints_);
-    downloadKeyPoints(d_keypoints_, keypoints);
-}
-
-void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors)
-{
-    (*this)(image, mask, d_keypoints_, descriptors);
-    downloadKeyPoints(d_keypoints_, keypoints);
-}
-
-void cv::cuda::ORB_CUDA::release()
-{
-    imagePyr_.clear();
-    maskPyr_.clear();
-
-    buf_.release();
-
-    keyPointsPyr_.clear();
-
-    fastDetector_.release();
-
-    d_keypoints_.release();
+    return makePtr<ORB_Impl>(nfeatures, scaleFactor, nlevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize, fastThreshold, blurForDescriptor); // factory: forwards all ten arguments, in declaration order, to the ORB_Impl constructor
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp
index 6e4479b7d5..3046a604b3 100644
--- a/modules/cudafeatures2d/test/test_features2d.cpp
+++ b/modules/cudafeatures2d/test/test_features2d.cpp
@@ -76,15 +76,14 @@ CUDA_TEST_P(FAST, Accuracy)
     cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(image.empty());
 
-    cv::cuda::FAST_CUDA fast(threshold);
-    fast.nonmaxSuppression = nonmaxSuppression;
+    cv::Ptr<cv::cuda::FastFeatureDetector> fast = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
 
     if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
     {
         try
         {
             std::vector<cv::KeyPoint> keypoints;
-            fast(loadMat(image), cv::cuda::GpuMat(), keypoints);
+            fast->detect(loadMat(image), keypoints);
         }
         catch (const cv::Exception& e)
         {
@@ -94,7 +93,7 @@ CUDA_TEST_P(FAST, Accuracy)
     else
     {
         std::vector<cv::KeyPoint> keypoints;
-        fast(loadMat(image), cv::cuda::GpuMat(), keypoints);
+        fast->detect(loadMat(image), keypoints);
 
         std::vector<cv::KeyPoint> keypoints_gold;
         cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
@@ -123,7 +122,7 @@ namespace
     IMPLEMENT_PARAM_CLASS(ORB_BlurForDescriptor, bool)
 }
 
-CV_ENUM(ORB_ScoreType, ORB::HARRIS_SCORE, ORB::FAST_SCORE)
+CV_ENUM(ORB_ScoreType, cv::ORB::HARRIS_SCORE, cv::ORB::FAST_SCORE)
 
 PARAM_TEST_CASE(ORB, cv::cuda::DeviceInfo, ORB_FeaturesCount, ORB_ScaleFactor, ORB_LevelsCount, ORB_EdgeThreshold, ORB_firstLevel, ORB_WTA_K, ORB_ScoreType, ORB_PatchSize, ORB_BlurForDescriptor)
 {
@@ -163,8 +162,9 @@ CUDA_TEST_P(ORB, Accuracy)
     cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1));
     mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));
 
-    cv::cuda::ORB_CUDA orb(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
-    orb.blurForDescriptor = blurForDescriptor;
+    cv::Ptr<cv::cuda::ORB> orb =
+            cv::cuda::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel,
+                                  WTA_K, scoreType, patchSize, 20, blurForDescriptor);
 
     if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
     {
@@ -172,7 +172,7 @@ CUDA_TEST_P(ORB, Accuracy)
         {
             std::vector<cv::KeyPoint> keypoints;
             cv::cuda::GpuMat descriptors;
-            orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+            orb->detectAndComputeAsync(loadMat(image), loadMat(mask), keypoints, descriptors);
         }
         catch (const cv::Exception& e)
         {
@@ -183,7 +183,7 @@ CUDA_TEST_P(ORB, Accuracy)
     {
         std::vector<cv::KeyPoint> keypoints;
         cv::cuda::GpuMat descriptors;
-        orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+        orb->detectAndCompute(loadMat(image), loadMat(mask), keypoints, descriptors);
 
         cv::Ptr<cv::ORB> orb_gold = cv::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
 
@@ -208,7 +208,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Features2D, ORB,  testing::Combine(
     testing::Values(ORB_ScaleFactor(1.2f)),
     testing::Values(ORB_LevelsCount(4), ORB_LevelsCount(8)),
     testing::Values(ORB_EdgeThreshold(31)),
-    testing::Values(ORB_firstLevel(0), ORB_firstLevel(2)),
+    testing::Values(ORB_firstLevel(0)),
     testing::Values(ORB_WTA_K(2), ORB_WTA_K(3), ORB_WTA_K(4)),
     testing::Values(ORB_ScoreType(cv::ORB::HARRIS_SCORE)),
     testing::Values(ORB_PatchSize(31), ORB_PatchSize(29)),
@@ -285,7 +285,8 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::cuda::DeviceInfo, NormCode, DescriptorSiz
 
 CUDA_TEST_P(BruteForceMatcher, Match_Single)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     cv::cuda::GpuMat mask;
     if (useMask)
@@ -295,7 +296,7 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
     }
 
     std::vector<cv::DMatch> matches;
-    matcher.match(loadMat(query), loadMat(train), matches, mask);
+    matcher->match(loadMat(query), loadMat(train), matches, mask);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -312,13 +313,14 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
 
 CUDA_TEST_P(BruteForceMatcher, Match_Collection)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     cv::cuda::GpuMat d_train(train);
 
     // make add() twice to test such case
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
 
     // prepare masks (make first nearest match illegal)
     std::vector<cv::cuda::GpuMat> masks(2);
@@ -331,9 +333,9 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
 
     std::vector<cv::DMatch> matches;
     if (useMask)
-        matcher.match(cv::cuda::GpuMat(query), matches, masks);
+        matcher->match(cv::cuda::GpuMat(query), matches, masks);
     else
-        matcher.match(cv::cuda::GpuMat(query), matches);
+        matcher->match(cv::cuda::GpuMat(query), matches);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -366,7 +368,8 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
 
 CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const int knn = 2;
 
@@ -378,7 +381,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
     }
 
     std::vector< std::vector<cv::DMatch> > matches;
-    matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
+    matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -405,7 +408,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 
 CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const int knn = 3;
 
@@ -417,7 +421,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
     }
 
     std::vector< std::vector<cv::DMatch> > matches;
-    matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
+    matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -444,15 +448,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
 
 CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const int knn = 2;
 
     cv::cuda::GpuMat d_train(train);
 
     // make add() twice to test such case
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
 
     // prepare masks (make first nearest match illegal)
     std::vector<cv::cuda::GpuMat> masks(2);
@@ -466,9 +471,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
     std::vector< std::vector<cv::DMatch> > matches;
 
     if (useMask)
-        matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
+        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
     else
-        matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn);
+        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -506,15 +511,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 
 CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const int knn = 3;
 
     cv::cuda::GpuMat d_train(train);
 
     // make add() twice to test such case
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
 
     // prepare masks (make first nearest match illegal)
     std::vector<cv::cuda::GpuMat> masks(2);
@@ -528,9 +534,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
     std::vector< std::vector<cv::DMatch> > matches;
 
     if (useMask)
-        matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
+        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
     else
-        matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn);
+        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);
 
     ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -568,7 +574,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 
 CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const float radius = 1.f / countFactor;
 
@@ -577,7 +584,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
         try
         {
             std::vector< std::vector<cv::DMatch> > matches;
-            matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
+            matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius);
         }
         catch (const cv::Exception& e)
         {
@@ -594,7 +601,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
         }
 
         std::vector< std::vector<cv::DMatch> > matches;
-        matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius, mask);
+        matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius, mask);
 
         ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
@@ -617,7 +624,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 
 CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
 {
-    cv::cuda::BFMatcher_CUDA matcher(normCode);
+    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
+            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
 
     const int n = 3;
     const float radius = 1.f / countFactor * n;
@@ -625,8 +633,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
     cv::cuda::GpuMat d_train(train);
 
     // make add() twice to test such case
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
-    matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
+    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
 
     // prepare masks (make first nearest match illegal)
     std::vector<cv::cuda::GpuMat> masks(2);
@@ -642,7 +650,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
         try
         {
             std::vector< std::vector<cv::DMatch> > matches;
-            matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
+            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
         }
         catch (const cv::Exception& e)
         {
@@ -654,9 +662,9 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
         std::vector< std::vector<cv::DMatch> > matches;
 
         if (useMask)
-            matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
+            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
         else
-            matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius);
+            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius);
 
         ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
 
diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp
index 2ab35ccee5..ed72a3ab5c 100644
--- a/modules/cudafilters/src/filtering.cpp
+++ b/modules/cudafilters/src/filtering.cpp
@@ -542,7 +542,7 @@ namespace
             anchor_ = Point(iters_, iters_);
             iters_ = 1;
         }
-        else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total())
+        else if (iters_ > 1 && cv::countNonZero(kernel) == (int) kernel.total())
         {
             anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
             kernel = getStructuringElement(MORPH_RECT,
diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp
index 162ee469ce..73221c44d1 100644
--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -81,7 +81,6 @@ namespace
         GpuMat Dy_;
         GpuMat buf_;
         GpuMat eig_;
-        GpuMat minMaxbuf_;
         GpuMat tmpCorners_;
     };
 
@@ -112,7 +111,7 @@ namespace
         cornerCriteria_->compute(image, eig_);
 
         double maxVal = 0;
-        cuda::minMax(eig_, 0, &maxVal, noArray(), minMaxbuf_);
+        cuda::minMax(eig_, 0, &maxVal);
 
         ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
 
diff --git a/modules/cudaimgproc/src/match_template.cpp b/modules/cudaimgproc/src/match_template.cpp
index c5ab143ec7..25c42dfd96 100644
--- a/modules/cudaimgproc/src/match_template.cpp
+++ b/modules/cudaimgproc/src/match_template.cpp
@@ -271,7 +271,6 @@ namespace
     private:
         Match_CCORR_8U match_CCORR_;
         GpuMat image_sqsums_;
-        GpuMat intBuffer_;
     };
 
     void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
@@ -288,7 +287,7 @@ namespace
         match_CCORR_.match(image, templ, _result, stream);
         GpuMat result = _result.getGpuMat();
 
-        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+        cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
 
         double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
@@ -335,7 +334,6 @@ namespace
 
     private:
         GpuMat image_sqsums_;
-        GpuMat intBuffer_;
         Match_CCORR_8U match_CCORR_;
     };
 
@@ -359,7 +357,7 @@ namespace
             return;
         }
 
-        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+        cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
 
         double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
@@ -383,7 +381,6 @@ namespace
 
     private:
         GpuMat image_sqsums_;
-        GpuMat intBuffer_;
         Match_CCORR_8U match_CCORR_;
     };
 
@@ -398,7 +395,7 @@ namespace
         CV_Assert( image.type() == templ.type() );
         CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
 
-        cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
+        cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
 
         double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
@@ -421,7 +418,6 @@ namespace
         void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
 
     private:
-        GpuMat intBuffer_;
         std::vector<GpuMat> images_;
         std::vector<GpuMat> image_sums_;
         Match_CCORR_8U match_CCORR_;
@@ -444,7 +440,7 @@ namespace
         if (image.channels() == 1)
         {
             image_sums_.resize(1);
-            cuda::integral(image, image_sums_[0], intBuffer_, stream);
+            cuda::integral(image, image_sums_[0], stream);
 
             int templ_sum = (int) cuda::sum(templ)[0];
 
@@ -456,7 +452,7 @@ namespace
 
             image_sums_.resize(images_.size());
             for (int i = 0; i < image.channels(); ++i)
-                cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
+                cuda::integral(images_[i], image_sums_[i], stream);
 
             Scalar templ_sum = cuda::sum(templ);
 
@@ -501,7 +497,6 @@ namespace
     private:
         GpuMat imagef_, templf_;
         Match_CCORR_32F match_CCORR_32F_;
-        GpuMat intBuffer_;
         std::vector<GpuMat> images_;
         std::vector<GpuMat> image_sums_;
         std::vector<GpuMat> image_sqsums_;
@@ -527,10 +522,10 @@ namespace
         if (image.channels() == 1)
         {
             image_sums_.resize(1);
-            cuda::integral(image, image_sums_[0], intBuffer_, stream);
+            cuda::integral(image, image_sums_[0], stream);
 
             image_sqsums_.resize(1);
-            cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
+            cuda::sqrIntegral(image, image_sqsums_[0], stream);
 
             int templ_sum = (int) cuda::sum(templ)[0];
             double templ_sqsum = cuda::sqrSum(templ)[0];
@@ -547,8 +542,8 @@ namespace
             image_sqsums_.resize(images_.size());
             for (int i = 0; i < image.channels(); ++i)
             {
-                cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
-                cuda::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
+                cuda::integral(images_[i], image_sums_[i], stream);
+                cuda::sqrIntegral(images_[i], image_sqsums_[i], stream);
             }
 
             Scalar templ_sum = cuda::sum(templ);
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy.hpp b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
index a72ef09c75..5e57733857 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@@ -43,6 +43,7 @@
 #ifndef __OPENCV_CUDALEGACY_HPP__
 #define __OPENCV_CUDALEGACY_HPP__
 
+#include "opencv2/core/cuda.hpp"
 #include "opencv2/cudalegacy/NCV.hpp"
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVPyramid.hpp"
@@ -56,4 +57,16 @@
   @}
 */
 
+namespace cv { namespace cuda {
+//! Image pyramid: getLayer() writes the image at size outRoi (at most the source size) to outImg.
+class CV_EXPORTS ImagePyramid : public Algorithm
+{
+public:
+    virtual void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const = 0;
+};
+
+//! Builds a pyramid from img; nLayers <= 0 keeps halving until a dimension would reach zero.
+CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
+}}
+
 #endif /* __OPENCV_CUDALEGACY_HPP__ */
diff --git a/modules/cudalegacy/src/image_pyramid.cpp b/modules/cudalegacy/src/image_pyramid.cpp
new file mode 100644
index 0000000000..938ffea5d8
--- /dev/null
+++ b/modules/cudalegacy/src/image_pyramid.cpp
@@ -0,0 +1,147 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray, int, Stream&) { throw_no_cuda(); return Ptr<ImagePyramid>(); } // stub used when CUDA support is compiled out
+
+#else // HAVE_CUDA
+
+namespace
+{
+    class ImagePyramidImpl : public ImagePyramid // GPU-resident pyramid built once by the constructor
+    {
+    public:
+        ImagePyramidImpl(InputArray img, int nLayers, Stream& stream);
+
+        void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const;
+
+    private:
+        GpuMat layer0_;               // copy of the source image (full resolution)
+        std::vector<GpuMat> pyramid_; // half-size levels; pyramid_[i] is built from the level before it
+        int nLayers_;                 // levels actually built, counting layer0_
+    };
+
+    // Copies the source image into layer0_ and then builds up to numLayers - 1 coarser levels,
+    // each produced from the level above it by downsampleX2 at half the width and height.
+    ImagePyramidImpl::ImagePyramidImpl(InputArray _img, int numLayers, Stream& stream)
+    {
+        GpuMat img = _img.getGpuMat();
+        CV_Assert( img.depth() <= CV_32F && img.channels() <= 4 );
+
+        img.copyTo(layer0_, stream);
+        nLayers_ = 1;
+
+        // A non-positive request means "as deep as possible": the loop below stops by itself
+        // once halving would leave a level with a zero dimension.
+        if (numLayers <= 0)
+            numLayers = 255;
+
+        pyramid_.resize(numLayers);
+
+        Size prevSize = img.size();
+        for (int level = 0; level < numLayers - 1; ++level)
+        {
+            const Size curSize(prevSize.width / 2, prevSize.height / 2);
+            if (curSize.width == 0 || curSize.height == 0)
+                break;
+
+            GpuMat& dst = pyramid_[level];
+            ensureSizeIsEnough(curSize, img.type(), dst);
+            ++nLayers_;
+
+            const GpuMat& src = (level == 0) ? layer0_ : pyramid_[level - 1];
+            cv::cuda::device::pyramid::downsampleX2(src, dst, img.depth(), img.channels(), StreamAccessor::getStream(stream));
+            prevSize = curSize;
+        }
+    }
+
+    // Fills _outImg with the image at size outRoi (non-empty, no larger than layer0_). A stored
+    // level of exactly that size is copied as-is; otherwise the result is interpolated from the
+    // coarsest level that is still larger than outRoi in some dimension. Work runs on stream.
+    void ImagePyramidImpl::getLayer(OutputArray _outImg, Size outRoi, Stream& stream) const
+    {
+        CV_Assert( outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0 );
+
+        ensureSizeIsEnough(outRoi, layer0_.type(), _outImg);
+        GpuMat outImg = _outImg.getGpuMat();
+
+        if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
+        {
+            layer0_.copyTo(outImg, stream);
+            return;
+        }
+
+        GpuMat lastLayer = layer0_;
+
+        for (int i = 0; i < nLayers_ - 1; ++i)
+        {
+            const GpuMat& curLayer = pyramid_[i];
+
+            if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
+            {
+                // Exact hit: return here so the interpolation below cannot overwrite the copy.
+                curLayer.copyTo(outImg, stream);
+                return;
+            }
+
+            if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
+                break;
+
+            lastLayer = curLayer;
+        }
+
+        cv::cuda::device::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
+    }
+}
+
+Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray img, int nLayers, Stream& stream)
+{   // Public factory: returns the file-local ImagePyramidImpl behind the abstract interface.
+    return Ptr<ImagePyramid>(new ImagePyramidImpl(img, nLayers, stream));
+}
+
+#endif
diff --git a/modules/cudaobjdetect/CMakeLists.txt b/modules/cudaobjdetect/CMakeLists.txt
new file mode 100644
index 0000000000..351f6e87b4
--- /dev/null
+++ b/modules/cudaobjdetect/CMakeLists.txt
@@ -0,0 +1,9 @@
+if(IOS OR (NOT HAVE_CUDA AND NOT BUILD_CUDA_STUBS))
+  ocv_module_disable(cudaobjdetect)  # iOS, or neither CUDA nor the CUDA stubs build is enabled
+endif()
+
+set(the_description "CUDA-accelerated Object Detection")
+# Warnings to suppress, given in MSVC (/wd<id>) and GCC/Clang (-W<name>) spelling.
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
+# opencv_cudalegacy is listed after OPTIONAL; the other three modules are listed before it.
+ocv_define_module(cudaobjdetect opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
diff --git a/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp b/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
new file mode 100644
index 0000000000..ce916b25a2
--- /dev/null
+++ b/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
@@ -0,0 +1,288 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CUDAOBJDETECT_HPP__
+#define __OPENCV_CUDAOBJDETECT_HPP__
+
+#ifndef __cplusplus
+#  error cudaobjdetect.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cuda.hpp"
+
+/**
+  @addtogroup cuda
+  @{
+      @defgroup cudaobjdetect Object Detection
+  @}
+ */
+
+namespace cv { namespace cuda {
+
+//! @addtogroup cudaobjdetect
+//! @{
+
+//
+// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector
+//
+
+/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
+
+@note
+    -   An example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/cpp/peopledetect.cpp
+    -   A CUDA example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/gpu/hog.cpp
+    -   (Python) An example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/python2/peopledetect.py
+ */
+class CV_EXPORTS HOG : public Algorithm
+{
+public:
+    enum
+    {
+        DESCR_FORMAT_ROW_BY_ROW,
+        DESCR_FORMAT_COL_BY_COL
+    };
+
+    /** @brief Creates the HOG descriptor and detector.
+
+    @param win_size Detection window size. Align to block size and block stride.
+    @param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
+    @param block_stride Block stride. It must be a multiple of cell size.
+    @param cell_size Cell size. Only (8, 8) is supported for now.
+    @param nbins Number of bins. Only 9 bins per cell are supported for now.
+     */
+    static Ptr<HOG> create(Size win_size = Size(64, 128),
+                           Size block_size = Size(16, 16),
+                           Size block_stride = Size(8, 8),
+                           Size cell_size = Size(8, 8),
+                           int nbins = 9);
+
+    //! Gaussian smoothing window parameter.
+    virtual void setWinSigma(double win_sigma) = 0;
+    virtual double getWinSigma() const = 0;
+
+    //! L2-Hys normalization method shrinkage.
+    virtual void setL2HysThreshold(double threshold_L2hys) = 0;
+    virtual double getL2HysThreshold() const = 0;
+
+    //! Flag to specify whether the gamma correction preprocessing is required or not.
+    virtual void setGammaCorrection(bool gamma_correction) = 0;
+    virtual bool getGammaCorrection() const = 0;
+
+    //! Maximum number of detection window increases.
+    virtual void setNumLevels(int nlevels) = 0;
+    virtual int getNumLevels() const = 0;
+
+    //! Threshold for the distance between features and SVM classifying plane.
+    //! Usually it is 0 and should be specified in the detector coefficients (as the last free
+    //! coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
+    //! manually here.
+    virtual void setHitThreshold(double hit_threshold) = 0;
+    virtual double getHitThreshold() const = 0;
+
+    //! Window stride. It must be a multiple of block stride.
+    virtual void setWinStride(Size win_stride) = 0;
+    virtual Size getWinStride() const = 0;
+
+    //! Coefficient of the detection window increase.
+    virtual void setScaleFactor(double scale0) = 0;
+    virtual double getScaleFactor() const = 0;
+
+    //! Coefficient to regulate the similarity threshold. When detected, some
+    //! objects can be covered by many rectangles. 0 means not to perform grouping.
+    //! See groupRectangles.
+    virtual void setGroupThreshold(int group_threshold) = 0;
+    virtual int getGroupThreshold() const = 0;
+
+    //! Descriptor storage format:
+    //!   - **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
+    //!   - **DESCR_FORMAT_COL_BY_COL** - Column-major order.
+    virtual void setDescriptorFormat(int descr_format) = 0;
+    virtual int getDescriptorFormat() const = 0;
+
+    /** @brief Returns the number of coefficients required for the classification.
+     */
+    virtual size_t getDescriptorSize() const = 0;
+
+    /** @brief Returns the block histogram size.
+     */
+    virtual size_t getBlockHistogramSize() const = 0;
+
+    /** @brief Sets coefficients for the linear SVM classifier.
+     */
+    virtual void setSVMDetector(InputArray detector) = 0;
+
+    /** @brief Returns coefficients of the classifier trained for people detection.
+     */
+    virtual Mat getDefaultPeopleDetector() const = 0;
+
+    /** @brief Performs object detection without a multi-scale window.
+
+    @param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
+    @param found_locations Left-top corner points of detected objects boundaries.
+    @param confidences Optional output array for confidences.
+     */
+    virtual void detect(InputArray img,
+                        std::vector<Point>& found_locations,
+                        std::vector<double>* confidences = NULL) = 0;
+
+    /** @brief Performs object detection with a multi-scale window.
+
+    @param img Source image. See cuda::HOG::detect for type limitations.
+    @param found_locations Detected objects boundaries.
+    @param confidences Optional output array for confidences.
+     */
+    virtual void detectMultiScale(InputArray img,
+                                  std::vector<Rect>& found_locations,
+                                  std::vector<double>* confidences = NULL) = 0;
+
+    /** @brief Returns block descriptors computed for the whole image.
+
+    @param img Source image. See cuda::HOG::detect for type limitations.
+    @param descriptors 2D array of descriptors.
+    @param stream CUDA stream.
+     */
+    virtual void compute(InputArray img,
+                         OutputArray descriptors,
+                         Stream& stream = Stream::Null()) = 0;
+};
+
+//
+// CascadeClassifier
+//
+
+/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades.
+
+@note
+    -   A cascade classifier example can be found at
+        opencv_source_code/samples/gpu/cascadeclassifier.cpp
+    -   An NVIDIA API specific cascade classifier example can be found at
+        opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
+ */
+class CV_EXPORTS CascadeClassifier : public Algorithm
+{
+public:
+    /** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
+
+    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
+    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
+    type of OpenCV XML cascade supported for LBP.
+     */
+    static Ptr<CascadeClassifier> create(const String& filename);
+    /** @overload
+     */
+    static Ptr<CascadeClassifier> create(const FileStorage& file);
+
+    //! Maximum possible object size. Objects larger than that are ignored.
+    //! Supported only for LBP cascades.
+    virtual void setMaxObjectSize(Size maxObjectSize) = 0;
+    virtual Size getMaxObjectSize() const = 0;
+
+    //! Minimum possible object size. Objects smaller than that are ignored.
+    virtual void setMinObjectSize(Size minSize) = 0;
+    virtual Size getMinObjectSize() const = 0;
+
+    //! Parameter specifying how much the image size is reduced at each image scale.
+    virtual void setScaleFactor(double scaleFactor) = 0;
+    virtual double getScaleFactor() const = 0;
+
+    //! Parameter specifying how many neighbors each candidate rectangle should have
+    //! to retain it.
+    virtual void setMinNeighbors(int minNeighbors) = 0;
+    virtual int getMinNeighbors() const = 0;
+
+    virtual void setFindLargestObject(bool findLargestObject) = 0;
+    virtual bool getFindLargestObject() = 0;
+
+    virtual void setMaxNumObjects(int maxNumObjects) = 0;
+    virtual int getMaxNumObjects() const = 0;
+
+    virtual Size getClassifierSize() const = 0;
+
+    /** @brief Detects objects of different sizes in the input image.
+
+    @param image Matrix of type CV_8U containing an image where objects should be detected.
+    @param objects Buffer to store detected objects (rectangles).
+    @param stream CUDA stream.
+
+    To get final array of detected objects use CascadeClassifier::convert method.
+
+    @code
+        Ptr<cuda::CascadeClassifier> cascade_gpu = cuda::CascadeClassifier::create(...);
+
+        Mat image_cpu = imread(...);
+        GpuMat image_gpu(image_cpu);
+
+        GpuMat objbuf;
+        cascade_gpu->detectMultiScale(image_gpu, objbuf);
+
+        std::vector<Rect> faces;
+        cascade_gpu->convert(objbuf, faces);
+
+        for (size_t i = 0; i < faces.size(); ++i)
+           cv::rectangle(image_cpu, faces[i], Scalar(255));
+
+        imshow("Faces", image_cpu);
+    @endcode
+
+    @sa CascadeClassifier::detectMultiScale
+     */
+    virtual void detectMultiScale(InputArray image,
+                                  OutputArray objects,
+                                  Stream& stream = Stream::Null()) = 0;
+
+    /** @brief Converts objects array from internal representation to standard vector.
+
+    @param gpu_objects Objects array in internal representation.
+    @param objects Resulting array.
+     */
+    virtual void convert(OutputArray gpu_objects,
+                         std::vector<Rect>& objects) = 0;
+};
+
+//! @}
+
+}} // namespace cv { namespace cuda {
+
+#endif /* __OPENCV_CUDAOBJDETECT_HPP__ */
diff --git a/modules/cudaobjdetect/perf/perf_main.cpp b/modules/cudaobjdetect/perf/perf_main.cpp
new file mode 100644
index 0000000000..7a927be744
--- /dev/null
+++ b/modules/cudaobjdetect/perf/perf_main.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace perf;
+
+CV_PERF_TEST_CUDA_MAIN(cudaobjdetect)
diff --git a/modules/cuda/perf/perf_objdetect.cpp b/modules/cudaobjdetect/perf/perf_objdetect.cpp
similarity index 80%
rename from modules/cuda/perf/perf_objdetect.cpp
rename to modules/cudaobjdetect/perf/perf_objdetect.cpp
index c5d4649b84..8b3112498d 100644
--- a/modules/cuda/perf/perf_objdetect.cpp
+++ b/modules/cudaobjdetect/perf/perf_objdetect.cpp
@@ -71,10 +71,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
         const cv::cuda::GpuMat d_img(img);
         std::vector<cv::Rect> gpu_found_locations;
 
-        cv::cuda::HOGDescriptor d_hog;
-        d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+        d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
 
-        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
+        TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);
 
         SANITY_CHECK(gpu_found_locations);
     }
@@ -82,8 +82,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
     {
         std::vector<cv::Rect> cpu_found_locations;
 
+        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+
         cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog.setSVMDetector(d_hog->getDefaultPeopleDetector());
 
         TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
 
@@ -105,18 +107,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::CascadeClassifier_CUDA d_cascade;
-        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+        cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
+                cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));
 
         const cv::cuda::GpuMat d_img(img);
         cv::cuda::GpuMat objects_buffer;
-        int detections_num = 0;
 
-        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+        TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects;
+        d_cascade->convert(objects_buffer, gpu_rects);
 
-        std::vector<cv::Rect> gpu_rects(detections_num);
-        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
-        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
         cv::groupRectangles(gpu_rects, 3, 0.2);
         SANITY_CHECK(gpu_rects);
     }
@@ -144,18 +145,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::CascadeClassifier_CUDA d_cascade;
-        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+        cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
+                cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));
 
         const cv::cuda::GpuMat d_img(img);
         cv::cuda::GpuMat objects_buffer;
-        int detections_num = 0;
 
-        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+        TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects;
+        d_cascade->convert(objects_buffer, gpu_rects);
 
-        std::vector<cv::Rect> gpu_rects(detections_num);
-        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
-        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
         cv::groupRectangles(gpu_rects, 3, 0.2);
         SANITY_CHECK(gpu_rects);
     }
diff --git a/modules/cudaobjdetect/perf/perf_precomp.hpp b/modules/cudaobjdetect/perf/perf_precomp.hpp
new file mode 100644
index 0000000000..16ebf61f22
--- /dev/null
+++ b/modules/cudaobjdetect/perf/perf_precomp.hpp
@@ -0,0 +1,64 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/cuda_perf.hpp"
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/objdetect.hpp"
+
+#ifdef GTEST_CREATE_SHARED_LIBRARY
+#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
+#endif
+
+#endif
diff --git a/modules/cuda/src/cascadeclassifier.cpp b/modules/cudaobjdetect/src/cascadeclassifier.cpp
similarity index 60%
rename from modules/cuda/src/cascadeclassifier.cpp
rename to modules/cudaobjdetect/src/cascadeclassifier.cpp
index c4e9870151..c3830ad1f1 100644
--- a/modules/cuda/src/cascadeclassifier.cpp
+++ b/modules/cudaobjdetect/src/cascadeclassifier.cpp
@@ -48,160 +48,185 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()               { throw_no_cuda(); }
-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String&)  { throw_no_cuda(); }
-cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA()              { throw_no_cuda(); }
-bool cv::cuda::CascadeClassifier_CUDA::empty() const                    { throw_no_cuda(); return true; }
-bool cv::cuda::CascadeClassifier_CUDA::load(const String&)              { throw_no_cuda(); return true; }
-Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const        { throw_no_cuda(); return Size();}
-void cv::cuda::CascadeClassifier_CUDA::release()                        { throw_no_cuda(); }
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_no_cuda(); return -1;}
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }
 
 #else
 
-struct cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
+// CascadeClassifierBase: holds the detection parameters and implements the
+// virtual set*/get* accessors on top of cuda::CascadeClassifier; base class
+// of HaarCascade_Impl and LbpCascade_Impl in this file.
+
+namespace
 {
-public:
-    CascadeClassifierImpl(){}
-    virtual ~CascadeClassifierImpl(){}
+    class CascadeClassifierBase : public cuda::CascadeClassifier
+    {
+    public:
+        CascadeClassifierBase(); // installs the defaults listed next to the fields below
 
-    virtual unsigned int process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize, cv::Size maxObjectSize) = 0;
+        virtual void setMaxObjectSize(Size maxObjectSize) { maxObjectSize_ = maxObjectSize; }
+        virtual Size getMaxObjectSize() const { return maxObjectSize_; }
 
-    virtual cv::Size getClassifierCvSize() const = 0;
-    virtual bool read(const String& classifierAsXml) = 0;
-};
+        virtual void setMinObjectSize(Size minSize) { minObjectSize_ = minSize; }
+        virtual Size getMinObjectSize() const { return minObjectSize_; }
 
-#ifndef HAVE_OPENCV_CUDALEGACY
+        virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
+        virtual double getScaleFactor() const { return scaleFactor_; }
 
-struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
+        virtual void setMinNeighbors(int minNeighbors) { minNeighbors_ = minNeighbors; }
+        virtual int getMinNeighbors() const { return minNeighbors_; }
+
+        virtual void setFindLargestObject(bool findLargestObject) { findLargestObject_ = findLargestObject; }
+        virtual bool getFindLargestObject() { return findLargestObject_; } // NOTE(review): the only non-const getter — presumably mirrors the cuda::CascadeClassifier declaration; confirm
+
+        virtual void setMaxNumObjects(int maxNumObjects) { maxNumObjects_ = maxNumObjects; }
+        virtual int getMaxNumObjects() const { return maxNumObjects_; }
+
+    protected:
+        Size maxObjectSize_;     // default: Size() (empty)
+        Size minObjectSize_;     // default: Size() (empty)
+        double scaleFactor_;     // default: 1.2; both detectors assert it is > 1
+        int minNeighbors_;       // default: 4
+        bool findLargestObject_; // default: false
+        int maxNumObjects_;      // default: 100; column count of the detectors' result buffer
+    };
+
+    CascadeClassifierBase::CascadeClassifierBase() :
+        maxObjectSize_(),
+        minObjectSize_(),
+        scaleFactor_(1.2),
+        minNeighbors_(4),
+        findLargestObject_(false),
+        maxNumObjects_(100)
+    {
+    }
+}
+
+//
+// HaarCascade
+//
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+
+namespace
 {
-public:
-    HaarCascade()
+    class HaarCascade_Impl : public CascadeClassifierBase
     {
-        throw_no_cuda();
+    public:
+        explicit HaarCascade_Impl(const String& filename);
+
+        virtual Size getClassifierSize() const;
+
+        virtual void detectMultiScale(InputArray image,
+                                      OutputArray objects,
+                                      Stream& stream);
+
+        virtual void convert(OutputArray gpu_objects,
+                             std::vector<Rect>& objects);
+
+    private:
+        NCVStatus load(const String& classifierFile);
+        NCVStatus calculateMemReqsAndAllocate(const Size& frameSize);
+        NCVStatus process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections);
+
+        Size lastAllocatedFrameSize;
+
+        Ptr<NCVMemStackAllocator> gpuAllocator;
+        Ptr<NCVMemStackAllocator> cpuAllocator;
+
+        cudaDeviceProp devProp;
+        NCVStatus ncvStat;
+
+        Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
+        Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
+
+        Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
+        Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
+        Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
+
+        HaarClassifierCascadeDescriptor haar;
+
+        Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
+        Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
+        Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
+    };
+
+    static void NCVDebugOutputHandler(const String &msg) // registered through ncvSetDebugOutputHandler() in the HaarCascade_Impl constructor
+    {
+        CV_Error(Error::GpuApiCallError, msg.c_str()); // forwards the NCV message as an OpenCV GpuApiCallError
     }
 
-    unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
-    {
-        throw_no_cuda();
-        return 0;
-    }
-
-    cv::Size getClassifierCvSize() const
-    {
-        throw_no_cuda();
-        return cv::Size();
-    }
-
-    bool read(const String&)
-    {
-        throw_no_cuda();
-        return false;
-    }
-};
-
-#else
-
-struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
-{
-public:
-    HaarCascade() : lastAllocatedFrameSize(-1, -1)
+    HaarCascade_Impl::HaarCascade_Impl(const String& filename) : // registers the NCV debug handler, then loads the cascade from filename
+        lastAllocatedFrameSize(-1, -1) // presumably a "nothing allocated yet" sentinel for the frame-size comparison in calculateMemReqsAndAllocate() — confirm
     {
         ncvSetDebugOutputHandler(NCVDebugOutputHandler);
-    }
-
-    bool read(const String& filename)
-    {
         ncvSafeCall( load(filename) );
-        return true;
     }
 
-    NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize,
-                      /*out*/unsigned int& numDetections)
+    Size HaarCascade_Impl::getClassifierSize() const
     {
-        calculateMemReqsAndAllocate(src.size());
-
-        NCVMemPtr src_beg;
-        src_beg.ptr = (void*)src.ptr<Ncv8u>();
-        src_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment src_seg;
-        src_seg.begin = src_beg;
-        src_seg.size  = src.step * src.rows;
-
-        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
-        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        CV_Assert(objects.rows == 1);
-
-        NCVMemPtr objects_beg;
-        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
-        objects_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment objects_seg;
-        objects_seg.begin = objects_beg;
-        objects_seg.size = objects.step * objects.rows;
-        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
-        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        NcvSize32u roi;
-        roi.width = d_src.width();
-        roi.height = d_src.height();
-
-        NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
-
-        Ncv32u flags = 0;
-        flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
-        flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace  : 0;
-
-        ncvStat = ncvDetectObjectsMultiScale_device(
-            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures,
-            winMinSize,
-            minNeighbors,
-            scaleStep, 1,
-            flags,
-            *gpuAllocator, *cpuAllocator, devProp, 0);
-        ncvAssertReturnNcvStat(ncvStat);
-        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-        return NCV_SUCCESS;
+        return Size(haar.ClassifierSize.width, haar.ClassifierSize.height);
     }
 
-    unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
+    void HaarCascade_Impl::detectMultiScale(InputArray _image,     // input taken as a GpuMat; depth must be CV_8U
+                                            OutputArray _objects,  // receives the detections as one row of Rect; released when none are found
+                                            Stream& stream)        // must evaluate to false (see the assert below)
     {
-        CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
+        const GpuMat image = _image.getGpuMat();
 
-        const int defaultObjSearchNum = 100;
-        if (objectsBuf.empty())
+        CV_Assert( image.depth() == CV_8U);
+        CV_Assert( scaleFactor_ > 1 );
+        CV_Assert( !stream ); // presumably: only the default stream is supported, since process() runs and synchronizes on stream 0 — confirm
+
+        Size ncvMinSize = getClassifierSize();
+        if (ncvMinSize.width < minObjectSize_.width && ncvMinSize.height < minObjectSize_.height) // raise the minimum only when minObjectSize_ is larger in both dimensions
         {
-            objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
+            ncvMinSize.width = minObjectSize_.width;
+            ncvMinSize.height = minObjectSize_.height;
         }
 
-        cv::Size ncvMinSize = this->getClassifierCvSize();
-
-        if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
-        {
-            ncvMinSize.width = minSize.width;
-            ncvMinSize.height = minSize.height;
-        }
+        BufferPool pool(stream);
+        GpuMat objectsBuf = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type); // one row with maxNumObjects_ Rect-typed columns
 
         unsigned int numDetections;
-        ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
+        ncvSafeCall( process(image, objectsBuf, ncvMinSize, numDetections) );
 
-        return numDetections;
+        if (numDetections > 0)
+        {
+            objectsBuf.colRange(0, numDetections).copyTo(_objects); // hand out only the first numDetections columns
+        }
+        else
+        {
+            _objects.release();
+        }
     }
 
-    cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
+    void HaarCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects) // copies a 1 x N row of Rect (device or host) into objects
+    {
+        if (_gpu_objects.empty()) // nothing detected: return an empty vector
+        {
+            objects.clear();
+            return;
+        }
 
-private:
-    static void NCVDebugOutputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
+        Mat gpu_objects; // host-side view/copy of the input despite the name
+        if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_objects.getGpuMat().download(gpu_objects);
+        }
+        else
+        {
+            gpu_objects = _gpu_objects.getMat();
+        }
 
-    NCVStatus load(const String& classifierFile)
+        CV_Assert( gpu_objects.rows == 1 );
+        CV_Assert( gpu_objects.type() == DataType<Rect>::type );
+
+        Rect* ptr = gpu_objects.ptr<Rect>();
+        objects.assign(ptr, ptr + gpu_objects.cols); // one Rect per column
+    }
+
+    NCVStatus HaarCascade_Impl::load(const String& classifierFile)
     {
         int devId = cv::cuda::getDevice();
         ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
@@ -246,7 +271,7 @@ private:
         return NCV_SUCCESS;
     }
 
-    NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
+    NCVStatus HaarCascade_Impl::calculateMemReqsAndAllocate(const Size& frameSize)
     {
         if (lastAllocatedFrameSize == frameSize)
         {
@@ -289,88 +314,62 @@ private:
         return NCV_SUCCESS;
     }
 
-    cudaDeviceProp devProp;
-    NCVStatus ncvStat;
+    NCVStatus HaarCascade_Impl::process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections) // wraps src/objects for NCV and runs ncvDetectObjectsMultiScale_device with the stored parameters
+    {
+        calculateMemReqsAndAllocate(src.size()); // NOTE(review): the NCVStatus returned here is not checked
 
-    Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
-    Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
+        NCVMemPtr src_beg;
+        src_beg.ptr = (void*)src.ptr<Ncv8u>();
+        src_beg.memtype = NCVMemoryTypeDevice;
 
-    Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
+        NCVMemSegment src_seg;
+        src_seg.begin = src_beg;
+        src_seg.size  = src.step * src.rows;
 
-    HaarClassifierCascadeDescriptor haar;
+        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
+        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
 
-    Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
+        CV_Assert(objects.rows == 1); // the result buffer must be a single row
 
-    Size lastAllocatedFrameSize;
+        NCVMemPtr objects_beg;
+        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
+        objects_beg.memtype = NCVMemoryTypeDevice;
 
-    Ptr<NCVMemStackAllocator> gpuAllocator;
-    Ptr<NCVMemStackAllocator> cpuAllocator;
+        NCVMemSegment objects_seg;
+        objects_seg.begin = objects_beg;
+        objects_seg.size = objects.step * objects.rows;
+        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
+        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
 
-    virtual ~HaarCascade(){}
-};
+        NcvSize32u roi; // region of interest = the whole source matrix
+        roi.width = d_src.width();
+        roi.height = d_src.height();
+
+        NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
+
+        Ncv32u flags = 0;
+        flags |= findLargestObject_ ? NCVPipeObjDet_FindLargestObject : 0;
+
+        ncvStat = ncvDetectObjectsMultiScale_device(
+            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
+            *d_haarStages, *d_haarNodes, *d_haarFeatures,
+            winMinSize,
+            minNeighbors_,
+            scaleFactor_, 1,
+            flags,
+            *gpuAllocator, *cpuAllocator, devProp, 0);
+        ncvAssertReturnNcvStat(ncvStat);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); // wait on stream 0 before returning
+
+        return NCV_SUCCESS;
+    }
+}
 
 #endif
 
-cv::Size operator -(const cv::Size& a, const cv::Size& b)
-{
-    return cv::Size(a.width - b.width, a.height - b.height);
-}
-
-cv::Size operator +(const cv::Size& a, const int& i)
-{
-    return cv::Size(a.width + i, a.height + i);
-}
-
-cv::Size operator *(const cv::Size& a, const float& f)
-{
-    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
-}
-
-cv::Size operator /(const cv::Size& a, const float& f)
-{
-    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
-}
-
-bool operator <=(const cv::Size& a, const cv::Size& b)
-{
-    return a.width <= b.width && a.height <= b.width;
-}
-
-struct PyrLavel
-{
-    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
-    {
-        do
-        {
-            order = _order;
-            scale = pow(_scale, order);
-            sFrame = frame / scale;
-            workArea = sFrame - window + 1;
-            sWindow = window * scale;
-            _order++;
-        } while (sWindow <= minObjectSize);
-    }
-
-    bool isFeasible(cv::Size maxObj)
-    {
-        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
-    }
-
-    PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize)
-    {
-        return PyrLavel(order + 1, factor, frame, window, minObjectSize);
-    }
-
-    int order;
-    float scale;
-    cv::Size sFrame;
-    cv::Size workArea;
-    cv::Size sWindow;
-};
+//
+// LbpCascade
+//
 
 namespace cv { namespace cuda { namespace device
 {
@@ -394,42 +393,154 @@ namespace cv { namespace cuda { namespace device
                              unsigned int* classified,
                              PtrStepSzi integral);
 
-        void connectedConmonents(PtrStepSz<int4>  candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void connectedConmonents(PtrStepSz<int4> candidates,
+                                 int ncandidates,
+                                 PtrStepSz<int4> objects,
+                                 int groupThreshold,
+                                 float grouping_eps,
+                                 unsigned int* nclasses);
     }
 }}}
 
-struct cv::cuda::CascadeClassifier_CUDA::LbpCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
+namespace
 {
-public:
-    struct Stage
+    cv::Size operator -(const cv::Size& a, const cv::Size& b)
     {
-        int    first;
-        int    ntrees;
-        float  threshold;
+        return cv::Size(a.width - b.width, a.height - b.height);
+    }
+
+    cv::Size operator +(const cv::Size& a, const int& i)
+    {
+        return cv::Size(a.width + i, a.height + i);
+    }
+
+    cv::Size operator *(const cv::Size& a, const float& f)
+    {
+        return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
+    }
+
+    cv::Size operator /(const cv::Size& a, const float& f)
+    {
+        return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
+    }
+
+    bool operator <=(const cv::Size& a, const cv::Size& b) // true when a fits inside b in both width and height
+    {
+        return a.width <= b.width && a.height <= b.height; // height is compared against height, not against b.width
+    }
+
+    struct PyrLavel // one level of the image pyramid walked by LbpCascade_Impl::detectMultiScale
+    {
+        PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize) // starts at _order and keeps advancing while the scaled window is still <= minObjectSize
+        {
+            do
+            {
+                order = _order;
+                scale = pow(_scale, order);      // _scale raised to the level index
+                sFrame = frame / scale;          // frame divided by scale (rounded by the file-local Size operator)
+                workArea = sFrame - window + 1;  // per-axis difference between scaled frame and window, plus one
+                sWindow = window * scale;        // window multiplied by scale
+                _order++;
+            } while (sWindow <= minObjectSize);
+        }
+
+        bool isFeasible(cv::Size maxObj) // level is usable while its work area is positive and the scaled window is <= maxObj
+        {
+            return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
+        }
+
+        PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize) // level built from order + 1 (the constructor may advance further)
+        {
+            return PyrLavel(order + 1, factor, frame, window, minObjectSize);
+        }
+
+        int order;         // level index finally selected by the constructor
+        float scale;       // scale of this level
+        cv::Size sFrame;   // scaled frame size
+        cv::Size workArea; // scaled frame minus window, plus one
+        cv::Size sWindow;  // scaled window size
     };
 
-    LbpCascade(){}
-    virtual ~LbpCascade(){}
-
-    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
-        bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
+    class LbpCascade_Impl : public CascadeClassifierBase
     {
-        CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
+    public:
+        explicit LbpCascade_Impl(const FileStorage& file);
+
+        virtual Size getClassifierSize() const { return NxM; }
+
+        virtual void detectMultiScale(InputArray image,
+                                      OutputArray objects,
+                                      Stream& stream);
+
+        virtual void convert(OutputArray gpu_objects,
+                             std::vector<Rect>& objects);
+
+    private:
+        bool load(const FileNode &root);
+        void allocateBuffers(cv::Size frame);
+
+    private:
+        struct Stage
+        {
+            int    first;
+            int    ntrees;
+            float  threshold;
+        };
+
+        enum stage { BOOST = 0 };
+        enum feature { LBP = 1, HAAR = 2 };
+
+        static const stage stageType = BOOST;
+        static const feature featureType = LBP;
+
+        cv::Size NxM;
+        bool isStumps;
+        int ncategories;
+        int subsetSize;
+        int nodeStep;
+
+        // gpu representation of classifier
+        GpuMat stage_mat;
+        GpuMat trees_mat;
+        GpuMat nodes_mat;
+        GpuMat leaves_mat;
+        GpuMat subsets_mat;
+        GpuMat features_mat;
+
+        GpuMat integral;
+        GpuMat integralBuffer;
+        GpuMat resuzeBuffer;
+
+        GpuMat candidates;
+        static const int integralFactor = 4;
+    };
+
+    LbpCascade_Impl::LbpCascade_Impl(const FileStorage& file) // reads the cascade from the first top-level node of file
+    {
+        load(file.getFirstTopLevelNode()); // NOTE(review): the bool returned by load() is discarded — confirm a failed parse is meant to pass silently
+    }
+
+    void LbpCascade_Impl::detectMultiScale(InputArray _image,
+                                           OutputArray _objects,
+                                           Stream& stream)
+    {
+        const GpuMat image = _image.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U);
+        CV_Assert( scaleFactor_ > 1 );
+        CV_Assert( !stream );
 
-        // const int defaultObjSearchNum = 100;
         const float grouping_eps = 0.2f;
 
-        if( !objects.empty() && objects.depth() == CV_32S)
-            objects.reshape(4, 1);
-        else
-            objects.create(1 , image.cols >> 4, CV_32SC4);
+        BufferPool pool(stream);
+        GpuMat objects = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type);
 
         // used for debug
         // candidates.setTo(cv::Scalar::all(0));
         // objects.setTo(cv::Scalar::all(0));
 
-        if (maxObjectSize == cv::Size())
-            maxObjectSize = image.size();
+        // An unset (empty) limit means "up to the whole image" for this call only; the stored setting is left untouched.
+        const cv::Size maxSize = (maxObjectSize_ == cv::Size()) ? image.size() : maxObjectSize_;
 
         allocateBuffers(image.size());
 
@@ -437,9 +548,9 @@ public:
         GpuMat dclassified(1, 1, CV_32S);
         cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
 
-        PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
+        PyrLavel level(0, scaleFactor_, image.size(), NxM, minObjectSize_);
 
-        while (level.isFeasible(maxObjectSize))
+        while (level.isFeasible(maxSize))
         {
             int acc = level.sFrame.width + 1;
             float iniScale = level.scale;
@@ -449,23 +560,22 @@ public:
 
             int total = 0, prev  = 0;
 
-            while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
+            while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxSize))
             {
                 // create sutable matrix headers
                 GpuMat src  = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
                 GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
-                GpuMat buff = integralBuffer;
 
                 // generate integral for scale
                 cuda::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-                cuda::integral(src, sint, buff);
+                cuda::integral(src, sint);
 
                 // calculate job
                 int totalWidth = level.workArea.width / step;
                 total += totalWidth * (level.workArea.height / step);
 
                 // go to next pyramide level
-                level = level.next(scaleFactor, image.size(), NxM, minObjectSize);
+                level = level.next(scaleFactor_, image.size(), NxM, minObjectSize_);
                 area = level.workArea;
 
                 step = (1 + (level.scale <= 2.f));
@@ -473,60 +583,55 @@ public:
                 acc += level.sFrame.width + 1;
             }
 
-            device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor_, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
                 leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
         }
 
-        if (groupThreshold <= 0  || objects.empty())
-            return 0;
+        if (minNeighbors_ <= 0  || objects.empty())
+            return;
 
         cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
-        device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
+        device::lbp::connectedConmonents(candidates, classified, objects, minNeighbors_, grouping_eps, dclassified.ptr<unsigned int>());
 
         cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
         cudaSafeCall( cudaDeviceSynchronize() );
-        return classified;
-    }
 
-    virtual cv::Size getClassifierCvSize() const { return NxM; }
-
-    bool read(const String& classifierAsXml)
-    {
-        FileStorage fs(classifierAsXml, FileStorage::READ);
-        return fs.isOpened() ? read(fs.getFirstTopLevelNode()) : false;
-    }
-
-private:
-
-    void allocateBuffers(cv::Size frame)
-    {
-        if (frame == cv::Size())
-            return;
-
-        if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
+        if (classified > 0)
         {
-            resuzeBuffer.create(frame, CV_8UC1);
-
-            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
-
-#ifdef HAVE_OPENCV_CUDALEGACY
-            NcvSize32u roiSize;
-            roiSize.width = frame.width;
-            roiSize.height = frame.height;
-
-            cudaDeviceProp prop;
-            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
-
-            Ncv32u bufSize;
-            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-            integralBuffer.create(1, bufSize, CV_8UC1);
-#endif
-
-            candidates.create(1 , frame.width >> 1, CV_32SC4);
+            objects.colRange(0, classified).copyTo(_objects);
+        }
+        else
+        {
+            _objects.release();
         }
     }
 
-    bool read(const FileNode &root)
+    void LbpCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects) // copies a 1 x N row of Rect (device or host) into objects
+    {
+        if (_gpu_objects.empty()) // nothing detected: return an empty vector
+        {
+            objects.clear();
+            return;
+        }
+
+        Mat gpu_objects; // host-side view/copy of the input despite the name
+        if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_objects.getGpuMat().download(gpu_objects);
+        }
+        else
+        {
+            gpu_objects = _gpu_objects.getMat();
+        }
+
+        CV_Assert( gpu_objects.rows == 1 );
+        CV_Assert( gpu_objects.type() == DataType<Rect>::type );
+
+        Rect* ptr = gpu_objects.ptr<Rect>();
+        objects.assign(ptr, ptr + gpu_objects.cols); // one Rect per column
+    }
+
+    bool LbpCascade_Impl::load(const FileNode &root)
     {
         const char *CUDA_CC_STAGE_TYPE       = "stageType";
         const char *CUDA_CC_FEATURE_TYPE     = "featureType";
@@ -667,92 +772,90 @@ private:
         return true;
     }
 
-    enum stage { BOOST = 0 };
-    enum feature { LBP = 1, HAAR = 2 };
-    static const stage stageType = BOOST;
-    static const feature featureType = LBP;
+    void LbpCascade_Impl::allocateBuffers(cv::Size frame) // (re)creates the work buffers when frame does not fit the current resize buffer
+    {
+        if (frame == cv::Size()) // empty frame: nothing to allocate
+            return;
 
-    cv::Size NxM;
-    bool isStumps;
-    int ncategories;
-    int subsetSize;
-    int nodeStep;
+        if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
+        {
+            resuzeBuffer.create(frame, CV_8UC1);
 
-    // gpu representation of classifier
-    GpuMat stage_mat;
-    GpuMat trees_mat;
-    GpuMat nodes_mat;
-    GpuMat leaves_mat;
-    GpuMat subsets_mat;
-    GpuMat features_mat;
+            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1); // integralFactor (4) times the width of a single (width + 1) integral image
 
-    GpuMat integral;
-    GpuMat integralBuffer;
-    GpuMat resuzeBuffer;
+        #ifdef HAVE_OPENCV_CUDALEGACY
+            NcvSize32u roiSize;
+            roiSize.width = frame.width;
+            roiSize.height = frame.height;
 
-    GpuMat candidates;
-    static const int integralFactor = 4;
-};
+            cudaDeviceProp prop;
+            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
 
-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()
-: findLargestObject(false), visualizeInPlace(false), impl(0) {}
+            Ncv32u bufSize;
+            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+            integralBuffer.create(1, bufSize, CV_8UC1); // NOTE(review): the visible detectMultiScale no longer passes integralBuffer to cuda::integral — check whether this allocation is still needed
+        #endif
 
-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String& filename)
-: findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
+            candidates.create(1 , frame.width >> 1, CV_32SC4); // one row of frame.width / 2 four-int entries
+        }
+    }
 
-cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA() { release(); }
-
-void cv::cuda::CascadeClassifier_CUDA::release() { if (impl) { delete impl; impl = 0; } }
-
-bool cv::cuda::CascadeClassifier_CUDA::empty() const { return impl == 0; }
-
-Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const
-{
-    return this->empty() ? Size() : impl->getClassifierCvSize();
 }
 
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
-{
-    CV_Assert( !this->empty());
-    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
-}
+//
+// create
+//
 
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String& filename) // picks the implementation: ".nvbin" or a file FileStorage cannot open -> Haar; featureType "LBP" -> LBP; anything else -> Haar
 {
-    CV_Assert( !this->empty());
-    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
-}
-
-bool cv::cuda::CascadeClassifier_CUDA::load(const String& filename)
-{
-    release();
-
     String fext = filename.substr(filename.find_last_of(".") + 1);
     fext = fext.toLowerCase();
 
     if (fext == "nvbin")
     {
-        impl = new HaarCascade();
-        return impl->read(filename);
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename);
+    #endif
     }
 
     FileStorage fs(filename, FileStorage::READ);
 
     if (!fs.isOpened())
     {
-        impl = new HaarCascade();
-        return impl->read(filename);
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename); // the Haar loader gets the file name and opens the file itself
+    #endif
     }
 
     const char *CUDA_CC_LBP = "LBP";
     String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
     if (featureTypeStr == CUDA_CC_LBP)
-        impl = new LbpCascade();
+    {
+        return makePtr<LbpCascade_Impl>(fs);
+    }
     else
-        impl = new HaarCascade();
+    {
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename);
+    #endif
+    }
 
-    impl->read(filename);
-    return !this->empty();
+    CV_Error(Error::StsUnsupportedFormat, "Unsupported format for CUDA CascadeClassifier"); // NOTE(review): not reachable — both branches of the if/else above already return
+    return Ptr<cuda::CascadeClassifier>();
+}
+
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage& file) // always builds the LBP implementation; featureType is not inspected on this path
+{
+    return makePtr<LbpCascade_Impl>(file);
+}
 
 #endif
diff --git a/modules/cuda/src/cuda/hog.cu b/modules/cudaobjdetect/src/cuda/hog.cu
similarity index 100%
rename from modules/cuda/src/cuda/hog.cu
rename to modules/cudaobjdetect/src/cuda/hog.cu
diff --git a/modules/cuda/src/cuda/lbp.cu b/modules/cudaobjdetect/src/cuda/lbp.cu
similarity index 100%
rename from modules/cuda/src/cuda/lbp.cu
rename to modules/cudaobjdetect/src/cuda/lbp.cu
diff --git a/modules/cuda/src/cuda/lbp.hpp b/modules/cudaobjdetect/src/cuda/lbp.hpp
similarity index 100%
rename from modules/cuda/src/cuda/lbp.hpp
rename to modules/cudaobjdetect/src/cuda/lbp.hpp
diff --git a/modules/cudaobjdetect/src/hog.cpp b/modules/cudaobjdetect/src/hog.cpp
new file mode 100644
index 0000000000..1d465ff25c
--- /dev/null
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -0,0 +1,1697 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
+
+Ptr<cuda::HOG> cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cuda(); return Ptr<cuda::HOG>(); } // stub compiled when HAVE_CUDA is missing or CUDA_DISABLER is set
+
+#else
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace hog // forward declarations only; definitions are expected in the module's CUDA sources (e.g. cuda/hog.cu)
+    {
+        void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
+                              int nblocks_win_x, int nblocks_win_y);
+        // compute_hists / normalize_hists: invoked back-to-back by HOG_Impl::computeBlockHistograms().
+        void compute_hists(int nbins, int block_stride_x, int block_stride_y,
+                           int height, int width, const cv::cuda::PtrStepSzf& grad,
+                           const cv::cuda::PtrStepSzb& qangle, float sigma, float* block_hists);
+
+        void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
+                             int height, int width, float* block_hists, float threshold);
+        // classify_hists: used by HOG_Impl::detect() when no confidences are requested.
+        void classify_hists(int win_height, int win_width, int block_stride_y,
+                            int block_stride_x, int win_stride_y, int win_stride_x, int height,
+                            int width, float* block_hists, float* coefs, float free_coef,
+                            float threshold, unsigned char* labels);
+        // compute_confidence_hists: used by HOG_Impl::detect() when confidences are requested.
+        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                           int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                           float* coefs, float free_coef, float threshold, float *confidences);
+        // extract_descrs_by_rows / _cols: used by HOG_Impl::compute(), selected by the descriptor format.
+        void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    cv::cuda::PtrStepSzf descriptors);
+        void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
+                                    cv::cuda::PtrStepSzf descriptors);
+        // compute_gradients_*: used by HOG_Impl::computeGradient(), one variant per supported image type.
+        void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
+        void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
+                                    float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
+        // resize_*: used by HOG_Impl::detectMultiScale() to build the scaled images.
+        void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
+        void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
+    }
+}}}
+
+using namespace cv::cuda::device;
+
+namespace
+{
+    class HOG_Impl : public cv::cuda::HOG // concrete implementation returned by cv::cuda::HOG::create()
+    {
+    public:
+        HOG_Impl(Size win_size,
+                 Size block_size,
+                 Size block_stride,
+                 Size cell_size,
+                 int nbins);
+
+        virtual void setWinSigma(double win_sigma) { win_sigma_ = win_sigma; }
+        virtual double getWinSigma() const; // returns a block-size-derived value while the stored sigma is negative
+
+        virtual void setL2HysThreshold(double threshold_L2hys) { threshold_L2hys_ = threshold_L2hys; }
+        virtual double getL2HysThreshold() const { return threshold_L2hys_; }
+
+        virtual void setGammaCorrection(bool gamma_correction) { gamma_correction_ = gamma_correction; }
+        virtual bool getGammaCorrection() const { return gamma_correction_; }
+
+        virtual void setNumLevels(int nlevels) { nlevels_ = nlevels; } // upper bound on scales tried by detectMultiScale()
+        virtual int getNumLevels() const { return nlevels_; }
+
+        virtual void setHitThreshold(double hit_threshold) { hit_threshold_ = hit_threshold; }
+        virtual double getHitThreshold() const { return hit_threshold_; }
+
+        virtual void setWinStride(Size win_stride) { win_stride_ = win_stride; } // detect()/compute() assert it is a multiple of the block stride
+        virtual Size getWinStride() const { return win_stride_; }
+
+        virtual void setScaleFactor(double scale0) { scale0_ = scale0; } // per-level multiplier in detectMultiScale(); <= 1 stops after the first level
+        virtual double getScaleFactor() const { return scale0_; }
+
+        virtual void setGroupThreshold(int group_threshold) { group_threshold_ = group_threshold; } // > 0 enables groupRectangles() in detectMultiScale()
+        virtual int getGroupThreshold() const { return group_threshold_; }
+
+        virtual void setDescriptorFormat(int descr_format) { descr_format_ = descr_format; } // only read by compute()
+        virtual int getDescriptorFormat() const { return descr_format_; }
+
+        virtual size_t getDescriptorSize() const;
+
+        virtual size_t getBlockHistogramSize() const;
+
+        virtual void setSVMDetector(InputArray detector);
+
+        virtual Mat getDefaultPeopleDetector() const;
+
+        virtual void detect(InputArray img,
+                            std::vector<Point>& found_locations,
+                            std::vector<double>* confidences);
+
+        virtual void detectMultiScale(InputArray img,
+                                      std::vector<Rect>& found_locations,
+                                      std::vector<double>* confidences);
+
+        virtual void compute(InputArray img,
+                             OutputArray descriptors,
+                             Stream& stream);
+
+    private:
+        Size win_size_;      // the five geometry members are fixed at construction; no setters exist for them
+        Size block_size_;
+        Size block_stride_;
+        Size cell_size_;
+        int nbins_;
+
+        double win_sigma_;
+        double threshold_L2hys_;
+        bool gamma_correction_;
+        int nlevels_;
+        double hit_threshold_;
+        Size win_stride_;
+        double scale0_;
+        int group_threshold_;
+        int descr_format_;
+
+    private:
+        int getTotalHistSize(Size img_size) const;
+        void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
+        void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
+
+        // Coefficients of the separating plane
+        float free_coef_;    // not set by the constructor; assigned in setSVMDetector() (extra detector element, or 0)
+        GpuMat detector_;    // re-ordered coefficients uploaded by setSVMDetector(); empty until then
+    };
+
+    HOG_Impl::HOG_Impl(Size win_size,
+                       Size block_size,
+                       Size block_stride,
+                       Size cell_size,
+                       int nbins) :
+        win_size_(win_size),
+        block_size_(block_size),
+        block_stride_(block_stride),
+        cell_size_(cell_size),
+        nbins_(nbins),
+        // Defaults for the tunable parameters; each one has a setter in the class declaration.
+        win_sigma_(-1.0),                      // negative: getWinSigma() derives the value from block_size_
+        threshold_L2hys_(0.2),
+        gamma_correction_(true),
+        nlevels_(64),
+        hit_threshold_(0.0),
+        win_stride_(block_stride),             // window stride starts equal to the block stride
+        scale0_(1.05),
+        group_threshold_(2),
+        descr_format_(DESCR_FORMAT_COL_BY_COL)
+    {
+        CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 && // window minus one block must be a whole number of block strides
+                  (win_size.height - block_size.height) % block_stride.height == 0);
+
+        CV_Assert(block_size.width % cell_size.width == 0 && // blocks must be made of whole cells
+                  block_size.height % cell_size.height == 0);
+
+        CV_Assert(block_stride == cell_size); // blocks advance exactly one cell at a time
+
+        CV_Assert(cell_size == Size(8, 8)); // only 8x8 cells are accepted
+
+        Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height);
+        CV_Assert(cells_per_block == Size(2, 2)); // with 8x8 cells this means block_size must be 16x16
+    }
+
+    static int numPartsWithin(int size, int part_size, int stride) // count of part_size-wide positions inside size when stepping by stride
+    {
+        return (size - part_size + stride) / stride; // integer division; no guard here for size < part_size or stride == 0
+    }
+
+    static Size numPartsWithin(Size size, Size part_size, Size stride) // applies the scalar overload to width and height independently
+    {
+        return Size(numPartsWithin(size.width, part_size.width, stride.width),
+                    numPartsWithin(size.height, part_size.height, stride.height));
+    }
+
+    size_t HOG_Impl::getDescriptorSize() const // values in one window descriptor
+    {
+        return numPartsWithin(win_size_, block_size_, block_stride_).area() * getBlockHistogramSize(); // blocks per window times values per block
+    }
+
+    size_t HOG_Impl::getBlockHistogramSize() const // values in one block histogram
+    {
+        Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height);
+        return nbins_ * cells_per_block.area(); // nbins_ entries for every cell of the block
+    }
+
+    double HOG_Impl::getWinSigma() const // effective sigma handed to hog::compute_hists()
+    {
+        return win_sigma_ >= 0 ? win_sigma_ : (block_size_.width + block_size_.height) / 8.0; // negative stored value selects the block-size-based default
+    }
+
+    void HOG_Impl::setSVMDetector(InputArray _detector) // validates the detector, re-orders its blocks and uploads it into detector_
+    {
+        const int descriptor_size = static_cast<int>(getDescriptorSize());
+
+        const Mat detector = _detector.getMat();
+
+        CV_Assert( detector.type() == CV_32FC1 );
+        CV_Assert( detector.rows == 1 );
+        CV_Assert( detector.cols == descriptor_size || detector.cols == descriptor_size + 1 ); // one optional extra element after the descriptor
+
+        std::vector<float> detector_reordered(detector.ptr<float>(), detector.ptr<float>() + detector.cols); // starts as a plain copy, so any extra element is carried over
+
+        size_t block_hist_size = getBlockHistogramSize();
+        Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
+
+        for (int i = 0; i < blocks_per_win.height; ++i)
+        {
+            for (int j = 0; j < blocks_per_win.width; ++j)
+            {
+                const float* src = detector.ptr<float>() + (j * blocks_per_win.height + i) * block_hist_size; // input blocks are indexed column by column
+                float* dst = &detector_reordered[0] + (i * blocks_per_win.width + j) * block_hist_size;       // stored blocks are indexed row by row
+                for (size_t k = 0; k < block_hist_size; ++k)
+                    dst[k] = src[k];
+            }
+        }
+
+        detector_.upload(Mat(detector_reordered).reshape(1, 1));
+        free_coef_ = detector.cols > descriptor_size ? detector.at<float>(0, descriptor_size) : 0; // the extra element if supplied, otherwise 0
+    }
+
+    static Mat getPeopleDetector64x128();
+    static Mat getPeopleDetector48x96();
+
+    Mat HOG_Impl::getDefaultPeopleDetector() const // picks the built-in coefficient table matching win_size_
+    {
+        CV_Assert( win_size_ == Size(64, 128) || win_size_ == Size(48, 96) ); // built-in tables exist only for these two window sizes
+
+        if (win_size_ == Size(64, 128))
+            return getPeopleDetector64x128();
+        else
+            return getPeopleDetector48x96();
+    }
+
+    void HOG_Impl::detect(InputArray _img, std::vector<Point>& hits, std::vector<double>* confidences) // single-scale detection; confidences may be NULL
+    {
+        const GpuMat img = _img.getGpuMat();
+
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
+        CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
+
+        hits.clear();
+        if (confidences) confidences->clear(); // reset together with hits so both outputs agree on every return path
+        if (detector_.empty())
+            return; // no detector set: report no hits
+
+        BufferPool pool(Stream::Null());
+
+        GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
+        computeBlockHistograms(img, block_hists);
+
+        Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
+
+        if (confidences == NULL)
+        {
+            GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_8UC1); // one byte per window position
+
+            hog::classify_hists(win_size_.height, win_size_.width,
+                                block_stride_.height, block_stride_.width,
+                                win_stride_.height, win_stride_.width,
+                                img.rows, img.cols,
+                                block_hists.ptr<float>(),
+                                detector_.ptr<float>(),
+                                (float)free_coef_,
+                                (float)hit_threshold_,
+                                labels.ptr());
+
+            Mat labels_host;
+            labels.download(labels_host);
+            unsigned char* vec = labels_host.ptr();
+
+            for (int i = 0; i < wins_per_img.area(); i++)
+            {
+                int y = i / wins_per_img.width;     // window index -> (x, y) in the window grid
+                int x = i - wins_per_img.width * y;
+                if (vec[i])
+                    hits.push_back(Point(x * win_stride_.width, y * win_stride_.height)); // grid position -> pixel position of the window's corner
+            }
+        }
+        else
+        {
+            GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_32FC1); // one float score per window position
+
+            hog::compute_confidence_hists(win_size_.height, win_size_.width,
+                                          block_stride_.height, block_stride_.width,
+                                          win_stride_.height, win_stride_.width,
+                                          img.rows, img.cols,
+                                          block_hists.ptr<float>(),
+                                          detector_.ptr<float>(),
+                                          (float)free_coef_,
+                                          (float)hit_threshold_,
+                                          labels.ptr<float>());
+
+            Mat labels_host;
+            labels.download(labels_host);
+            float* vec = labels_host.ptr<float>();
+
+            for (int i = 0; i < wins_per_img.area(); i++)
+            {
+                int y = i / wins_per_img.width;
+                int x = i - wins_per_img.width * y;
+
+                if (vec[i] >= hit_threshold_) // thresholding happens on the host in this branch
+                {
+                    hits.push_back(Point(x * win_stride_.width, y * win_stride_.height));
+                    confidences->push_back((double)vec[i]); // kept index-aligned with hits
+                }
+            }
+        }
+    }
+
+    void HOG_Impl::detectMultiScale(InputArray _img, // runs detect() over a pyramid of scaled images
+                                    std::vector<Rect>& found_locations,
+                                    std::vector<double>* confidences)
+    {
+        const GpuMat img = _img.getGpuMat();
+
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
+        CV_Assert( confidences == NULL || group_threshold_ == 0 ); // confidences cannot be requested together with rectangle grouping
+
+        std::vector<double> level_scale;
+        double scale = 1.0;
+        int levels = 0;
+        for (levels = 0; levels < nlevels_; levels++)
+        {
+            level_scale.push_back(scale);
+
+            if (cvRound(img.cols / scale) < win_size_.width ||
+                cvRound(img.rows / scale) < win_size_.height ||
+                scale0_ <= 1)
+            {
+                break; // the scale just pushed is dropped by the resize below unless it is the first one
+            }
+
+            scale *= scale0_;
+        }
+        levels = std::max(levels, 1); // always keep at least the unscaled level
+        level_scale.resize(levels);
+
+        std::vector<Point> level_hits;
+        std::vector<double> level_confidences;
+        BufferPool pool(Stream::Null());
+
+        found_locations.clear();
+        if (confidences) confidences->clear(); // reset too, so a reused vector stays index-aligned with found_locations
+        for (size_t i = 0; i < level_scale.size(); i++)
+        {
+            scale = level_scale[i];
+
+            Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
+
+            GpuMat smaller_img;
+            if (sz == img.size())
+            {
+                smaller_img = img; // no resize needed at this level
+            }
+            else
+            {
+                smaller_img = pool.getBuffer(sz, img.type());
+                switch (img.type()) // other types were rejected by the CV_Assert above
+                {
+                    case CV_8UC1: hog::resize_8UC1(img, smaller_img); break;
+                    case CV_8UC4: hog::resize_8UC4(img, smaller_img); break;
+                }
+            }
+
+            detect(smaller_img, level_hits,
+                   confidences ? &level_confidences : NULL);
+
+            Size scaled_win_size(cvRound(win_size_.width * scale),
+                                 cvRound(win_size_.height * scale));
+
+            for (size_t j = 0; j < level_hits.size(); j++)
+            {
+                found_locations.push_back(Rect(Point2d(level_hits[j]) * scale, scaled_win_size)); // map the hit back to input-image coordinates
+                if (confidences)
+                    confidences->push_back(level_confidences[j]);
+            }
+        }
+
+        if (group_threshold_ > 0)
+        {
+            groupRectangles(found_locations, group_threshold_, 0.2/*magic number copied from CPU version*/);
+        }
+    }
+
+    void HOG_Impl::compute(InputArray _img, // fills _descriptors with one row per window position of the image
+                           OutputArray _descriptors,
+                           Stream& stream)
+    {
+        const GpuMat img = _img.getGpuMat();
+
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
+        CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
+        CV_Assert( !stream ); // NOTE(review): looks like only the default (null) stream is accepted - confirm against Stream's bool conversion
+
+        BufferPool pool(stream);
+
+        GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
+        computeBlockHistograms(img, block_hists);
+
+        const size_t block_hist_size = getBlockHistogramSize();
+        Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
+        Size wins_per_img   = numPartsWithin(img.size(), win_size_, win_stride_);
+
+        _descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1); // rows: windows, cols: descriptor length
+        GpuMat descriptors = _descriptors.getGpuMat();
+
+        switch (descr_format_) // selects the block ordering inside each descriptor row
+        {
+        case DESCR_FORMAT_ROW_BY_ROW:
+            hog::extract_descrs_by_rows(win_size_.height, win_size_.width,
+                                        block_stride_.height, block_stride_.width,
+                                        win_stride_.height, win_stride_.width,
+                                        img.rows, img.cols,
+                                        block_hists.ptr<float>(),
+                                        descriptors);
+            break;
+        case DESCR_FORMAT_COL_BY_COL:
+            hog::extract_descrs_by_cols(win_size_.height, win_size_.width,
+                                        block_stride_.height, block_stride_.width,
+                                        win_stride_.height, win_stride_.width,
+                                        img.rows, img.cols,
+                                        block_hists.ptr<float>(),
+                                        descriptors);
+            break;
+        default:
+            CV_Error(cv::Error::StsBadArg, "Unknown descriptor format"); // setDescriptorFormat() does not validate, so bad values surface here
+        }
+    }
+
+    int HOG_Impl::getTotalHistSize(Size img_size) const // element count used to size the block_hists buffer for an image of img_size
+    {
+        size_t block_hist_size = getBlockHistogramSize();
+        Size blocks_per_img = numPartsWithin(img_size, block_size_, block_stride_);
+        return static_cast<int>(block_hist_size * blocks_per_img.area()); // values per block times blocks in the image
+    }
+
+    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists) // gradients -> block histograms -> normalisation
+    {
+        cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
+        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height); // NOTE(review): presumably publishes layout constants for the later hog:: calls - confirm in hog.cu
+
+        BufferPool pool(Stream::Null());
+
+        GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
+        GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
+        computeGradient(img, grad, qangle);
+
+        block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1); // same shape the callers already request from their pool
+
+        hog::compute_hists(nbins_,
+                           block_stride_.width, block_stride_.height,
+                           img.rows, img.cols,
+                           grad, qangle,
+                           (float)getWinSigma(),
+                           block_hists.ptr<float>());
+
+        hog::normalize_hists(nbins_,
+                             block_stride_.width, block_stride_.height,
+                             img.rows, img.cols,
+                             block_hists.ptr<float>(),
+                             (float)threshold_L2hys_);
+    }
+
+    void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle) // dispatches to the gradient routine matching img.type()
+    {
+        grad.create(img.size(), CV_32FC2);  // two float channels per pixel
+        qangle.create(img.size(), CV_8UC2); // two byte channels per pixel
+
+        float angleScale = (float)(nbins_ / CV_PI); // nbins_ divided by pi
+        switch (img.type()) // no default: the public entry points assert the type is CV_8UC1 or CV_8UC4
+        {
+            case CV_8UC1:
+                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
+                break;
+            case CV_8UC4:
+                hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
+                break;
+        }
+    }
+}
+
+Ptr<cuda::HOG> cv::cuda::HOG::create(Size win_size, // public factory for the CUDA build
+                                     Size block_size,
+                                     Size block_stride,
+                                     Size cell_size,
+                                     int nbins)
+{
+    return makePtr<HOG_Impl>(win_size, block_size, block_stride, cell_size, nbins); // arguments are forwarded unchanged, so the constructor's CV_Assert checks apply
+}
+
+namespace
+{
+    static Mat getPeopleDetector48x96()
+    {
+        static float detector[] = {
+            0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
+            0.145939f, 0.061520f, 0.328699f, 0.227148f, -0.066467f, -0.086723f,
+            0.047559f, 0.106714f, 0.037897f, 0.111461f, -0.024406f, 0.304769f,
+            0.254676f, -0.069235f, 0.082566f, 0.147260f, 0.326969f, 0.148888f,
+            0.055270f, -0.087985f, 0.261720f, 0.143442f, 0.026812f, 0.238212f,
+            0.194020f, 0.056341f, -0.025854f, -0.034444f, -0.156631f, 0.205174f,
+            0.089008f, -0.139811f, -0.100147f, -0.037830f, -0.029230f, -0.055641f,
+            0.033248f, -0.016512f, 0.155244f, 0.247315f, -0.124694f, -0.048414f,
+            -0.062219f, 0.193683f, 0.004574f, 0.055089f, 0.093565f, 0.167712f,
+            0.167581f, 0.018895f, 0.215258f, 0.122609f, 0.090520f, -0.067219f,
+            -0.049029f, -0.099615f, 0.241804f, -0.094893f, -0.176248f, 0.001727f,
+            -0.134473f, 0.104442f, 0.050942f, 0.081165f, 0.072156f, 0.121646f,
+            0.002656f, -0.297974f, -0.133587f, -0.060121f, -0.092515f, -0.048974f,
+            -0.084754f, -0.180111f, -0.038590f, 0.086283f, -0.134636f, -0.107249f,
+            0.132890f, 0.141556f, 0.249425f, 0.130273f, -0.030031f, 0.073212f,
+            -0.008155f, 0.019931f, 0.071688f, 0.000300f, -0.019525f, -0.021725f,
+            -0.040993f, -0.086841f, 0.070124f, 0.240033f, 0.265350f, 0.043208f,
+            0.166754f, 0.091453f, 0.060916f, -0.036972f, -0.091043f, 0.079873f,
+            0.219781f, 0.158102f, -0.140618f, -0.043016f, 0.124802f, 0.093668f,
+            0.103208f, 0.094872f, 0.080541f, 0.137711f, 0.160566f, -0.169231f,
+            0.013983f, 0.309508f, -0.004217f, -0.057200f, -0.064489f, 0.014066f,
+            0.361009f, 0.251328f, -0.080983f, -0.044183f, 0.061436f, -0.037381f,
+            -0.078786f, 0.030993f, 0.066314f, 0.037683f, 0.152325f, -0.091683f,
+            0.070203f, 0.217856f, 0.036435f, -0.076462f, 0.006254f, -0.094431f,
+            0.154829f, -0.023038f, -0.196961f, -0.024594f, 0.178465f, -0.050139f,
+            -0.045932f, -0.000965f, 0.109112f, 0.046165f, -0.159373f, -0.008713f,
+            0.041307f, 0.097129f, -0.057211f, -0.064599f, 0.077165f, 0.176167f,
+            0.138322f, 0.065753f, -0.104950f, 0.017933f, 0.136255f, -0.011598f,
+            0.047007f, 0.080550f, 0.068619f, 0.084661f, -0.035493f, -0.091314f,
+            -0.041411f, 0.060971f, -0.101912f, -0.079870f, -0.085977f, -0.022686f,
+            0.079788f, -0.098064f, -0.054603f, 0.040383f, 0.300794f, 0.128603f,
+            0.094844f, 0.047407f, 0.101825f, 0.061832f, -0.162160f, -0.204553f,
+            -0.035165f, 0.101450f, -0.016641f, -0.027140f, -0.134392f, -0.008743f,
+            0.102331f, 0.114853f, 0.009644f, 0.062823f, 0.237339f, 0.167843f,
+            0.053066f, -0.012592f, 0.043158f, 0.002305f, 0.065001f, -0.038929f,
+            -0.020356f, 0.152343f, 0.043469f, -0.029967f, -0.042948f, 0.032481f,
+            0.068488f, -0.110840f, -0.111083f, 0.111980f, -0.002072f, -0.005562f,
+            0.082926f, 0.006635f, -0.108153f, 0.024242f, -0.086464f, -0.189884f,
+            -0.017492f, 0.191456f, -0.007683f, -0.128769f, -0.038017f, -0.132380f,
+            0.091926f, 0.079696f, -0.106728f, -0.007656f, 0.172744f, 0.011576f,
+            0.009883f, 0.083258f, -0.026516f, 0.145534f, 0.153924f, -0.130290f,
+            -0.108945f, 0.124490f, -0.003186f, -0.100485f, 0.015024f, -0.060512f,
+            0.026288f, -0.086713f, -0.169012f, 0.076517f, 0.215778f, 0.043701f,
+            -0.131642f, -0.012585f, -0.045181f, -0.118183f, -0.241544f, -0.167293f,
+            -0.020107f, -0.019917f, -0.101827f, -0.107096f, -0.010503f, 0.044938f,
+            0.189680f, 0.217119f, -0.046086f, 0.044508f, 0.199716f, -0.036004f,
+            -0.148927f, 0.013355f, -0.078279f, 0.030451f, 0.056301f, -0.024609f,
+            0.083224f, 0.099533f, -0.039432f, -0.138880f, 0.005482f, -0.024120f,
+            -0.140468f, -0.066381f, -0.017057f, 0.009260f, -0.058004f, -0.028486f,
+            -0.061610f, 0.007483f, -0.158309f, -0.150687f, -0.044595f, -0.105121f,
+            -0.045763f, -0.006618f, -0.024419f, -0.117713f, -0.119366f, -0.175941f,
+            -0.071542f, 0.119027f, 0.111362f, 0.043080f, 0.034889f, 0.093003f,
+            0.007842f, 0.057368f, -0.108834f, -0.079968f, 0.230959f, 0.020205f,
+            0.011470f, 0.098877f, 0.101310f, -0.030215f, -0.018018f, -0.059552f,
+            -0.106157f, 0.021866f, -0.036471f, 0.080051f, 0.041165f, -0.082101f,
+            0.117726f, 0.030961f, -0.054763f, -0.084102f, -0.185778f, -0.061305f,
+            -0.038089f, -0.110728f, -0.264010f, 0.076675f, -0.077111f, -0.137644f,
+            0.036232f, 0.277995f, 0.019116f, 0.107738f, 0.144003f, 0.080304f,
+            0.215036f, 0.228897f, 0.072713f, 0.077773f, 0.120168f, 0.075324f,
+            0.062730f, 0.122478f, -0.049008f, 0.164912f, 0.162450f, 0.041246f,
+            0.009891f, -0.097827f, -0.038700f, -0.023027f, -0.120020f, 0.203364f,
+            0.248474f, 0.149810f, -0.036276f, -0.082814f, -0.090343f, -0.027143f,
+            -0.075689f, -0.320310f, -0.000500f, -0.143334f, -0.065077f, -0.186936f,
+            0.129372f, 0.116431f, 0.181699f, 0.170436f, 0.418854f, 0.460045f,
+            0.333719f, 0.230515f, 0.047822f, -0.044954f, -0.068086f, 0.140179f,
+            -0.044821f, 0.085550f, 0.092483f, -0.107296f, -0.130670f, -0.206629f,
+            0.114601f, -0.317869f, -0.076663f, 0.038680f, 0.212753f, -0.016059f,
+            -0.126526f, -0.163602f, 0.210154f, 0.099887f, -0.126366f, 0.118453f,
+            0.019309f, -0.021611f, -0.096499f, -0.111809f, -0.200489f, 0.142854f,
+            0.228840f, -0.353346f, -0.179151f, 0.116834f, 0.252389f, -0.031728f,
+            -0.188135f, -0.158998f, 0.386523f, 0.122315f, 0.209944f, 0.394023f,
+            0.359030f, 0.260717f, 0.170335f, 0.013683f, -0.142596f, -0.026138f,
+            -0.011878f, -0.150519f, 0.047159f, -0.107062f, -0.147347f, -0.187689f,
+            -0.186027f, -0.208048f, 0.058468f, -0.073026f, -0.236556f, -0.079788f,
+            -0.146216f, -0.058563f, -0.101361f, -0.071294f, -0.071093f, 0.116919f,
+            0.234304f, 0.306781f, 0.321866f, 0.240000f, 0.073261f, -0.012173f,
+            0.026479f, 0.050173f, 0.166127f, 0.228955f, 0.061905f, 0.156460f,
+            0.205990f, 0.120672f, 0.037350f, 0.167884f, 0.290099f, 0.420900f,
+            -0.012601f, 0.189839f, 0.306378f, 0.118383f, -0.095598f, -0.072360f,
+            -0.132496f, -0.224259f, -0.126021f, 0.022714f, 0.284039f, 0.051369f,
+            -0.000927f, -0.058735f, -0.083354f, -0.141254f, -0.187578f, -0.202669f,
+            0.048902f, 0.246597f, 0.441863f, 0.342519f, 0.066979f, 0.215286f,
+            0.188191f, -0.072240f, -0.208142f, -0.030196f, 0.178141f, 0.136985f,
+            -0.043374f, -0.181098f, 0.091815f, 0.116177f, -0.126690f, -0.386625f,
+            0.368165f, 0.269149f, -0.088042f, -0.028823f, 0.092961f, 0.024099f,
+            0.046112f, 0.176756f, 0.135849f, 0.124955f, 0.195467f, -0.037218f,
+            0.167217f, 0.188938f, 0.053528f, -0.066561f, 0.133721f, -0.070565f,
+            0.115898f, 0.152435f, -0.116993f, -0.110592f, -0.179005f, 0.026668f,
+            0.080530f, 0.075084f, -0.070401f, 0.012497f, 0.021849f, -0.139764f,
+            -0.022020f, -0.096301f, -0.064954f, -0.127446f, -0.013806f, -0.108315f,
+            0.156285f, 0.149867f, -0.011382f, 0.064532f, 0.029168f, 0.027393f,
+            0.069716f, 0.153735f, 0.038459f, 0.230714f, 0.253840f, 0.059522f,
+            -0.045053f, 0.014083f, 0.071103f, 0.068747f, 0.095887f, 0.005832f,
+            0.144887f, 0.026357f, -0.067359f, -0.044151f, -0.123283f, -0.019911f,
+            0.005318f, 0.109208f, -0.003201f, -0.021734f, 0.142025f, -0.066907f,
+            -0.120070f, -0.188639f, 0.012472f, -0.048704f, -0.012366f, -0.184828f,
+            0.168591f, 0.267166f, 0.058208f, -0.044101f, 0.033500f, 0.178558f,
+            0.104550f, 0.122418f, 0.080177f, 0.173246f, 0.298537f, 0.064173f,
+            0.053397f, 0.174341f, 0.230984f, 0.117025f, 0.166242f, 0.227781f,
+            0.120623f, 0.176952f, -0.011393f, -0.086483f, -0.008270f, 0.051700f,
+            -0.153369f, -0.058837f, -0.057639f, -0.060115f, 0.026349f, -0.160745f,
+            -0.037894f, -0.048575f, 0.041052f, -0.022112f, 0.060365f, 0.051906f,
+            0.162657f, 0.138519f, -0.050185f, -0.005938f, 0.071301f, 0.127686f,
+            0.062342f, 0.144400f, 0.072600f, 0.198436f, 0.246219f, -0.078185f,
+            -0.036169f, 0.075934f, 0.047328f, -0.013601f, 0.087205f, 0.019900f,
+            0.022606f, -0.015365f, -0.092506f, 0.075275f, -0.116375f, 0.050500f,
+            0.045118f, 0.166567f, 0.072073f, 0.060371f, 0.131747f, -0.169863f,
+            -0.039352f, -0.047486f, -0.039797f, -0.204312f, 0.021710f, 0.129443f,
+            -0.021173f, 0.173416f, -0.070794f, -0.063986f, 0.069689f, -0.064099f,
+            -0.123201f, -0.017372f, -0.206870f, 0.065863f, 0.113226f, 0.024707f,
+            -0.071341f, -0.066964f, -0.098278f, -0.062927f, 0.075840f, 0.014716f,
+            0.019378f, 0.132699f, -0.074191f, -0.089557f, -0.078446f, -0.197488f,
+            -0.173665f, 0.052583f, 0.044361f, 0.113549f, 0.098492f, 0.077379f,
+            -0.011146f, -0.192593f, -0.164435f, 0.045568f, 0.205699f, 0.049187f,
+            -0.082281f, 0.134874f, 0.185499f, 0.034968f, -0.119561f, -0.112372f,
+            -0.115091f, -0.054042f, -0.183816f, -0.078100f, 0.190695f, 0.091617f,
+            0.004257f, -0.041135f, -0.061453f, -0.141592f, -0.194809f, -0.120638f,
+            0.020168f, 0.109672f, 0.067398f, -0.015238f, -0.239145f, -0.264671f,
+            -0.185176f, 0.050472f, 0.020793f, 0.035678f, 0.022839f, -0.052055f,
+            -0.127968f, -0.113049f, -0.228416f, -0.258281f, -0.053437f, 0.076424f,
+            0.061450f, 0.237478f, 0.003618f, -0.055865f, -0.108087f, -0.028937f,
+            0.045585f, 0.052829f, -0.001471f, 0.022826f, 0.059565f, -0.104430f,
+            -0.077266f, -0.211882f, -0.212078f, 0.028074f, 0.075846f, 0.016265f,
+            0.161879f, 0.134477f, 0.008935f, -0.048041f, 0.074692f, 0.004928f,
+            -0.025156f, 0.192874f, 0.074410f, 0.308732f, 0.267400f, 0.094208f,
+            -0.005251f, 0.042041f, -0.032148f, 0.015588f, 0.252869f, 0.175302f,
+            0.022892f, 0.081673f, 0.063208f, 0.162626f, 0.194426f, 0.233890f,
+            0.262292f, 0.186930f, 0.084079f, -0.286388f, -0.213034f, -0.048867f,
+            -0.207669f, -0.170050f, 0.011673f, -0.092958f, -0.192786f, -0.273536f,
+            0.230904f, 0.266732f, 0.320519f, 0.297155f, 0.548169f, 0.304922f,
+            0.132687f, 0.247333f, 0.212488f, -0.271472f, -0.142105f, -0.002627f,
+            -0.119215f, 0.128383f, 0.100079f, -0.057490f, -0.121902f, -0.228892f,
+            0.202292f, -0.399795f, -0.371326f, -0.095836f, -0.063626f, -0.161375f,
+            -0.311180f, -0.294797f, 0.242122f, 0.011788f, 0.095573f, 0.322523f,
+            0.511840f, 0.322880f, 0.313259f, 0.173331f, 0.002542f, -0.029802f,
+            0.324766f, -0.326170f, -0.340547f, -0.138288f, -0.002963f, -0.114060f,
+            -0.377312f, -0.442570f, 0.212446f, -0.007759f, -0.011576f, 0.169711f,
+            0.308689f, 0.317348f, 0.539390f, 0.332845f, 0.057331f, -0.068180f,
+            0.101994f, 0.266995f, 0.209570f, 0.355730f, 0.091635f, 0.170238f,
+            0.125215f, 0.274154f, 0.070223f, 0.025515f, 0.049946f, -0.000550f,
+            0.043715f, -0.141843f, 0.020844f, 0.129871f, 0.256588f, 0.105015f,
+            0.148339f, 0.170682f, 0.028792f, 0.074037f, 0.160042f, 0.405137f,
+            0.246187f, 0.352160f, 0.168951f, 0.222263f, 0.264439f, 0.065945f,
+            0.021963f, -0.075084f, 0.093105f, 0.027318f, 0.098864f, 0.057566f,
+            -0.080282f, 0.185032f, 0.314419f, 0.333727f, 0.125798f, 0.294919f,
+            0.386002f, 0.217619f, -0.183517f, -0.278622f, -0.002342f, -0.027821f,
+            -0.134266f, -0.331843f, -0.008296f, 0.124564f, 0.053712f, -0.369016f,
+            -0.095036f, 0.209381f, 0.423760f, 0.371760f, 0.106397f, 0.369408f,
+            0.485608f, 0.231201f, -0.138685f, -0.349208f, -0.070083f, 0.028991f,
+            -0.081630f, -0.395992f, -0.146791f, -0.027354f, 0.063396f, -0.272484f,
+            0.058299f, 0.338207f, 0.110767f, -0.052642f, -0.233848f, -0.027448f,
+            0.030328f, 0.155572f, -0.093826f, 0.019331f, 0.120638f, 0.006292f,
+            -0.106083f, -0.236290f, -0.140933f, -0.088067f, -0.025138f, -0.208395f,
+            -0.025502f, 0.144192f, -0.048353f, -0.106144f, -0.305121f, -0.114147f,
+            0.090963f, 0.327727f, 0.035606f, -0.093779f, 0.002651f, -0.171081f,
+            -0.188131f, -0.216571f, -0.209101f, -0.054402f, 0.157147f, -0.057127f,
+            0.066584f, 0.008988f, 0.041191f, 0.034456f, -0.078255f, 0.052099f,
+            -0.022239f, 0.066981f, -0.117520f, -0.072637f, 0.062512f, 0.037570f,
+            -0.057544f, -0.312359f, 0.034357f, -0.031549f, 0.002566f, -0.207375f,
+            -0.070654f, -0.018786f, -0.044815f, -0.012814f, -0.076320f, 0.078183f,
+            0.023877f, 0.117078f, 0.022292f, -0.205424f, -0.060430f, -0.017296f,
+            -0.004827f, -0.321036f, -0.092155f, 0.038837f, 0.073190f, -0.067513f,
+            0.026521f, 0.171945f, 0.087318f, 0.034495f, -0.034089f, 0.154410f,
+            -0.061431f, 0.007435f, -0.111094f, -0.095976f, 0.014741f, -0.132324f,
+            -0.029517f, -0.192160f, 0.098667f, 0.020762f, 0.177050f, -0.064510f,
+            -0.054437f, -0.058678f, -0.001858f, 0.167602f, 0.015735f, 0.054338f,
+            0.016477f, 0.186381f, -0.010667f, 0.054692f, 0.126742f, 0.013140f,
+            0.090353f, -0.133608f, -0.018017f, -0.152619f, 0.027600f, -0.138700f,
+            -0.050274f, 0.045141f, -0.118731f, 0.094797f, -0.167605f, 0.097461f,
+            -0.009131f, 0.199920f, -0.052976f, 0.158194f, 0.178568f, -0.107600f,
+            0.009671f, -0.084072f, -0.040258f, -0.205673f, 0.102891f, 0.223511f,
+            0.042699f, 0.118548f, -0.021274f, 0.110997f, -0.155121f, 0.027696f,
+            -0.149968f, 0.051552f, -0.129219f, 0.173524f, 0.073972f, -0.189045f,
+            -0.034523f, -0.106655f, -0.011843f, -0.197381f, 0.219413f, 0.183197f,
+            -0.054920f, 0.144955f, 0.036517f, -0.085412f, -0.229070f, -0.143710f,
+            -0.049486f, 0.156634f, -0.008673f, -0.064778f, 0.082344f, 0.145673f,
+            0.002912f, -0.210121f, -0.116564f, 0.078425f, 0.220908f, -0.067594f,
+            0.048610f, 0.084912f, -0.066202f, -0.112515f, -0.217767f, -0.082640f,
+            -0.017414f, 0.230265f, -0.070735f, 0.066073f, 0.215256f, 0.071157f,
+            -0.087220f, -0.202235f, -0.011918f, 0.099562f, 0.174716f, -0.063845f,
+            -0.121055f, 0.014367f, 0.132709f, -0.005060f, -0.244606f, -0.179693f,
+            -0.134690f, 0.023239f, -0.193116f, -0.076975f, -0.021164f, -0.001938f,
+            -0.163799f, -0.111437f, -0.210362f, -0.166376f, 0.034754f, 0.010036f,
+            -0.021917f, 0.068014f, -0.086893f, -0.251746f, -0.267171f, 0.037383f,
+            0.003966f, 0.033571f, -0.151506f, 0.025437f, -0.020626f, -0.308454f,
+            -0.343143f, -0.092263f, -0.026261f, -0.028345f, 0.036036f, 0.035169f,
+            0.129470f, 0.122205f, 0.015661f, -0.070612f, -0.094333f, -0.066055f,
+            -0.041083f, 0.159146f, 0.073184f, 0.110044f, 0.174471f, 0.078069f,
+            -0.014881f, 0.008116f, 0.013209f, 0.075857f, 0.195605f, 0.062714f,
+            0.067955f, 0.056544f, -0.153908f, -0.141749f, -0.072550f, 0.033523f,
+            -0.024665f, 0.134487f, 0.079076f, 0.133562f, 0.227130f, 0.018054f,
+            0.004928f, 0.169162f, 0.065152f, 0.072160f, 0.131631f, 0.096303f,
+            0.054288f, 0.106256f, 0.114632f, 0.119038f, 0.515200f, 0.247429f,
+            0.199134f, 0.211957f, 0.127558f, -0.294684f, -0.194890f, -0.049988f,
+            -0.112247f, -0.008122f, -0.006176f, 0.037035f, -0.110881f, -0.249989f,
+            0.152434f, 0.234621f, 0.153340f, 0.349283f, 0.683049f, 0.157174f,
+            0.124844f, 0.099136f, 0.064407f, -0.248400f, -0.155323f, -0.026498f,
+            -0.023450f, 0.049051f, -0.114187f, 0.007195f, -0.176825f, -0.376926f,
+            0.366159f, -0.179938f, -0.148508f, 0.006043f, 0.170048f, 0.097866f,
+            -0.102658f, -0.260430f, 0.248868f, 0.037019f, -0.118111f, 0.078176f,
+            0.194171f, 0.211328f, 0.368612f, 0.361213f, 0.130013f, 0.094650f,
+            0.227396f, -0.178058f, -0.114782f, -0.008093f, 0.231080f, -0.011843f,
+            -0.097917f, -0.325788f, 0.141879f, 0.119738f, -0.230427f, -0.117419f,
+            -0.114153f, 0.037903f, 0.116383f, 0.218773f, -0.101884f, 0.059466f,
+            0.119255f, 0.010874f, -0.031449f, 0.045996f, 0.119931f, 0.273760f,
+            0.311700f, 0.261794f, 0.194809f, 0.339829f, 0.239449f, 0.064140f,
+            0.077597f, 0.098996f, 0.143534f, 0.184602f, 0.037507f, 0.225494f,
+            0.096142f, -0.147370f, -0.207833f, -0.174742f, -0.086391f, -0.038942f,
+            0.159577f, -0.088492f, -0.000989f, 0.108154f, -0.025890f, -0.072713f,
+            0.025997f, -0.006803f, -0.086879f, -0.011290f, -0.269200f, -0.103450f,
+            -0.124910f, -0.116340f, 0.141459f, 0.208800f, 0.042268f, 0.265034f,
+            0.516474f, 0.217591f, -0.018843f, -0.313328f, -0.168363f, 0.047129f,
+            0.090480f, -0.109852f, -0.018761f, 0.210669f, 0.281269f, -0.043591f,
+            -0.034147f, -0.237772f, -0.134843f, -0.072481f, -0.103831f, 0.038355f,
+            0.308619f, 0.148023f, -0.045867f, -0.123950f, -0.210860f, -0.064973f,
+            -0.036308f, -0.046731f, -0.022099f, 0.095776f, 0.409423f, 0.060635f,
+            -0.065196f, 0.051828f, 0.027981f, -0.009609f, -0.137681f, -0.095011f,
+            -0.019045f, 0.177278f, 0.009759f, -0.092119f, -0.016958f, -0.133860f,
+            -0.118421f, -0.032039f, -0.006214f, -0.084541f, 0.063971f, -0.073642f,
+            0.165676f, 0.110443f, 0.044131f, 0.046568f, 0.053292f, -0.055466f,
+            0.015512f, 0.371947f, 0.232102f, -0.016923f, 0.103979f, -0.091758f,
+            0.005907f, 0.209100f, 0.157433f, 0.030518f, 0.250366f, 0.062322f,
+            0.036720f, 0.094676f, 0.017306f, -0.010328f, -0.079012f, 0.016781f,
+            -0.112435f, 0.061795f, 0.042543f, -0.126799f, -0.009975f, -0.056760f,
+            0.046424f, -0.194712f, -0.139399f, -0.037731f, 0.157989f, -0.016261f,
+            0.123345f, 0.230563f, 0.083300f, -0.016392f, 0.059567f, -0.016035f,
+            -0.064767f, 0.231945f, 0.156629f, 0.034602f, 0.145628f, 0.041315f,
+            0.034535f, 0.019967f, -0.089188f, -0.012091f, 0.307857f, 0.211405f,
+            -0.025091f, -0.148249f, -0.129384f, 0.063536f, -0.068603f, -0.067941f,
+            -0.035104f, 0.210832f, 0.063810f, 0.062764f, -0.089889f, -0.030554f,
+            0.014791f, -0.053362f, -0.037818f, -0.196640f, 0.008388f, -0.082654f,
+            0.143056f, 0.064221f, 0.069795f, 0.191040f, 0.097321f, -0.028679f,
+            0.075794f, 0.313154f, 0.086240f, 0.207643f, 0.017809f, 0.122867f,
+            0.224586f, 0.167403f, -0.023884f, 0.047434f, 0.344091f, 0.187745f,
+            0.136177f, 0.141738f, 0.063799f, 0.045233f, -0.077342f, -0.003525f,
+            -0.165041f, -0.025616f, -0.073745f, 0.164439f, 0.011200f, -0.145896f,
+            -0.027954f, -0.061987f, -0.039874f, -0.142775f, 0.151042f, -0.038238f,
+            0.053152f, 0.078615f, 0.086061f, 0.100593f, 0.128046f, -0.071006f,
+            -0.116558f, 0.208445f, 0.051086f, 0.076843f, 0.023191f, -0.084781f,
+            -0.011790f, 0.147807f, -0.048554f, -0.113932f, 0.283322f, 0.190934f,
+            0.092789f, 0.033018f, -0.142428f, -0.142480f, -0.099023f, -0.041020f,
+            -0.042760f, 0.203295f, -0.053475f, 0.042424f, 0.222839f, -0.019167f,
+            -0.133176f, -0.276216f, -0.031998f, 0.117290f, 0.177827f, -0.059973f,
+            -0.064744f, -0.117040f, -0.155482f, -0.099531f, 0.164121f, -0.026682f,
+            -0.093810f, 0.238993f, -0.006506f, 0.007830f, 0.065819f, -0.203643f,
+            -0.100925f, -0.053652f, -0.130770f, 0.026277f, 0.131796f, 0.032742f,
+            0.127186f, 0.116694f, -0.161122f, -0.279773f, -0.252515f, -0.002638f,
+            0.042812f, 0.096776f, -0.123280f, 0.064858f, -0.010455f, -0.219760f,
+            -0.239331f, -0.104363f, -0.058022f, -0.053584f, 0.025611f, 0.005129f,
+            -0.100418f, -0.045712f, -0.194418f, -0.126366f, -0.030530f, 0.051168f,
+            0.215959f, 0.172402f, -0.054700f, -0.185995f, -0.278360f, -0.193693f,
+            -0.040309f, 0.003735f, -0.007770f, 0.123556f, 0.190179f, -0.077315f,
+            0.117403f, 0.212942f, 0.012160f, 0.000113f, 0.027331f, 0.040202f,
+            0.033293f, 0.219438f, 0.184174f, 0.259349f, 0.311206f, 0.082547f,
+            -0.047875f, -0.078417f, 0.010746f, 0.082620f, 0.311931f, 0.307605f,
+            0.003863f, 0.021405f, -0.026388f, -0.019572f, 0.020582f, -0.059353f,
+            0.025199f, 0.261319f, 0.086316f, 0.143614f, 0.107780f, 0.003900f,
+            -0.188397f, -0.038563f, -0.106045f, -0.125154f, -0.010509f, 0.054021f,
+            0.242130f, 0.279152f, 0.215546f, 0.346995f, 0.440856f, 0.237452f,
+            0.234154f, 0.301646f, 0.168929f, -0.208358f, -0.126848f, 0.010260f,
+            0.121018f, -0.062975f, -0.052848f, 0.050341f, -0.061103f, -0.266482f,
+            0.107186f, 0.140221f, 0.280065f, 0.287889f, 0.373198f, 0.151596f,
+            0.013593f, 0.115616f, 0.014616f, -0.281710f, -0.237597f, -0.117305f,
+            -0.000034f, -0.136739f, -0.196275f, -0.095225f, -0.125310f, -0.250514f,
+            0.236804f, -0.071805f, -0.037421f, 0.048230f, 0.321596f, 0.063632f,
+            0.024039f, -0.029133f, 0.230983f, 0.160593f, -0.154355f, -0.013086f,
+            -0.079929f, 0.094692f, 0.160391f, 0.180239f, 0.053895f, 0.100759f,
+            0.288631f, 0.038191f, 0.181692f, 0.229682f, 0.440166f, 0.063401f,
+            0.006273f, 0.020865f, 0.338695f, 0.256244f, -0.043927f, 0.115617f,
+            0.003296f, 0.173965f, 0.021318f, -0.040936f, -0.118932f, 0.182380f,
+            0.235922f, -0.053233f, -0.015053f, -0.101057f, 0.095341f, 0.051111f,
+            0.161831f, 0.032614f, 0.159496f, 0.072375f, 0.025089f, 0.023748f,
+            0.029151f, 0.161284f, -0.117717f, -0.036191f, -0.176822f, -0.162006f,
+            0.226542f, -0.078329f, 0.043079f, -0.119172f, 0.054614f, -0.101365f,
+            -0.064541f, -0.115304f, 0.135170f, 0.298872f, 0.098060f, 0.089428f,
+            -0.007497f, 0.110391f, -0.028824f, 0.020835f, -0.036804f, 0.125411f,
+            0.192105f, -0.048931f, 0.003086f, -0.010681f, 0.074698f, -0.016263f,
+            0.096063f, 0.060267f, -0.007277f, 0.139139f, -0.080635f, 0.036628f,
+            0.086058f, 0.131979f, 0.085707f, 0.025301f, 0.226094f, 0.194759f,
+            0.042193f, -0.157846f, -0.068402f, -0.141450f, -0.112659f, -0.076305f,
+            -0.069085f, -0.114332f, -0.102005f, 0.132193f, -0.067042f, 0.106643f,
+            0.198964f, 0.171616f, 0.167237f, -0.033730f, -0.026755f, 0.083621f,
+            0.149459f, -0.002799f, -0.000318f, 0.011753f, 0.065889f, -0.089375f,
+            -0.049610f, 0.224579f, 0.216548f, -0.034908f, -0.017851f, -0.088144f,
+            0.007530f, 0.240268f, 0.073270f, 0.013263f, 0.175323f, 0.012082f,
+            0.093993f, 0.015282f, 0.105854f, 0.107990f, 0.077798f, -0.096166f,
+            -0.079607f, 0.177820f, 0.142392f, 0.033337f, -0.078100f, -0.081616f,
+            -0.046993f, 0.139459f, 0.020272f, -0.123161f, 0.175269f, 0.105217f,
+            0.057328f, 0.080909f, -0.012612f, -0.097081f, 0.082060f, -0.096716f,
+            -0.063921f, 0.201884f, 0.128166f, -0.035051f, -0.032227f, -0.068139f,
+            -0.115915f, 0.095080f, -0.086007f, -0.067543f, 0.030776f, 0.032712f,
+            0.088937f, 0.054336f, -0.039329f, -0.114022f, 0.171672f, -0.112321f,
+            -0.217646f, 0.065186f, 0.060223f, 0.192174f, 0.055580f, -0.131107f,
+            -0.144338f, 0.056730f, -0.034707f, -0.081616f, -0.135298f, -0.000614f,
+            0.087189f, 0.014614f, 0.067709f, 0.107689f, 0.225780f, 0.084361f,
+            -0.008544f, 0.051649f, -0.048369f, -0.037739f, -0.060710f, 0.002654f,
+            0.016935f, 0.085563f, -0.015961f, -0.019265f, 0.111788f, 0.062376f,
+            0.202019f, 0.047713f, 0.042261f, 0.069716f, 0.242913f, 0.021052f,
+            -0.072812f, -0.155920f, -0.026436f, 0.035621f, -0.079300f, -0.028787f,
+            -0.048329f, 0.084718f, -0.060565f, -0.083750f, -0.164075f, -0.040742f,
+            -0.086219f, 0.015271f, -0.005204f, -0.016038f, 0.045816f, -0.050433f,
+            -0.077652f, 0.117109f, 0.009611f, -0.009045f, -0.008634f, -0.055373f,
+            -0.085968f, 0.028527f, -0.054736f, -0.168089f, 0.175839f, 0.071205f,
+            -0.023603f, 0.037907f, -0.004561f, -0.022634f, 0.123831f, 0.094469f,
+            -0.072920f, -0.133642f, -0.014032f, -0.142754f, -0.026999f, -0.199409f,
+            0.013268f, 0.226989f, 0.048650f, -0.170988f, -0.050141f, 0.007880f,
+            0.061880f, 0.019078f, -0.043578f, -0.038139f, 0.134814f, 0.054097f,
+            -0.081670f, 0.176838f, 0.047920f, -0.038176f, 0.050406f, -0.107181f,
+            -0.036279f, 0.027060f, 0.081594f, -0.002820f, 0.090507f, -0.033338f,
+            -0.059571f, 0.013404f, -0.099860f, 0.073371f, 0.342805f, 0.098305f,
+            -0.150910f, -0.020822f, -0.056960f, 0.046262f, -0.043413f, -0.149405f,
+            -0.129105f, -0.010899f, -0.014229f, -0.179949f, -0.113044f, -0.049468f,
+            -0.065513f, 0.090269f, -0.011919f, 0.087846f, 0.095796f, 0.146127f,
+            0.101599f, 0.078066f, -0.084348f, -0.100002f, -0.020134f, -0.050169f,
+            0.062122f, 0.014640f, 0.019143f, 0.036543f, 0.180924f, -0.013976f,
+            -0.066768f, -0.001090f, -0.070419f, -0.004839f, -0.001504f, 0.034483f,
+            -0.044954f, -0.050336f, -0.088638f, -0.174782f, -0.116082f, -0.205507f,
+            0.015587f, -0.042839f, -0.096879f, -0.144097f, -0.050268f, -0.196796f,
+            0.109639f, 0.271411f, 0.173732f, 0.108070f, 0.156437f, 0.124255f,
+            0.097242f, 0.238693f, 0.083941f, 0.109105f, 0.223940f, 0.267188f,
+            0.027385f, 0.025819f, 0.125070f, 0.093738f, 0.040353f, 0.038645f,
+            -0.012730f, 0.144063f, 0.052931f, -0.009138f, 0.084193f, 0.160272f,
+            -0.041366f, 0.011951f, -0.121446f, -0.106713f, -0.047566f, 0.047984f,
+            -0.255224f, -0.076116f, 0.098685f, -0.150845f, -0.171513f, -0.156590f,
+            0.058331f, 0.187493f, 0.413018f, 0.554265f, 0.372242f, 0.237943f,
+            0.124571f, 0.110829f, 0.010322f, -0.174477f, -0.067627f, -0.001979f,
+            0.142913f, 0.040597f, 0.019907f, 0.025963f, -0.043585f, -0.120732f,
+            0.099937f, 0.091059f, 0.247307f, 0.204226f, -0.042753f, -0.068580f,
+            -0.119002f, 0.026722f, 0.034853f, -0.060934f, -0.025054f, -0.093026f,
+            -0.035372f, -0.233209f, -0.049869f, -0.039151f, -0.022279f, -0.065380f,
+            -9.063785f };
+
+        return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
+    }
+
+    Mat getPeopleDetector64x128()
+    {
+        static float detector[] = {
+           0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
+           0.11547081f, -0.04268804f, 0.04635834f, -0.05468199f, 0.08232084f,
+           0.10424068f, -0.02294518f, 0.01108519f, 0.01378693f, 0.11193510f,
+           0.01268418f, 0.08528346f, -0.06309239f, 0.13054633f, 0.08100729f,
+           -0.05209739f, -0.04315529f, 0.09341384f, 0.11035026f, -0.07596218f,
+           -0.05517511f, -0.04465296f, 0.02947334f, 0.04555536f,
+           -3.55954492e-003f, 0.07818956f, 0.07730991f, 0.07890715f, 0.06222893f,
+           0.09001380f, -0.03574381f, 0.03414327f, 0.05677258f, -0.04773581f,
+           0.03746637f, -0.03521175f, 0.06955440f, -0.03849038f, 0.01052293f,
+           0.01736112f, 0.10867710f, 0.08748853f, 3.29739624e-003f, 0.10907028f,
+           0.07913758f, 0.10393070f, 0.02091867f, 0.11594022f, 0.13182420f,
+           0.09879354f, 0.05362710f, -0.06745391f, -7.01260753e-003f,
+           5.24702156e-003f, 0.03236255f, 0.01407916f, 0.02207983f, 0.02537322f,
+           0.04547948f, 0.07200756f, 0.03129894f, -0.06274468f, 0.02107014f,
+           0.06035208f, 0.08636236f, 4.53164103e-003f, 0.02193363f, 0.02309801f,
+           0.05568166f, -0.02645093f, 0.04448695f, 0.02837519f, 0.08975694f,
+           0.04461516f, 0.08975355f, 0.07514391f, 0.02306982f, 0.10410084f,
+           0.06368385f, 0.05943464f, 4.58420580e-003f, 0.05220337f, 0.06675851f,
+           0.08358569f, 0.06712101f, 0.06559004f, -0.03930482f, -9.15936660e-003f,
+           -0.05897915f, 0.02816453f, 0.05032348f, 0.06780671f, 0.03377650f,
+           -6.09417039e-004f, -0.01795146f, -0.03083684f, -0.01302475f,
+           -0.02972313f, 7.88706727e-003f, -0.03525961f, -2.50397739e-003f,
+           0.05245084f, 0.11791293f, -0.02167498f, 0.05299332f, 0.06640524f,
+           0.05190265f, -8.27316567e-003f, 0.03033127f, 0.05842173f,
+           -4.01050318e-003f, -6.25105947e-003f, 0.05862958f, -0.02465461f,
+           0.05546781f, -0.08228195f, -0.07234028f, 0.04640540f, -0.01308254f,
+           -0.02506191f, 0.03100746f, -0.04665651f, -0.04591486f, 0.02949927f,
+           0.06035462f, 0.02244646f, -0.01698639f, 0.01040041f, 0.01131170f,
+           0.05419579f, -0.02130277f, -0.04321722f, -0.03665198f, 0.01126490f,
+           -0.02606488f, -0.02228328f, -0.02255680f, -0.03427236f,
+           -7.75165204e-003f, -0.06195229f, 8.21638294e-003f, 0.09535975f,
+           -0.03709979f, -0.06942501f, 0.14579427f, -0.05448192f, -0.02055904f,
+           0.05747357f, 0.02781788f, -0.07077577f, -0.05178314f, -0.10429011f,
+           -0.11235505f, 0.07529039f, -0.07559302f, -0.08786739f, 0.02983843f,
+           0.02667585f, 0.01382199f, -0.01797496f, -0.03141199f, -0.02098101f,
+           0.09029204f, 0.04955018f, 0.13718739f, 0.11379953f, 1.80019124e-003f,
+           -0.04577610f, -1.11108483e-003f, -0.09470536f, -0.11596080f,
+           0.04489342f, 0.01784211f, 3.06850672e-003f, 0.10781866f,
+           3.36498418e-003f, -0.10842580f, -0.07436839f, -0.10535070f,
+           -0.01866805f, 0.16057891f, -5.07316366e-003f, -0.04295658f,
+           -5.90488780e-003f, 8.82003549e-003f, -0.01492646f, -0.05029279f,
+           -0.12875880f, 8.78831954e-004f, -0.01297184f, -0.07592774f,
+           -0.02668831f, -6.93787413e-004f, 0.02406698f, -0.01773298f,
+           -0.03855745f, -0.05877856f, 0.03259695f, 0.12826584f, 0.06292590f,
+           -4.10733931e-003f, 0.10996531f, 0.01332991f, 0.02088735f, 0.04037504f,
+           -0.05210760f, 0.07760046f, 0.06399347f, -0.05751930f, -0.10053057f,
+           0.07505023f, -0.02139782f, 0.01796176f, 2.34400877e-003f, -0.04208319f,
+           0.07355055f, 0.05093350f, -0.02996780f, -0.02219072f, 0.03355330f,
+           0.04418742f, -0.05580705f, -0.05037573f, -0.04548179f, 0.01379514f,
+           0.02150671f, -0.02194211f, -0.13682702f, 0.05464972f, 0.01608082f,
+           0.05309116f, 0.04701022f, 1.33690401e-003f, 0.07575664f, 0.09625306f,
+           8.92647635e-003f, -0.02819123f, 0.10866830f, -0.03439325f,
+           -0.07092371f, -0.06004780f, -0.02712298f, -7.07467366e-003f,
+           -0.01637020f, 0.01336790f, -0.10313606f, 0.04906582f, -0.05732445f,
+           -0.02731079f, 0.01042235f, -0.08340668f, 0.03686501f, 0.06108340f,
+           0.01322748f, -0.07809529f, 0.03774724f, -0.03413248f, -0.06096525f,
+           -0.04212124f, -0.07982176f, -1.25973229e-003f, -0.03045501f,
+           -0.01236493f, -0.06312395f, 0.04789570f, -0.04602066f, 0.08576570f,
+           0.02521080f, 0.02988098f, 0.10314583f, 0.07060035f, 0.04520544f,
+           -0.04426654f, 0.13146530f, 0.08386490f, 0.02164590f, -2.12280243e-003f,
+           -0.03686353f, -0.02074944f, -0.03829959f, -0.01530596f, 0.02689708f,
+           0.11867401f, -0.06043470f, -0.02785023f, -0.04775074f, 0.04878745f,
+           0.06350956f, 0.03494788f, 0.01467400f, 1.17890188e-003f, 0.04379614f,
+           2.03681854e-003f, -0.03958609f, -0.01072688f, 6.43705716e-003f,
+           0.02996500f, -0.03418507f, -0.01960307f, -0.01219154f,
+           -4.37000440e-003f, -0.02549453f, 0.02646318f, -0.01632513f,
+           6.46516960e-003f, -0.01929734f, 4.78711911e-003f, 0.04962371f,
+           0.03809111f, 0.07265724f, 0.05758125f, -0.03741554f, 0.01648608f,
+           -8.45285598e-003f, 0.03996826f, -0.08185477f, 0.02638875f,
+           -0.04026615f, -0.02744674f, -0.04071517f, 1.05096330e-003f,
+           -0.04741232f, -0.06733172f, 8.70434940e-003f, -0.02192543f,
+           1.35350740e-003f, -0.03056974f, -0.02975521f, -0.02887780f,
+           -0.01210713f, -0.04828526f, -0.09066251f, -0.09969629f, -0.03665164f,
+           -8.88111943e-004f, -0.06826669f, -0.01866150f, -0.03627640f,
+           -0.01408288f, 0.01874239f, -0.02075835f, 0.09145175f, -0.03547291f,
+           0.05396780f, 0.04198981f, 0.01301925f, -0.03384354f, -0.12201976f,
+           0.06830920f, -0.03715654f, 9.55848210e-003f, 5.05685573e-003f,
+           0.05659294f, 3.90764466e-003f, 0.02808490f, -0.05518097f, -0.03711621f,
+           -0.02835565f, -0.04420464f, -0.01031947f, 0.01883466f,
+           -8.49525444e-003f, -0.09419250f, -0.01269387f, -0.02133371f,
+           -0.10190815f, -0.07844430f, 2.43644323e-003f, -4.09610150e-003f,
+           0.01202551f, -0.06452291f, -0.10593818f, -0.02464746f, -0.02199699f,
+           -0.07401930f, 0.07285886f, 8.87513801e-004f, 9.97662079e-003f,
+           8.46779719e-003f, 0.03730333f, -0.02905126f, 0.03573337f, -0.04393689f,
+           -0.12014472f, 0.03176554f, -2.76015815e-003f, 0.10824566f, 0.05090732f,
+           -3.30179278e-003f, -0.05123822f, 5.04784798e-003f, -0.05664124f,
+           -5.99415926e-003f, -0.05341901f, -0.01221393f, 0.01291318f,
+           9.91760660e-003f, -7.56987557e-003f, -0.06193124f, -2.24549137e-003f,
+           0.01987562f, -0.02018840f, -0.06975540f, -0.06601523f, -0.03349112f,
+           -0.08910118f, -0.03371435f, -0.07406893f, -0.02248047f, -0.06159951f,
+           2.77751544e-003f, -0.05723337f, -0.04792468f, 0.07518548f,
+           2.77279224e-003f, 0.04211938f, 0.03100502f, 0.05278448f, 0.03954679f,
+           -0.03006846f, -0.03851741f, -0.02792403f, -0.02875333f, 0.01531280f,
+           0.02186953f, -0.01989829f, 2.50679464e-003f, -0.10258728f,
+           -0.04785743f, -0.02887216f, 3.85063468e-003f, 0.01112236f,
+           8.29218887e-003f, -0.04822981f, -0.04503597f, -0.03713100f,
+           -0.06988008f, -0.11002295f, -2.69209221e-003f, 1.85383670e-003f,
+           -0.05921049f, -0.06105053f, -0.08458050f, -0.04527602f,
+           8.90329306e-004f, -0.05875023f, -2.68602883e-003f, -0.01591195f,
+           0.03631859f, 0.05493166f, 0.07300330f, 5.53333294e-003f, 0.06400407f,
+           0.01847740f, -5.76280477e-003f, -0.03210877f, 4.25160583e-003f,
+           0.01166520f, -1.44864211e-003f, 0.02253744f, -0.03367080f, 0.06983195f,
+           -4.22323542e-003f, -8.89401045e-003f, -0.07943393f, 0.05199728f,
+           0.06065201f, 0.04133492f, 1.44032843e-003f, -0.09585235f, -0.03964731f,
+           0.04232114f, 0.01750465f, -0.04487902f, -7.59733608e-003f, 0.02011171f,
+           0.04673622f, 0.09011173f, -0.07869188f, -0.04682482f, -0.05080139f,
+           -3.99383716e-003f, -0.05346331f, 0.01085723f, -0.03599333f,
+           -0.07097908f, 0.03551549f, 0.02680387f, 0.03471529f, 0.01790393f,
+           0.05471273f, 9.62048303e-003f, -0.03180215f, 0.05864431f, 0.02330614f,
+           0.01633144f, -0.05616681f, -0.10245429f, -0.08302189f, 0.07291322f,
+           -0.01972590f, -0.02619633f, -0.02485327f, -0.04627592f,
+           1.48853404e-003f, 0.05514185f, -0.01270860f, -0.01948900f, 0.06373586f,
+           0.05002292f, -0.03009798f, 8.76216311e-003f, -0.02474238f,
+           -0.05504891f, 1.74034527e-003f, -0.03333667f, 0.01524987f, 0.11663762f,
+           -1.32344989e-003f, -0.06608453f, 0.05687166f, -6.89525274e-004f,
+           -0.04402352f, 0.09450210f, -0.04222684f, -0.05360983f, 0.01779531f,
+           0.02561388f, -0.11075410f, -8.77790991e-003f, -0.01099504f,
+           -0.10380266f, 0.03103457f, -0.02105741f, -0.07371717f, 0.05146710f,
+           0.10581432f, -0.08617968f, -0.02892107f, 0.01092199f, 0.14551543f,
+           -2.24320893e-003f, -0.05818033f, -0.07390742f, 0.05701261f,
+           0.12937020f, -0.04986651f, 0.10182415f, 0.05028650f, 0.12515625f,
+           0.09175041f, 0.06404983f, 0.01523394f, 0.09460562f, 0.06106631f,
+           -0.14266998f, -0.02926703f, 0.02762171f, 0.02164151f,
+           -9.58488265e-004f, -0.04231362f, -0.09866509f, 0.04322244f,
+           0.05872034f, -0.04838847f, 0.06319253f, 0.02443798f, -0.03606876f,
+           9.38737206e-003f, 0.04289991f, -0.01027411f, 0.08156885f, 0.08751175f,
+           -0.13191354f, 8.16054735e-003f, -0.01452161f, 0.02952677f, 0.03615945f,
+           -2.09128903e-003f, 0.02246693f, 0.09623287f, 0.09412123f, -0.02924758f,
+           -0.07815186f, -0.02203079f, -2.02566991e-003f, 0.01094733f,
+           -0.01442332f, 0.02838561f, 0.11882371f, 7.28798332e-003f, -0.10345965f,
+           0.07561217f, -0.02049661f, 4.44177445e-003f, 0.01609347f, -0.04893158f,
+           -0.08758243f, -7.67420698e-003f, 0.08862378f, 0.06098121f, 0.06565887f,
+           7.32981879e-003f, 0.03558407f, -0.03874352f, -0.02490055f,
+           -0.06771075f, 0.09939223f, -0.01066077f, 0.01382995f, -0.07289080f,
+           7.47184316e-003f, 0.10621431f, -0.02878659f, 0.02383525f, -0.03274646f,
+           0.02137008f, 0.03837290f, 0.02450992f, -0.04296818f, -0.02895143f,
+           0.05327370f, 0.01499020f, 0.04998732f, 0.12938657f, 0.09391870f,
+           0.04292390f, -0.03359194f, -0.06809492f, 0.01125796f, 0.17290455f,
+           -0.03430733f, -0.06255233f, -0.01813114f, 0.11726857f, -0.06127599f,
+           -0.08677909f, -0.03429872f, 0.04684938f, 0.08161420f, 0.03538774f,
+           0.01833884f, 0.11321855f, 0.03261845f, -0.04826299f, 0.01752407f,
+           -0.01796414f, -0.10464549f, -3.30041884e-003f, 2.29343961e-004f,
+           0.01457292f, -0.02132982f, -0.02602923f, -9.87351313e-003f,
+           0.04273872f, -0.02103316f, -0.07994065f, 0.02614958f, -0.02111666f,
+           -0.06964913f, -0.13453490f, -0.06861878f, -6.09341264e-003f,
+           0.08251446f, 0.15612499f, 2.46531400e-003f, 8.88424646e-003f,
+           -0.04152999f, 0.02054853f, 0.05277953f, -0.03087788f, 0.02817579f,
+           0.13939077f, 0.07641046f, -0.03627627f, -0.03015098f, -0.04041540f,
+           -0.01360690f, -0.06227205f, -0.02738223f, 0.13577610f, 0.15235767f,
+           -0.05392922f, -0.11175954f, 0.02157129f, 0.01146481f, -0.05264937f,
+           -0.06595174f, -0.02749175f, 0.11812254f, 0.17404149f, -0.06137035f,
+           -0.11003478f, -0.01351621f, -0.01745916f, -0.08577441f, -0.04469909f,
+           -0.06106115f, 0.10559758f, 0.20806813f, -0.09174948f, 7.09621934e-004f,
+           0.03579374f, 0.07215115f, 0.02221742f, 0.01827742f, -7.90785067e-003f,
+           0.01489554f, 0.14519960f, -0.06425831f, 0.02990399f, -1.80181325e-003f,
+           -0.01401528f, -0.04171134f, -3.70530109e-003f, -0.09090481f,
+           0.09520713f, 0.08845516f, -0.02651753f, -0.03016730f, 0.02562448f,
+           0.03563816f, -0.03817881f, 0.01433385f, 0.02256983f, 0.02872120f,
+           0.01001934f, -0.06332260f, 0.04338406f, 0.07001807f, -0.04705722f,
+           -0.07318907f, 0.02630457f, 0.03106382f, 0.06648342f, 0.10913180f,
+           -0.01630815f, 0.02910308f, 0.02895109f, 0.08040254f, 0.06969310f,
+           0.06797734f, 6.08639978e-003f, 4.16588830e-003f, 0.08926726f,
+           -0.03123648f, 0.02700146f, 0.01168734f, -0.01631594f, 4.61015804e-003f,
+           8.51359498e-003f, -0.03544224f, 0.03571994f, 4.29766066e-003f,
+           -0.01970077f, -8.79793242e-003f, 0.09607988f, 0.01544222f,
+           -0.03923707f, 0.07308586f, 0.06061262f, 1.31683104e-004f,
+           -7.98222050e-003f, 0.02399261f, -0.06084389f, -0.02743429f,
+           -0.05475523f, -0.04131311f, 0.03559756f, 0.03055342f, 0.02981433f,
+           0.14860515f, 0.01766787f, 0.02945257f, 0.04898238f, 0.01026922f,
+           0.02811658f, 0.08267091f, 0.02732154f, -0.01237693f, 0.11760156f,
+           0.03802063f, -0.03309754f, 5.24957618e-003f, -0.02460510f, 0.02691451f,
+           0.05399988f, -0.10133506f, 0.06385437f, -0.01818005f, 0.02259503f,
+           0.03573135f, 0.01042848f, -0.04153402f, -0.04043029f, 0.01643575f,
+           0.08326677f, 4.61383024e-004f, -0.05308095f, -0.08536223f,
+           -1.61011645e-003f, -0.02163720f, -0.01783352f, 0.03859637f,
+           0.08498885f, -0.01725216f, 0.08625131f, 0.10995087f, 0.09177644f,
+           0.08498347f, 0.07646490f, 0.05580502f, 0.02693516f, 0.09996913f,
+           0.09070327f, 0.06667200f, 0.05873008f, -0.02247842f, 0.07772321f,
+           0.12408436f, 0.12629253f, -8.41997913e-004f, 0.01477783f, 0.09165990f,
+           -2.98401713e-003f, -0.06466447f, -0.07057302f, 2.09516948e-004f,
+           0.02210209f, -0.02158809f, -0.08602506f, -0.02284836f,
+           4.01876355e-003f, 9.56660323e-003f, -0.02073978f, -0.04635138f,
+           -7.59423291e-003f, -0.01377393f, -0.04559359f, -0.13284740f,
+           -0.08671406f, -0.03654395f, 0.01142869f, 0.03287891f, -0.04392983f,
+           0.06142959f, 0.17710890f, 0.10385257f, 0.01329137f, 0.10067633f,
+           0.12450829f, -0.04476709f, 0.09049144f, 0.04589312f, 0.11167907f,
+           0.08587538f, 0.04767583f, 1.67188141e-003f, 0.02359802f, -0.03808852f,
+           0.03126272f, -0.01919029f, -0.05698918f, -0.02365112f, -0.06519032f,
+           -0.05599358f, -0.07097308f, -0.03301812f, -0.04719102f, -0.02566297f,
+           0.01324074f, -0.09230672f, -0.05518232f, -0.04712864f, -0.03380903f,
+           -0.06719479f, 0.01183908f, -0.09326738f, 0.01642865f, 0.03789867f,
+           -6.61567831e-003f, 0.07796386f, 0.07246574f, 0.04706347f, -0.02523437f,
+           -0.01696830f, -0.08068866f, 0.06030888f, 0.10527060f, -0.06611756f,
+           0.02977346f, 0.02621830f, 0.01913855f, -0.08479366f, -0.06322418f,
+           -0.13570616f, -0.07644490f, 9.31900274e-003f, -0.08095149f,
+           -0.10197903f, -0.05204025f, 0.01413151f, -0.07800411f, -0.01885122f,
+           -0.07509381f, -0.10136326f, -0.05212355f, -0.09944065f,
+           -1.33606605e-003f, -0.06342617f, -0.04178550f, -0.12373723f,
+           -0.02832736f, -0.06057501f, 0.05830070f, 0.07604282f, -0.06462587f,
+           8.02447461e-003f, 0.11580125f, 0.12332212f, 0.01978462f,
+           -2.72378162e-003f, 0.05850752f, -0.04674481f, 0.05148062f,
+           -2.62542837e-003f, 0.11253355f, 0.09893716f, 0.09785093f, -0.04659257f,
+           -0.01102429f, -0.07002308f, 0.03088913f, -0.02565549f, -0.07671449f,
+           3.17443861e-003f, -0.10783514f, -0.02314270f, -0.11089555f,
+           -0.01024768f, 0.03116021f, -0.04964825f, 0.02281825f, 5.50005678e-003f,
+           -0.08427856f, -0.14685495f, -0.07719755f, -0.13342668f, -0.04525511f,
+           -0.09914210f, 0.02588859f, 0.03469279f, 0.04664020f, 0.11688190f,
+           0.09647275f, 0.10857815f, -0.01448726f, 0.04299758f, -0.06763151f,
+           1.33257592e-003f, 0.14331576f, 0.07574340f, 0.09166205f, 0.05674926f,
+           0.11325553f, -0.01106494f, 0.02062161f, -0.11484840f, -0.07492137f,
+           -0.02864293f, -0.01275638f, -0.06946032f, -0.10101652f, -0.04113498f,
+           -0.02214783f, -0.01273942f, -0.07480393f, -0.10556041f, -0.07622112f,
+           -0.09988393f, -0.11453961f, -0.12073903f, -0.09412795f, -0.07146588f,
+           -0.04054537f, -0.06127083f, 0.04221122f, 0.07688113f, 0.04099256f,
+           0.12663734f, 0.14683802f, 0.21761774f, 0.12525328f, 0.18431792f,
+           -1.66402373e-003f, 2.37777247e-003f, 0.01445475f, 0.03509416f,
+           0.02654697f, 0.01716739f, 0.05374011f, 0.02944174f, 0.11323927f,
+           -0.01485456f, -0.01611330f, -1.85554172e-003f, -0.01708549f,
+           -0.05435753f, -0.05302101f, 0.05260378f, -0.03582945f,
+           -3.42867890e-004f, 1.36076682e-003f, -0.04436073f, -0.04228432f,
+           0.03281291f, -0.05480836f, -0.10197772f, -0.07206279f, -0.10741059f,
+           -0.02366946f, 0.10278475f, -2.74783419e-003f, -0.03242477f,
+           0.02308955f, 0.02835869f, 0.10348799f, 0.19580358f, 0.10252027f,
+           0.08039929f, 0.05525554f, -0.13250865f, -0.14395352f, 3.13586881e-003f,
+           -0.03387071f, 8.94669443e-003f, 0.05406157f, -4.97324532e-003f,
+           -0.01189114f, 2.82919413e-004f, -0.03901557f, -0.04898705f,
+           0.02164520f, -0.01382906f, -0.01850416f, 0.01869347f, -0.02450060f,
+           0.02291678f, 0.08196463f, 0.03309153f, -0.10629974f, 0.02473924f,
+           0.05344394f, -0.02404823f, -0.03243643f, -5.55244600e-003f,
+           -0.08009996f, 0.02811539f, 0.04235742f, 0.01859004f, 0.04902123f,
+           -0.01438252f, -0.01526853f, 0.02044195f, -0.05008660f, 0.04244113f,
+           0.07611816f, 0.04950470f, -0.06020549f, -4.26026015e-003f, 0.13133512f,
+           -0.01438738f, -0.01958807f, -0.04044152f, -0.12425045f,
+           2.84353318e-003f, -0.05042776f, -0.09121484f, 7.34345755e-003f,
+           0.09388847f, 0.11800314f, 4.72295098e-003f, 4.44378285e-003f,
+           -0.07984917f, -0.03613737f, 0.04490915f, -0.02246483f, 0.04681071f,
+           0.05240871f, 0.02157206f, -0.04603431f, -0.01197929f, -0.02748779f,
+           0.13621049f, 0.08812155f, -0.07802048f, 4.86458559e-003f, -0.01598836f,
+           0.01024450f, -0.03463517f, -0.02304239f, -0.08692665f, 0.06655128f,
+           0.05785803f, -0.12640759f, 0.02307472f, 0.07337402f, 0.07525434f,
+           0.04943763f, -0.02241034f, -0.09978238f, 0.14487994f, -0.06570521f,
+           -0.07855482f, 0.02830222f, -5.29603509e-004f, -0.04669895f,
+           -0.11822784f, -0.12246452f, -0.15365660f, -0.02969127f, 0.08078201f,
+           0.13512598f, 0.11505685f, 0.04740673f, 0.01376022f, -0.05852978f,
+           -0.01537809f, -0.05541119f, 0.02491065f, -0.02870786f, 0.02760978f,
+           0.23836176f, 0.22347429f, 0.10306466f, -0.06919070f, -0.10132039f,
+           -0.20198342f, -0.05040560f, 0.27163076f, 0.36987007f, 0.34540465f,
+           0.29095781f, 0.05649706f, 0.04125737f, 0.07505883f, -0.02737836f,
+           -8.43431335e-003f, 0.07368195f, 0.01653876f, -0.09402955f,
+           -0.09574359f, 0.01474337f, -0.07128561f, -0.03460737f, 0.11438941f,
+           0.13752601f, -0.06385452f, -0.06310338f, 8.19548313e-003f, 0.11622470f,
+           5.05133113e-003f, -0.07602754f, 0.06695660f, 0.25723928f, 0.09037900f,
+           0.28826267f, 0.13165380f, -0.05312614f, -0.02137198f, -0.03442232f,
+           -0.06255679f, 0.03899667f, 0.18391028f, 0.26016650f, 0.03374462f,
+           0.01860465f, 0.19077586f, 0.18160543f, 3.43634398e-003f, -0.03036782f,
+           0.19683038f, 0.35378191f, 0.24968483f, -0.03222649f, 0.28972381f,
+           0.43091634f, 0.30778357f, 0.02335266f, -0.09877399f, -6.85245218e-003f,
+           0.08945240f, -0.08150686f, 0.02792493f, 0.24806842f, 0.17338486f,
+           0.06231801f, -0.10432383f, -0.16653322f, -0.13197899f, -0.08531576f,
+           -0.19271527f, -0.13536365f, 0.22240199f, 0.39219588f, 0.26597717f,
+           -0.01231649f, 0.01016179f, 0.13379875f, 0.12018334f, -0.04852953f,
+           -0.07915270f, 0.07036012f, 3.87723115e-003f, -0.06126805f,
+           -0.15015170f, -0.11406515f, -0.08556531f, -0.07429333f, -0.16115491f,
+           0.13214062f, 0.25691369f, 0.05697750f, 0.06861912f, -6.02903729e-003f,
+           -7.94562511e-003f, 0.04799571f, 0.06695165f, -0.01926842f, 0.06206308f,
+           0.13450983f, -0.06381495f, -2.98370165e-003f, -0.03482971f,
+           7.53991678e-003f, 0.03895611f, 0.11464261f, 0.01669971f,
+           8.27818643e-003f, -7.49160210e-003f, -0.11712562f, -0.10650621f,
+           -0.10353880f, -0.04994106f, -7.65618810e-004f, 0.03023767f,
+           -0.04759270f, -0.07302686f, -0.05825012f, -0.13156348f, -0.10639747f,
+           -0.19393684f, -0.09973683f, -0.07918908f, 4.63177625e-004f,
+           -6.61382044e-004f, 0.15853868f, 0.08561199f, -0.07660093f,
+           -0.08015265f, -0.06164073f, 0.01882577f, -7.29908410e-004f,
+           0.06840892f, 0.03843764f, 0.20274927f, 0.22028814f, -5.26101235e-003f,
+           0.01452435f, -0.06331623f, 0.02865064f, 0.05673740f, 0.12171564f,
+           0.03837196f, 0.03555467f, -0.02662914f, -0.10280123f, -0.06526285f,
+           -0.11066351f, -0.08988424f, -0.10103678f, 8.10526591e-003f,
+           5.95238712e-003f, 0.02617721f, -0.01705742f, -0.10897956f,
+           -0.08004991f, -0.11271993f, -0.06185647f, -0.06103712f, 0.01597041f,
+           -0.05923606f, 0.09410726f, 0.22858568f, 0.03263380f, 0.06772990f,
+           -0.09003516f, 0.01017870f, 0.01931688f, 0.08628357f, -0.01430009f,
+           0.10954945f, 0.16612452f, -0.02434544f, -0.03310068f, -0.04236627f,
+           0.01212392f, -6.15046406e-003f, 0.06954194f, 0.03015283f, 0.01787957f,
+           0.02781667f, -0.05561153f, -8.96244217e-003f, -0.04971489f,
+           0.07510284f, 0.01775282f, 0.05889897f, -0.07981427f, 0.03647643f,
+           -3.73833324e-003f, -0.08894575f, -0.06429435f, -0.08068276f,
+           0.03567704f, -0.07131936f, -7.21910037e-003f, -0.09566668f,
+           0.17886090f, 0.14911725f, 0.02070032f, -0.05017120f, -0.04992622f,
+           0.01570143f, -0.09906903f, 0.06456193f, 0.15329507f, 0.18820767f,
+           0.11689861f, -0.01178513f, -0.02225163f, -0.01905318f, 0.10271224f,
+           -7.27029052e-003f, 0.11664233f, 0.14796902f, 0.07771893f, 0.02400013f,
+           -0.05361797f, -0.01972888f, 0.01376177f, 0.06740040f, -0.06525395f,
+           0.05726178f, -0.02404981f, -0.14018567f, -0.02074987f, -0.04621970f,
+           -0.04688627f, -0.01842059f, 0.07722727f, -0.04852883f, 0.01529004f,
+           -0.19639495f, 0.10817073f, 0.03795860f, -0.09435206f, -0.07984378f,
+           -0.03383440f, 0.11081333f, 0.02237366f, 0.12703256f, 0.21613893f,
+           0.02918790f, 4.66472283e-003f, -0.10274266f, -0.04854131f,
+           -3.46305710e-003f, 0.08652268f, 0.02251546f, 0.09636052f, 0.17180754f,
+           -0.09272388f, 4.59174305e-004f, -0.11723048f, -0.12210111f,
+           -0.15547538f, 0.07218186f, -0.05297846f, 0.03779940f, 0.05150875f,
+           -0.03802310f, 0.03870645f, -0.15250699f, -0.08696499f, -0.02021560f,
+           0.04118926f, -0.15177974f, 0.01577647f, 0.10249301f, 7.50041893e-003f,
+           0.01721806f, -0.06828983f, -0.02397596f, -0.06598977f, -0.04317593f,
+           -0.08064980f, 6.66632550e-003f, 0.03333484f, 0.07093620f, 0.08231064f,
+           -0.06577903f, -0.06698844f, -0.06984019f, -0.06508023f, -0.14145090f,
+           -0.02393239f, 0.06485303f, 8.83263443e-003f, 0.09251080f, -0.07557579f,
+           -0.05067699f, -0.09798748f, -0.06703258f, -0.14056294f, 0.03245994f,
+           0.12554143f, 0.01761621f, 0.12980327f, -0.04081950f, -0.11906909f,
+           -0.14813015f, -0.08376863f, -0.12200681f, 0.04988137f, 0.05424247f,
+           -3.90952639e-003f, 0.03255733f, -0.12717837f, -0.07461493f,
+           -0.05703964f, -0.01736189f, -0.08026433f, -0.05433894f, -0.01719359f,
+           0.02886275f, 0.01772653f, -0.09163518f, 3.57789593e-003f, -0.10129993f,
+           -0.02653764f, -0.08131415f, -0.03847986f, -7.62157550e-004f,
+           0.06486648f, 0.19675669f, -0.04919156f, -0.07059129f, -0.04857785f,
+           -0.01042383f, -0.08328653f, 0.03660302f, -0.03696846f, 0.04969259f,
+           0.08241162f, -0.12514858f, -0.06122676f, -0.03750202f,
+           6.52989605e-003f, -0.10247213f, 0.02568346f, 4.51781414e-003f,
+           -0.03734229f, -0.01131264f, -0.05412074f, 8.89345480e-004f,
+           -0.12388977f, -0.05959237f, -0.12418608f, -0.06151643f, -0.07310260f,
+           0.02441575f, 0.07023528f, -0.07548289f, -7.57147965e-004f,
+           -0.09061348f, -0.08112976f, -0.06920306f, 9.54394229e-003f,
+           -0.01219902f, 1.21273217e-003f, -8.88989680e-003f, -0.08309301f,
+           -0.04552661f, -0.10739882f, -0.05691034f, -0.13928030f, 0.09027749f,
+           0.15123098f, 0.03175976f, 0.17763577f, 3.29913251e-004f, 0.05151888f,
+           -0.09844074f, -0.09475287f, -0.08571247f, 0.16241577f, 0.19336018f,
+           8.57454538e-003f, 0.11474732f, -0.01493934f, 0.03352379f, -0.08966240f,
+           -0.02322310f, 0.02663568f, 0.05448750f, -0.03536883f, -0.07210463f,
+           -0.06807277f, -0.03121621f, -0.05932408f, -0.17282860f, -0.15873498f,
+           -0.04956378f, 0.01603377f, -0.12385946f, 0.13878587f, 0.21468069f,
+           0.13510075f, 0.20992437f, 0.08845878f, 0.08104013f, 0.03754176f,
+           0.12173114f, 0.11103114f, 0.10643122f, 0.13941477f, 0.11640384f,
+           0.14786847f, 0.01218238f, 0.01160753f, 0.03547940f, 0.08794311f,
+           -0.01695384f, -0.07692261f, -0.08236158f, 6.79194089e-003f,
+           -0.02458403f, 0.13022894f, 0.10953187f, 0.09857773f, 0.04735930f,
+           -0.04353498f, -0.15173385f, -0.17904443f, -0.10450364f, -0.13418166f,
+           -0.06633098f, -0.03170381f, -0.06839000f, -0.11350126f, -0.06983913f,
+           0.19083543f, 0.17604128f, 0.07730632f, 0.10022651f, 0.36428109f,
+           0.28291923f, 0.12688625f, 0.15942036f, 0.14064661f, -0.11201853f,
+           -0.13969108f, -0.09088077f, -0.14107047f, 0.05117374f,
+           -2.63348082e-003f, -0.10794610f, -0.09715455f, -0.05284977f,
+           0.01565668f, 0.05031200f, 0.07021113f, -0.02963028f, 0.01766960f,
+           0.08333644f, -0.03211382f, 4.90096770e-003f, 0.05186674f, -0.05045737f,
+           -0.09624767f, -0.02525997f, 0.06916669f, 0.01213916f, 0.05333899f,
+           -0.03443280f, -0.10055527f, -0.06291115f, 5.42851724e-003f,
+           -6.30360236e-003f, 0.02270257f, -0.01769792f, 0.03273688f, 0.07746078f,
+           7.77099328e-003f, 0.05041346f, 0.01648103f, -0.02321534f, -0.09930186f,
+           -0.02293853f, 0.02034990f, -0.08324204f, 0.08510064f, -0.03732836f,
+           -0.06465405f, -0.06086946f, 0.13680504f, -0.11469388f, -0.03896406f,
+           -0.07142810f, 2.67581246e-003f, -0.03639632f, -0.09849060f,
+           -0.11014334f, 0.17489147f, 0.17610909f, -0.16091567f, -0.07248894f,
+           0.01567141f, 0.23742996f, 0.07552249f, -0.06270349f, -0.07303379f,
+           0.25442186f, 0.16903116f, -0.08168741f, -0.05913896f, -0.03954096f,
+           6.81776879e-003f, -0.05615319f, -0.07303037f, -0.12176382f,
+           0.12385108f, 0.22084464f, -0.05543206f, -0.03310431f, 0.05731593f,
+           0.19481890f, 0.04016430f, -0.06480758f, -0.12353460f, 0.18733442f,
+           -0.09631214f, -0.11192076f, 0.12404587f, 0.15671748f, 0.19256128f,
+           0.10895617f, 0.03391477f, -0.13032004f, -0.05626907f, -0.09025607f,
+           0.23485197f, 0.27812332f, 0.26725492f, 0.07255980f, 0.16565137f,
+           0.22388470f, 0.07441066f, -0.21003133f, -0.08075339f, -0.15031935f,
+           0.07023834f, 0.10872041f, 0.18156518f, 0.20037253f, 0.13571967f,
+           -0.11915682f, -0.11131983f, -0.18878011f, 0.06074620f, 0.20578890f,
+           0.12413109f, 0.03930207f, 0.29176015f, 0.29502738f, 0.27856228f,
+           -0.01803601f, 0.16646385f, 0.19268319f, 0.01900682f, 0.06026287f,
+           2.35868432e-003f, 0.01558199f, 0.02707230f, 0.11383014f, 0.12103992f,
+           0.03907350f, 0.04637353f, 0.09020995f, 0.11919726f, -3.63007211e-003f,
+           0.02220155f, 0.10336831f, 0.17351882f, 0.12259731f, 0.18983354f,
+           0.15736865f, 0.01160725f, -0.01690723f, -9.69582412e-004f, 0.07213813f,
+           0.01161613f, 0.17864859f, 0.24486147f, 0.18208991f, 0.20177495f,
+           0.05972528f, -8.93934630e-003f, -0.02316955f, 0.14436610f, 0.14114498f,
+           0.05520950f, 0.06353590f, -0.19124921f, 0.10174713f, 0.29414919f,
+           0.26448128f, 0.09344960f, 0.15284036f, 0.19797507f, 0.11369792f,
+           -0.12722753f, -0.21396367f, -0.02008235f, -0.06566695f, -0.01662150f,
+           -0.03937003f, 0.04778343f, 0.05017274f, -0.02299062f, -0.20208496f,
+           -0.06395898f, 0.13721776f, 0.22544557f, 0.14888357f, 0.08687132f,
+           0.27088094f, 0.32206613f, 0.09782200f, -0.18523243f, -0.17232181f,
+           -0.01041531f, 0.04008654f, 0.04199702f, -0.08081299f, -0.03755421f,
+           -0.04809646f, -0.05222081f, -0.21709201f, -0.06622940f, 0.02945281f,
+           -0.04600435f, -0.05256077f, -0.08432942f, 0.02848100f, 0.03490564f,
+           8.28621630e-003f, -0.11051246f, -0.11210597f, -0.01998289f,
+           -0.05369405f, -0.08869293f, -0.18799506f, -0.05436598f, -0.05011634f,
+           -0.05419716f, -0.06151857f, -0.10827805f, 0.04346735f, 0.04016083f,
+           0.01520820f, -0.12173316f, -0.04880285f, -0.01101406f, 0.03250847f,
+           -0.06009551f, -0.03082932f, -0.02295134f, -0.06856834f, -0.08775249f,
+           -0.23793389f, -0.09174541f, -0.05538322f, -0.04321031f, -0.11874759f,
+           -0.04221844f, -0.06070468f, 0.01194489f, 0.02608565f, -0.03892140f,
+           -0.01643151f, -0.02602034f, -0.01305472f, 0.03920100f, -0.06514261f,
+           0.01126918f, -6.27710763e-003f, -0.02720047f, -0.11133634f,
+           0.03300330f, 0.02398472f, 0.04079665f, -0.10564448f, 0.05966159f,
+           0.01195221f, -0.03179441f, -0.01692590f, -0.06177841f, 0.01841576f,
+           -5.51078189e-003f, -0.06821765f, -0.03191888f, -0.09545476f,
+           0.03030550f, -0.04896152f, -0.02914624f, -0.13283344f, -0.04783419f,
+           6.07836898e-003f, -0.01449538f, -0.13358212f, -0.09687774f,
+           -0.02813793f, 0.01213498f, 0.06650011f, -0.02039067f, 0.13356198f,
+           0.05986415f, -9.12760664e-003f, -0.18780160f, -0.11992817f,
+           -0.06342237f, 0.01229534f, 0.07143231f, 0.10713009f, 0.11085765f,
+           0.06569190f, -0.02956399f, -0.16288325f, -0.13993549f, -0.01292515f,
+           0.03833013f, 0.09130384f, -0.05086257f, 0.05617329f, -0.03896667f,
+           -0.06282311f, -0.11490010f, -0.14264110f, -0.04530499f, 0.01598189f,
+           0.09167797f, 0.08663294f, 0.04885277f, -0.05741219f, -0.07565769f,
+           -0.17136464f, -0.02619422f, -0.02477579f, 0.02679587f, 0.11621952f,
+           0.08788391f, 0.15520640f, 0.04709549f, 0.04504483f, -0.10214074f,
+           -0.12293372f, -0.04820546f, -0.05484834f, 0.05473754f, 0.07346445f,
+           0.05577277f, -0.08209965f, 0.03462975f, -0.20962234f, -0.09324598f,
+           3.79481679e-003f, 0.03617633f, 0.16742408f, 0.07058107f, 0.10204960f,
+           -0.06795346f, 3.22807301e-003f, -0.12589309f, -0.17496960f,
+           0.02078314f, -0.07694324f, 0.12184640f, 0.08997164f, 0.04793497f,
+           -0.11383379f, -0.08046359f, -0.25716835f, -0.08080962f,
+           6.80711539e-003f, -0.02930280f, -3.04938294e-003f, -0.11106286f,
+           -0.04628860f, -0.07821649f, 7.70127494e-003f, -0.10247706f,
+           1.21042714e-003f, 0.20573859f, -0.03241005f, 8.42972286e-003f,
+           0.01946464f, -0.01197973f, -0.14579976f, 0.04233614f,
+           -4.14096704e-003f, -0.06866436f, -0.02431862f, -0.13529138f,
+           1.25891645e-003f, -0.11425111f, -0.04303651f, -0.01694815f,
+           0.05720210f, -0.16040207f, 0.02772896f, 0.05498345f, -0.15010567f,
+           0.01450866f, 0.02350303f, -0.04301004f, -0.04951802f, 0.21702233f,
+           -0.03159155f, -0.01963303f, 0.18232647f, -0.03263875f,
+           -2.88476888e-003f, 0.01587562f, -1.94303901e-003f, -0.07789494f,
+           0.04674156f, -6.25576358e-003f, 0.08925962f, 0.21353747f, 0.01254677f,
+           -0.06999976f, -0.05931328f, -0.01884327f, -0.04306272f, 0.11794136f,
+           0.03842728f, -0.03907030f, 0.05636114f, -0.09766009f, -0.02104000f,
+           8.72711372e-003f, -0.02736877f, -0.05112274f, 0.16996814f, 0.02955785f,
+           0.02094014f, 0.08414304f, -0.03335762f, -0.03617457f, -0.05808248f,
+           -0.08872101f, 0.02927705f, 0.27077839f, 0.06075108f, 0.07478261f,
+           0.15282831f, -0.03908454f, -0.05101782f, -9.51998029e-003f,
+           -0.03272416f, -0.08735625f, 0.07633440f, -0.07185312f, 0.13841286f,
+           0.07812646f, -0.12901451f, -0.05488589f, -0.05644578f, -0.03290703f,
+           -0.11184757f, 0.03751570f, -0.05978153f, -0.09155276f, 0.05657315f,
+           -0.04328186f, -0.03047933f, -0.01413135f, -0.10181040f, -0.01384013f,
+           0.20132534f, -0.01536873f, -0.07641169f, 0.05906778f, -0.07833145f,
+           -0.01523801f, -0.07502609f, -0.09461885f, -0.15013233f, 0.16050665f,
+           0.09021381f, 0.08473236f, 0.03386267f, -0.09147339f, -0.09170618f,
+           -0.08498498f, -0.05119187f, -0.10431040f, 0.01041618f, -0.03064913f,
+           0.09340212f, 0.06448522f, -0.03881054f, -0.04985436f, -0.14794017f,
+           -0.05200112f, -0.02144495f, 0.04000821f, 0.12420804f, -0.01851651f,
+           -0.04116732f, -0.11951703f, -0.04879033f, -0.08722515f, -0.08454733f,
+           -0.10549165f, 0.11251976f, 0.10766345f, 0.19201984f, 0.06128913f,
+           -0.02734615f, -0.08834923f, -0.16999826f, -0.03548348f,
+           -5.36092324e-003f, 0.08297954f, 0.07226378f, 0.04194529f, 0.04668673f,
+           8.73902347e-003f, 0.06980139f, 0.05652480f, 0.05879445f, 0.02477076f,
+           0.02451423f, 0.12433673f, 0.05600227f, 0.06886370f, 0.03863076f,
+           0.07459056f, 0.02264139f, 0.01495469f, 0.06344220f, 0.06945208f,
+           0.02931899f, 0.11719371f, 0.04527427f, 0.03248192f, 2.08271481e-003f,
+           0.02044626f, 0.11403449f, 0.04303892f, 0.06444661f, 0.04959024f,
+           0.08174094f, 0.09240247f, 0.04894639f, 0.02252937f, -0.01652530f,
+           0.07587013f, 0.06064249f, 0.13954395f, 0.02772832f, 0.07093039f,
+           0.08501238f, 0.01701301f, 0.09055722f, 0.33421436f, 0.20163782f,
+           0.09821030f, 0.07951369f, 0.08695120f, -0.12757730f, -0.13865978f,
+           -0.06610068f, -0.10985506f, 0.03406816f, -0.01116336f, -0.07281768f,
+           -0.13525715f, -0.12844718f, 0.08956250f, 0.09171610f, 0.10092317f,
+           0.23385370f, 0.34489515f, 0.09901748f, 0.02002922f, 0.12335990f,
+           0.07606190f, -0.14899330f, -0.15634622f, -0.06494618f, -0.01760547f,
+           0.03404277f, -0.13208845f, -0.12101169f, -0.18294574f, -0.16560709f,
+           0.02183887f, -0.02752613f, 0.01813638f, 0.02000757f, 0.01319924f,
+           0.08030242f, 0.01220535f, 2.98233377e-003f, -0.01307070f, 0.05970297f,
+           -0.05345284f, -0.03381982f, -9.87543724e-003f, -0.06869387f,
+           0.03956730f, -0.03108176f, -0.05732809f, 0.02172386f, 0.04159765f,
+           2.62783933e-003f, 0.04813229f, 0.09358983f, -8.18389002e-003f,
+           0.01724574f, -0.02547474f, -0.04967288f, -0.02390376f, 0.06640504f,
+           -0.06306566f, 0.01137518f, 0.05589378f, -0.08237787f, 0.02455001f,
+           -0.03059422f, -0.08953978f, 0.06851497f, 0.07190268f, -0.07610799f,
+           7.87237938e-003f, -7.85830803e-003f, 0.06006952f, -0.01126728f,
+           -2.85743061e-003f, -0.04772895f, 0.01884944f, 0.15005857f,
+           -0.06268821f, -0.01989072f, 0.01138399f, 0.08760451f, 0.03879007f,
+           -9.66926850e-003f, -0.08012961f, 0.06414555f, -0.01362950f,
+           -0.09135523f, 0.01755159f, 0.04459474f, 0.09650917f, 0.05219948f,
+           -2.19440833e-003f, -0.07037939f, -0.01599054f, 0.13103317f,
+           -0.02492603f, -0.01032540f, -0.02903307f, 0.04489160f, 0.05148086f,
+           0.01858173f, -0.02919228f, 0.08299296f, -0.04590359f, -0.15745632f,
+           -0.09068198f, -0.02972453f, 0.12985018f, 0.22320485f, 0.24261914f,
+           0.03642650f, -0.05506422f, 2.67413049e-003f, -0.03834032f, 0.06449424f,
+           0.03834866f, 0.03816991f, 0.25039271f, 0.34212017f, 0.32433882f,
+           0.18824573f, -0.08599839f, -0.17599408f, -0.15317015f, -0.09913155f,
+           -0.02856072f, -0.05304699f, -1.06437842e-003f, -0.06641813f,
+           -0.07509298f, 0.01463361f, -0.07551918f, -0.04510373f,
+           -8.44620075e-003f, 0.01772176f, 0.04068235f, 0.20295307f, 0.15719447f,
+           0.05712103f, 0.26296997f, 0.14657754f, 0.01547317f, -0.05052776f,
+           -0.03881342f, -0.01437883f, -0.04930177f, 0.11719568f, 0.24098417f,
+           0.26468599f, 0.31698579f, 0.10103608f, -0.01096375f, -0.01367013f,
+           0.17104232f, 0.20065314f, 2.67622480e-003f, -0.01190034f, 0.18301608f,
+           0.09459770f, -0.06357619f, -0.06473801f, 0.01377906f, -0.10032775f,
+           -0.06388740f, 3.80393048e-003f, 0.06206078f, 0.10349120f, 0.26804337f,
+           8.17918684e-003f, -0.02314351f, 9.34422202e-003f, 0.09198381f,
+           0.03681326f, -8.77339672e-003f, -0.09662418f, -0.02715708f,
+           0.13503517f, 0.08962728f, -6.57071499e-003f, -0.03201199f, 0.28510824f,
+           0.32095715f, 0.18512695f, -0.14230858f, -0.14048551f, -0.07181299f,
+           -0.08575408f, -0.08661680f, -0.17416079f, 7.54326640e-004f,
+           0.05601677f, 0.13585392f, -0.04960437f, -0.07708392f, 0.10676333f,
+           -0.04407546f, -0.07209078f, 0.03663663f, 0.28949317f, 0.41127121f,
+           0.27431169f, -0.06900328f, -0.21474190f, -0.15578632f, -0.19555484f,
+           -0.15209621f, -0.11269179f, 0.07416003f, 0.18991330f, 0.26858172f,
+           0.01952259f, 0.01017922f, 0.02159843f, -4.95165400e-003f, -0.04368168f,
+           -0.12721671f, -0.06673957f, -0.11275250f, 0.04413409f, 0.05578312f,
+           0.03896771f, 0.03566417f, -0.05871816f, -0.07388090f, -0.17965563f,
+           -0.08570268f, -0.15273231f, -0.06022318f, -0.06999847f,
+           -6.81510568e-003f, 0.06294262f, -6.54901436e-004f, -0.01128654f,
+           -0.02289657f, 0.04849290f, 0.04140804f, 0.23681939f, 0.14545733f,
+           0.01989965f, 0.12032662f, 3.87463090e-003f, -6.02597650e-003f,
+           -0.05919775f, -0.03067224f, -0.07787777f, 0.10834727f, 0.02153730f,
+           0.02765649f, 0.03975543f, -0.12182906f, -0.04900113f, -0.09940100f,
+           -0.06453611f, -0.13757215f, -0.03721382f, 0.02827376f, -0.04351249f,
+           0.01907038f, -0.10284120f, -0.05671160f, -0.10760647f, -0.09624009f,
+           -0.09565596f, -0.01303654f, 0.03080539f, 0.01416511f, 0.05846142f,
+           -5.42971538e-003f, 0.06221476f, -0.03320325f, -0.06791797f,
+           -0.05791342f, 0.12851369f, 0.14990346f, 0.03634374f, 0.14262885f,
+           0.04330391f, 0.05032569f, -0.05631914f, 0.01606137f, 0.04387223f,
+           0.22344995f, 0.15722635f, -0.04693628f, 0.03006579f, -2.52882647e-003f,
+           0.05717621f, -0.07529724f, -0.02848588f, -0.06868757f,
+           -4.51729307e-003f, 0.06466042f, -0.05935378f, -0.04704857f,
+           -0.07363959f, 0.04843248f, -0.13421375f, -0.09789340f, -0.10255270f,
+           0.03509852f, 0.04751543f, -0.03822323f, 0.09740467f, 0.04762916f,
+           0.03940146f, -0.08283259f, 0.09552965f, 0.05038739f, 0.21258622f,
+           0.09646992f, 0.03241193f, 0.05167701f, 0.04614570f, 0.04330090f,
+           -0.02671840f, -0.06259909f, -0.02301898f, 0.18829170f, 0.10522786f,
+           0.04313190f, 0.01670948f, -0.08421925f, 0.05911417f, -0.10582602f,
+           -0.04855484f, -0.08373898f, 0.07775915f, 0.03723533f, -0.12047344f,
+           4.86345543e-003f, -0.10520902f, 0.06571782f, -0.07528137f,
+           -0.03245651f, -0.09869066f, -0.02917477f, -0.18293270f, 0.14810945f,
+           9.24033765e-003f, -0.04354914f, 0.02266885f, -0.11872729f,
+           -0.04016589f, 0.02830229f, 0.22539048f, 0.20565644f, 0.16701797f,
+           0.09019924f, 0.01300652f, 0.09760600f, -0.03675831f, -0.01935448f,
+           -0.06894835f, 0.08077277f, 0.19047537f, 0.11312226f, 0.04106043f,
+           -0.11187182f, 0.04312806f, -0.18548580f, -0.11287174f, -0.08794551f,
+           0.02078281f, -0.15295486f, 0.11806386f, -0.01103218f, -0.15971117f,
+           0.02153538f, -0.05232147f, -0.10835317f, -0.13910367f, 0.05920752f,
+           -0.10122602f, 0.20174250f, 0.09105796f, -0.01881348f, 0.09559010f,
+           -0.03725745f, -0.09442931f, -0.09763174f, 0.05854454f, 0.08287182f,
+           0.12919849f, 0.08594352f, -2.49806582e-003f, 0.02398440f,
+           5.67950122e-003f, -0.06296340f, -0.12993270f, 0.03855852f, 0.05186560f,
+           0.10839908f, -0.03380463f, -0.12654832f, -0.05399339f, -0.07456800f,
+           -0.04736232f, -0.10164231f, 0.07496139f, 0.08125214f, 0.07656177f,
+           -0.04999603f, -0.12823077f, -0.07692395f, -0.11317524f, -0.09118655f,
+           -0.05695669f, 0.10477209f, 0.07468581f, 0.01630048f, -8.00961629e-003f,
+           -0.06582128f, -0.04019095f, -0.04682907f, -0.01907842f, -0.10997720f,
+           0.04911406f, 0.02931030f, 0.04197735f, -0.05773980f, -0.09670641f,
+           -0.03594951f, -0.03402121f, -0.07149299f, -0.10566200f, 0.10601286f,
+           0.06340689f, -0.01518632f, -5.96402306e-003f, -0.07628012f,
+           -3.52779147e-003f, -0.02683854f, -0.10265494f, -0.02680815f,
+           0.16338381f, 0.03103515f, 0.02296976f, 0.01624348f, -0.10831620f,
+           -0.02314233f, -0.04789969f, -0.05530700f, -0.06461314f, 0.10494506f,
+           0.04642856f, -0.07592955f, -0.06197905f, -0.09042154f, -0.01445521f,
+           -0.04297818f, -0.11262015f, -0.11430512f, 0.03174541f, -0.03677487f,
+           -0.02963996f, -0.06610169f, -0.13292049f, -0.07059067f, -0.08444111f,
+           -0.02640536f, -0.07136250f, 0.04559967f, 0.01459980f, 0.17989251f,
+           0.04435328f, -0.12464730f, -0.02871115f, -0.10752209f, -0.03393742f,
+           -0.03791408f, 0.02548251f, 0.01956050f, 0.19245651f, 0.13963254f,
+           -0.05904696f, -0.07424626f, -0.10411884f, 1.54176133e-003f,
+           0.01797429f, 0.13025844f, 0.04547642f, -0.05710349f, -0.10697161f,
+           -0.13489437f, -0.06515755f, -0.06406886f, -4.08572936e-003f,
+           -0.01336483f, 0.04368737f, -0.11259720f, -0.05701635f, -0.06469971f,
+           -0.08346602f, -0.04166770f, -0.05795543f, -0.08247511f, -0.05742628f,
+           0.08452254f, -0.03350224f, 0.13980860f, 0.13252275f, 0.07589617f,
+           0.07539988f, 0.12155797f, 0.19087289f, 0.15050751f, 0.21250245f,
+           0.14206800f, 0.01298489f, 0.07450245f, 0.06559097f, 0.01700557f,
+           0.04512971f, 0.16950700f, 0.10261577f, 0.16389982f, 0.05505059f,
+           -0.03453077f, 0.08622462f, 0.07935954f, 0.03976260f, 0.02036091f,
+           3.95744899e-003f, 0.03267065f, 0.15235919f, 0.01297494f, -0.08109194f,
+           0.01407558f, 4.40693414e-003f, -0.15157418f, -0.11390478f,
+           -0.07487597f, -7.81322457e-003f, -0.02749545f, -0.10181408f,
+           0.13755716f, 0.14007211f, 0.13482562f, 0.27517235f, 0.34251109f,
+           0.07639657f, 0.07268607f, 0.19823882f, 0.16135791f, -0.04186463f,
+           -0.12784107f, -0.09846287f, 0.03169041f, 0.10974082f, -0.15051922f,
+           -0.08916726f, -0.07138767f, -0.04153349f, 6.25418453e-003f,
+           0.01266654f, 0.10533249f, 0.12749144f, 0.15148053f, 0.01498513f,
+           0.06305949f, -0.01247123f, -0.08778401f, -0.08551880f, -0.11955146f,
+           -0.08493572f, -0.02901620f, -0.02394859f, -0.13427313f, -0.11053200f,
+           -0.14413260f, -0.15203285f, 0.03972760f, -3.72127310e-004f,
+           -0.04200919f, 0.06105104f, 0.01904975f, -0.01106191f,
+           -7.27445772e-003f, -0.01520341f, 1.10228511e-003f, -0.04949187f,
+           -0.08013099f, 5.72071038e-003f, 0.08415454f, -0.06523152f, 0.03664081f,
+           -0.02673042f, -0.12066154f, -0.03702074f, 0.06006580f, 0.01628682f,
+           -6.17772620e-003f, 0.08192339f, -3.41629819e-003f, 0.02870512f,
+           0.05807141f, 0.04959986f, 0.04618251f, -0.04901629f, -0.10579574f,
+           0.02274442f, 0.12070961f, 2.23597488e-003f, 0.09831765f, -0.03019848f,
+           -0.11181970f, -0.04961075f, 0.02498928f, -0.03714991f, -0.01619653f,
+           0.02643486f, -7.62964319e-003f, -0.02882290f, -0.06242594f,
+           -0.08439861f, 0.07220893f, 0.07263952f, 0.01561574f, 0.03091968f,
+           0.01708712f, -0.03797151f, -3.18561122e-003f, 0.01624021f,
+           -0.02828573f, 0.11284444f, -1.32280716e-003f, -0.07784860f,
+           -0.07209100f, 0.03372242f, 0.12154529f, 0.02278104f, -0.05275500f,
+           -0.01918484f, 0.12989293f, 0.05424401f, 0.02333086f, 0.04029022f,
+           0.12392918f, 0.09495489f, 0.09190340f, 0.07935889f, 8.76816828e-003f,
+           0.17148446f, -8.51302687e-003f, -0.08011249f, -0.06796283f,
+           0.04884845f, 0.01112272f, -0.07835306f, -1.14811445e-003f,
+           -0.03440760f, 0.02845243f, 0.07695542f, -0.07069533f, -0.01151784f,
+           -8.53884313e-003f, -0.01662786f, -0.04163864f, 0.05400505f,
+           0.02859163f, 0.02921852f, 0.05003135f, -6.85718050e-003f, -0.01632611f,
+           0.07780217f, 0.04042810f, -0.01216440f, 3.60914599e-003f, -0.06322435f,
+           0.09516726f, 0.12877031f, -9.69162490e-003f, 0.01031179f, 0.05180895f,
+           -9.34659224e-003f, -0.01644533f, -0.04849347f, -0.04343236f,
+           0.10514783f, 0.08046635f, -0.04615205f, -0.03975486f, -0.01485525f,
+           0.13096830f, -0.01517950f, -0.06571898f, -0.04016372f, 0.01849786f,
+           0.02439670f, 0.08067258f, 1.74824719e-003f, 0.07053747f, 0.08819518f,
+           -5.08352555e-003f, -0.06550863f, -0.08266170f, -0.07780605f,
+           0.01453450f, -0.08756890f, 0.01096501f, -8.71319138e-003f, 0.10110464f,
+           0.02420769f, -0.06708383f, 0.02007811f, 5.93133038e-003f, 0.05398923f,
+           0.07538138f, 0.02049227f, 0.02242589f, 0.04011070f, -1.44875818e-003f,
+           -4.19115182e-003f, 0.06367654f, 0.02506934f, 0.02434536f, 0.05879405f,
+           -8.22952855e-003f, -0.01242441f, 0.04224926f, -0.01754923f,
+           0.05958161f, 0.03818886f, -0.01830363f, -0.04308917f, -0.04422197f,
+           -0.02432721f, 0.02264866f, 2.03751423e-003f, 0.01197031f, 0.04439203f,
+           0.12169247f, 0.03602713f, -0.02599251f, -1.98226492e-003f, 0.02046336f,
+           -0.02639058f, -1.91242550e-003f, -0.09334669f, -0.03595153f,
+           -9.88179818e-003f, -0.06848445f, -0.04666303f, -0.09955736f,
+           -0.04206430f, 0.02609075f, 9.09005292e-003f, -0.07138551f,
+           -4.22313227e-004f, 0.01766645f, 0.02756404f, 0.01308276f, 0.04052891f,
+           0.02387515f, 0.05337298f, 0.02500631f, -0.04970853f, -0.12467445f,
+           0.17604403f, 0.12256411f, -0.07512254f, 8.70451052e-003f, -0.05697548f,
+           -0.03626474f, -8.76623299e-003f, -0.01210897f, -0.09451522f,
+           0.07490732f, -0.02008001f, -0.02681278f, -0.06463405f, -0.01517507f,
+           7.33757764e-003f, 6.07147906e-003f, -0.09316964f, -0.04575328f,
+           0.13261597f, 0.15424870f, -0.01655918f, -0.02772390f, -0.05243644f,
+           -0.02356456f, -0.02351753f, -0.10211615f, -0.12873036f, 0.14549787f,
+           0.12519856f, 4.38762689e-003f, 0.02795992f, 0.05170322f, 0.09223596f,
+           0.05890015f, 0.02376701f, -0.02777346f, 0.09506908f, 0.02328936f,
+           -0.02319928f, -0.03218696f, -0.01527841f, -0.01016694f, -0.02674719f,
+           0.05137179f, 0.01980666f, 0.06544447f, -0.01746171f, 0.01026380f,
+           0.01561806f, 7.97004555e-004f, 0.07601810f, 0.01907250f, -0.03083035f,
+           -0.05987392f, 0.09242783f, 0.14555025f, 0.01035827f, 0.03092401f,
+           -0.09562709f, -0.03802354f, 0.02531144f, 0.03079449f, -0.07100715f,
+           0.03330721f, -2.69116857e-003f, 0.03167490f, 0.05744999f, 0.03259895f,
+           1.91266940e-003f, 0.03194578f, 0.07389776f, 0.02198060f, 0.07633314f,
+           0.03293105f, -0.09103648f, 0.04718142f, 0.06102672f, -0.01003063f,
+           5.85481385e-003f, -0.01522574f, 0.02323526f, 0.10584345f,
+           4.35879454e-003f, 0.06107873f, 0.05868603f, -0.03115531f, 0.01214679f,
+           0.08567052f, 3.93926632e-003f, -0.02521488f, -1.88425183e-003f,
+           0.02038053f, -6.26854831e-004f, 0.04897438f, -0.04280585f,
+           -0.04819689f, -0.04812867f, -0.01451186f, 0.05101469f,
+           -9.01125465e-003f, -0.03333859f, 0.03917955f, 0.04196448f, 0.04292135f,
+           0.02809529f, 0.02999715f, 0.04081348f, 9.10039060e-003f, 0.09703232f,
+           0.10379741f, 0.02348725f, -4.72756615e-003f, 0.01027325f, 0.10402658f,
+           0.12071823f, 0.09817299f, -0.02612033f, 0.03638414f, 0.05896405f,
+           0.04865025f, 0.04793910f, -0.03882321f, -0.02962117f, -0.01222268f,
+           0.04071597f, 0.01922777f, -0.02287866f, 0.03328381f, 0.01859092f,
+           0.09024994f, 0.03804455f, -0.01424510f, 0.01953739f, 0.02509617f,
+           -0.03390914f, -0.05663941f, -0.01641979f, 0.05848591f, 0.04639670f,
+           0.02092116f, 0.12911791f, 0.19918139f, 0.07739855f, -7.25806039e-003f,
+           0.04074838f, 0.03183993f, 1.39251316e-003f, -0.01428625f, 0.01865480f,
+           0.08529541f, 0.13547510f, 0.11189661f, 0.03998901f, 0.09575938f,
+           -0.02631102f, -0.03458253f, -0.04749985f, -0.06070716f,
+           4.71884012e-003f, 0.06445789f, -0.02450038f, -0.05483776f,
+           -0.04657237f, -0.02030717f, -0.03480766f, -0.09397731f, -0.06399718f,
+           -0.01804585f, 5.62348310e-003f, -6.64811488e-003f, -0.06517869f,
+           6.96210237e-003f, -0.01860148f, -0.04245830f, -0.05850367f,
+           -3.24417115e-003f, 0.07700698f, 0.11290991f, 0.09923030f, -0.02970599f,
+           0.05592411f, 0.04813979f, -0.09811195f, -0.09357996f, -0.03276114f,
+           0.05218338f, 0.04141375f, 3.92977800e-003f, -0.05047480f, 0.15960084f,
+           0.04612800f, -0.03114098f, -0.04650044f, -0.03249795f, -0.02425641f,
+           -0.04311355f, 0.04307659f, -0.09401883f, -0.04742785f, -0.01254499f,
+           -0.06598741f, 3.41369561e-003f, -0.05620445f, -7.28127593e-003f,
+           -0.05998361f, -0.03274450f, -0.07376868f, 3.19015374e-003f,
+           -0.07733069f, 0.05815864f, -0.02471071f, 0.03850617f, 0.13838784f,
+           0.15399861f, 0.01731321f, -0.01477586f, 0.10393341f, 0.05159833f,
+           -0.01945555f, -0.03427503f, -0.04867341f, 0.09237480f, 0.10732719f,
+           0.06071450f, -0.01355071f, 0.01844356f, -0.03480803f, -0.03796671f,
+           2.15628621e-004f, -0.05440186f, 0.01889855f, -0.01443413f,
+           -0.02607902f, -0.02938001f, 0.02720689f, -0.06228397f, -0.02970936f,
+           -0.03426210f, -0.10280876f, -0.06739304f, -0.05227850f, 0.03360292f,
+           -0.11278441f, -0.06966180f, -0.13937433f, 9.10932291e-003f,
+           2.52020749e-004f, -4.07359656e-003f, 0.12310639f, 0.09343060f,
+           0.07302511f, 0.03222093f, 0.07532879f, 0.03792387f, -0.04985180f,
+           0.01804602f, 0.02694195f, 0.13481498f, 0.04601225f, 0.04106982f,
+           0.08511057f, 0.12314661f, 0.01320830f, 0.05044121f, -5.52943908e-003f,
+           -0.08992624f, -0.02249301f, -0.08181777f, 0.06165213f, -0.03256603f,
+           -0.01068920f, -0.01323473f, -0.11970232f, -0.04616347f, -0.12088681f,
+           -0.06762606f, -0.08676834f, -0.06434575f, 0.01772529f, 0.03469615f,
+           -0.10926618f, 0.03013873f, 0.14030397f, 0.16130108f, 0.17985588f,
+           0.11281928f, 0.10530639f, 0.08905948f, 0.07733764f, 0.06695238f,
+           0.02142088f, 0.06438877f, 0.09794453f, 0.05745072f, 0.02788557f,
+           0.02632830f, 0.07985807f, 4.24902979e-003f, 8.47890321e-003f,
+           -0.02679466f, -5.28812688e-003f, -0.02162580f, -0.07490715f,
+           -0.08251337f, -0.02056576f, -0.01026194f, -1.15492963e-003f,
+           -5.75720915e-004f, -0.07210591f, -0.07320981f, -0.04883312f,
+           -0.10897151f, -0.07477258f, -0.08867134f, -0.09222437f, -0.10924666f,
+           -0.10430276f, 0.07953499f, 0.02767959f, 0.11393359f, 0.18779543f,
+           0.03313421f, 0.02143700f, 0.05852016f, -2.12067598e-003f,
+           -3.76984011e-003f, 0.02774167f, -0.03124610f, 0.01465141f, 0.01616004f,
+           -0.01391913f, -0.04404102f, -0.05444227f, -0.14684731f, -0.15016587f,
+           0.04509468f, 1.29563001e-003f, 0.01398350f, 0.05610404f, -0.04868806f,
+           -0.04776716f, -8.16873740e-003f, -2.30126386e-003f, -0.02286313f,
+           0.11983398f, -0.04703261f, -0.08814441f, -0.07585249f, -0.10799607f,
+           -0.03232087f, 0.01509786f, -0.04843464f, -0.03967846f, 0.09589416f,
+           0.01352560f, -0.01458119f, 0.01050829f, -0.03038946f, 0.01608388f,
+           1.11975556e-003f, -0.01250656f, 2.86211423e-003f, 0.04333691f,
+           -0.14603497f, -0.01946543f, -0.02327525f, -0.01973944f, 0.07944400f,
+           -0.02224544f, -0.06701808f, 0.03476532f, 0.11505594f, -0.02712801f,
+           -0.01665113f, 0.06315716f, -0.08205860f, 0.07431999f, 0.04915778f,
+           -0.04468752f, -0.01490402f, 0.07400476f, -0.11650901f, 0.05102430f,
+           0.04559118f, -0.05916039f, 0.08840760f, -0.01587902f, -0.14890194f,
+           0.07857784f, 0.04710254f, -0.05381983f, -0.07331945f, -0.03604643f,
+           0.15611970f, 0.07649943f, -0.05959348f, -0.02776607f, 0.11098688f,
+           0.03758875f, -0.04446875f, 0.04933187f, 0.01345535f, 0.06921103f,
+           0.07364785f, 0.05518956f, 0.02899585f, 0.09375840f, 0.10518434f,
+           -0.04420241f, 0.01915282f, -3.56386811e-003f, 0.14586878f, 0.10286101f,
+           -0.04360626f, -0.12723237f, 0.09076386f, 0.11119842f, -0.06035013f,
+           0.09674817f, 0.08938243f, 0.07065924f, 0.02603180f, 5.84815582e-003f,
+           -0.05922065f, 0.12360309f, 3.59695964e-003f, 2.99844006e-003f,
+           0.03697936f, 0.02043072f, 0.04168725f, 0.01025975f, -0.01359980f,
+           -0.01600920f, 0.02581056f, 0.02329250f, 2.98100687e-003f, 0.01629762f,
+           0.06652115f, 0.05855627f, 0.01237463f, -0.01297135f, 0.01761587f,
+           0.05090865f, 0.06549342f, -0.04425945f, 2.43203156e-003f,
+           3.07327788e-003f, 0.06678630f, -0.04303836f, 0.01082393f, -0.06476044f,
+           0.04077786f, 0.12441979f, 0.08237778f, 0.07424165f, 0.04065890f,
+           0.06905543f, 0.09556347f, 0.12724875f, -0.02132082f, 0.08514154f,
+           -0.04175328f, -0.02666954f, 0.01897836f, 0.03317382f, 9.45465732e-003f,
+           -0.01238974f, -0.04242500f, -0.01419479f, -0.03545213f, -0.02440874f,
+           0.08684119f, 0.04212951f, 0.02462858f, -0.01104825f, -5.01706870e-003f,
+           0.02968982f, 0.02597476f, -0.01568939f, 0.04514892f, 0.06974549f,
+           0.08670278f, 0.06828108f, 0.10238872f, 0.05405957f, 0.06548470f,
+           -0.03763957f, 0.01366090f, 0.07069602f, 0.05363748f, 0.04798120f,
+           0.11706422f, 0.05466456f, -0.01869259f, 0.06344382f, 0.03106543f,
+           0.08432506f, -0.02061096f, 0.03821088f, -6.92190882e-003f,
+           6.40467042e-003f, -0.01271779f, 6.89014705e-005f, 0.04541415f,
+           -0.01899539f, -0.05020239f, 0.03000903f, 0.01090422f, 4.52452758e-003f,
+           0.02573632f, -0.02388454f, -0.04200457f, 1.72783900e-003f,
+           -0.05978370f, -0.02720562f, 0.06573715f, 0.01154317f, 0.01265615f,
+           0.07375994f, -9.19828378e-003f, -0.04914120f, 0.02124831f, 0.06455322f,
+           0.04372910f, -0.03310043f, 0.03605788f, -6.78055827e-003f,
+           9.36202332e-003f, 0.01747596f, -0.06406314f, -0.06812935f, 0.08080816f,
+           -0.02778088f, 0.02735260f, 0.06393493f, 0.06652229f, 0.05676993f,
+           0.08640018f, -7.59188086e-003f, -0.02012847f, -0.04741159f,
+           -0.01657069f, -0.01624399f, 0.05547778f, -2.33309763e-003f,
+           0.01120033f, 0.06141156f, -0.06285004f, -0.08732341f, -0.09313398f,
+           -0.04267832f, 5.57443965e-003f, 0.04809862f, 0.01773641f,
+           5.37361018e-003f, 0.14842421f, -0.06298012f, -0.02935147f, 0.11443478f,
+           -0.05034208f, 5.65494271e-003f, 0.02076526f, -0.04577984f,
+           -0.04735741f, 0.02961071f, -0.09307127f, -0.04417921f, -0.04990027f,
+           -0.03940028f, 0.01306016f, 0.06267900f, 0.03758737f, 0.08460117f,
+           0.13858789f, 0.04862388f, -0.06319809f, -0.05655516f, 0.01885816f,
+           -0.03285607f, 0.03371567f, -0.07040928f, -0.04514049f, 0.01392166f,
+           0.08184422f, -0.07230316f, 0.02386871f, 0.02184591f, 0.02605764f,
+           -0.01033954f, 9.29878280e-003f, 7.67351175e-003f, 0.15189242f,
+           0.02069071f, -0.09738296f, -0.08894105f, -0.07768748f, 0.02332268f,
+           -0.01778995f, -0.03258888f, -0.08180822f, -0.08492987f, 0.02290156f,
+           -0.11368170f, -0.03554465f, -0.04533844f, -0.02861580f, 0.06782424f,
+           0.01113123f, 0.02453644f, 0.12721945f, 0.08084814f, -0.03607795f,
+           0.01109122f, 0.04803548f, -0.03489929f, 0.03399536f, -0.05682014f,
+           8.59533902e-003f, -4.27904585e-003f, 0.03230887f, -0.01300198f,
+           -0.01038137f, -0.07930113f, 8.33097473e-003f, 0.02296994f,
+           -0.01306500f, -0.01881626f, 0.04413369f, 0.05729880f, -0.03761553f,
+           0.01942326f, 1.64540811e-003f, -0.03811319f, 0.04190650f, -0.14978096f,
+           -0.04514487f, 0.01209545f, -5.46460645e-003f, -0.01647195f,
+           7.63064111e-003f, -0.07494587f, 0.08415288f, 0.10020141f, -0.01228561f,
+           0.06553826f, 0.04554005f, 0.07890417f, 0.03041138f, 0.01752007f,
+           0.09208256f, -3.74419295e-004f, 0.10549527f, 0.04686913f, 0.01894833f,
+           -0.02651412f, -4.34682379e-003f, 5.44942822e-003f, 0.01444484f,
+           0.05882156f, -0.03336544f, 0.04603891f, -0.10432546f, 0.01923928f,
+           0.01842845f, -0.01712168f, -0.02222766f, 0.04693324f, -0.06202956f,
+           -0.01422159f, 0.08732220f, -0.07706107f, 0.02661049f, -0.04300238f,
+           -0.03092422f, -0.03552184f, -0.01886088f, -0.04979934f, 0.03906401f,
+           0.04608644f, 0.04966111f, 0.04275464f, -0.04621769f, -0.02653212f,
+           8.57011229e-003f, 0.03839684f, 0.05818764f, 0.03880796f,
+           -2.76100676e-004f, 0.03076511f, -0.03266929f, -0.05374557f,
+           0.04986527f, -9.45429131e-003f, 0.03582499f, -2.64564669e-003f,
+           -1.07461517e-003f, 0.02962313f, -0.01483363f, 0.03060869f, 0.02448327f,
+           0.01845641f, 0.03282966f, -0.03534438f, -0.01084059f, -0.01119136f,
+           -1.85360224e-003f, -5.94652840e-004f, -0.04451817f, 2.98327743e-003f,
+           0.06272484f, -0.02152076f, -3.05971340e-003f, -0.05070828f,
+           0.01531762f, 0.01282815f, 0.05167150f, 9.46266949e-003f,
+           -3.34558333e-003f, 0.11442288f, -0.03906701f, -2.67325155e-003f,
+           0.03069184f, -0.01134165f, 0.02949462f, 0.02879886f, 0.03855566f,
+           -0.03450781f, 0.09142872f, -0.02156654f, 0.06075062f, -0.06220816f,
+           0.01944680f, 6.68372354e-003f, -0.06656796f, 8.70784000e-003f,
+           0.03456013f, 0.02434320f, -0.13236357f, -0.04177035f, -0.02069627f,
+           0.01068112f, 0.01505432f, -0.07517391f, -3.83571628e-003f,
+           -0.06298508f, -0.02881260f, -0.13101046f, -0.07221562f,
+           -5.79945277e-003f, -8.57300125e-003f, 0.03782469f, 0.02762164f,
+           0.04942456f, -0.02936396f, 0.09597211f, 0.01921411f, 0.06101191f,
+           -0.04787507f, -0.01379578f, -7.40224449e-003f, -0.02220136f,
+           -0.01313756f, 7.77558051e-003f, 0.12296968f, 0.02939998f, 0.03594062f,
+           -0.07788624f, -0.01133144f, 3.99316690e-004f, -0.06090347f,
+           -0.01122066f, -4.68682544e-003f, 0.07633100f, -0.06748922f,
+           -0.05640298f, -0.05265681f, -0.01139122f, -0.01624347f, -0.04715714f,
+           -0.01099092f, 0.01048561f, 3.28499987e-003f, -0.05810167f,
+           -0.07699911f, -0.03330683f, 0.04185145f, 0.03478536f, 0.02275165f,
+           0.02304766f, 6.66040834e-003f, 0.10968148f, -5.93013782e-003f,
+           -0.04858336f, -0.04203213f, -0.09316786f, -6.13074889e-003f,
+           -0.02544625f, 0.01366201f, 9.18555818e-003f, -0.01846578f,
+           -0.05622401f, -0.03989377f, -0.07810296f, 6.91275718e-003f,
+           0.05957597f, -0.03901334f, 0.01572002f, -0.01193903f,
+           -6.89400872e-003f, -0.03093356f, -0.04136098f, -0.01562869f,
+           -0.04604580f, 0.02865234f, -0.08678447f, -0.03232484f, -0.05364593f,
+           -0.01445016f, -0.07003860f, -0.08669746f, -0.04520775f, 0.04274122f,
+           0.03117515f, 0.08175703f, 0.01081109f, 0.06379741f, 0.06199206f,
+           0.02865988f, 0.02360346f, 0.06725410f, -0.03248780f, -9.37702879e-003f,
+           0.08265898f, -0.02245839f, 0.05125763f, -0.01862395f, 0.01973453f,
+           -0.01994494f, -0.10770868f, 0.03180375f, 3.23935156e-003f,
+           -0.02142080f, -0.04256190f, 0.04760900f, 0.04282863f, 0.05635953f,
+           -0.01870849f, 0.05540622f, -0.03042666f, 0.01455277f, -0.06630179f,
+           -0.05843807f, -0.03739681f, -0.09739155f, -0.03220233f, -0.05620182f,
+           -0.10381401f, 0.07400211f, 4.20676917e-003f, 0.03258535f,
+           2.14308966e-003f, 0.05121966f, -0.01274337f, 0.02384761f, 0.06335578f,
+           -0.07905591f, 0.08375625f, -0.07898903f, -0.06508528f, -0.02498444f,
+           0.06535810f, 0.03970535f, 0.04895468f, -0.01169566f, -0.03980601f,
+           0.05682293f, 0.05925463f, -0.01165808f, -0.07936699f, -0.04208954f,
+           0.01333987f, 0.09051196f, 0.10098671f, -0.03974256f, 0.01238771f,
+           -0.07501741f, -0.03655440f, -0.04301528f, 0.09216860f,
+           4.63579083e-004f, 0.02851115f, 0.02142735f, 1.28244064e-004f,
+           0.02879687f, -0.08554889f, -0.04838862f, 0.08135369f, -0.05756533f,
+           0.01413900f, 0.03451880f, -0.06619488f, -0.03053130f, 0.02961676f,
+           -0.07384635f, 0.01135692f, 0.05283910f, -0.07778034f, -0.02107482f,
+           -0.05511716f, -0.13473752f, 0.03030157f, 0.06722020f, -0.06218817f,
+           -0.05826827f, 0.06254654f, 0.02895772f, -0.01664000f, -0.03620280f,
+           -0.01612278f, -1.46097376e-003f, 0.14013411f, -8.96181818e-003f,
+           -0.03250246f, 3.38630192e-003f, 2.64779478e-003f, 0.03359732f,
+           -0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f };
+
+        return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
+    }
+}
+
+#endif
diff --git a/modules/cudaobjdetect/src/precomp.hpp b/modules/cudaobjdetect/src/precomp.hpp
new file mode 100644
index 0000000000..2e5ab7af3b
--- /dev/null
+++ b/modules/cudaobjdetect/src/precomp.hpp
@@ -0,0 +1,62 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include <limits>
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudawarping.hpp"
+#include "opencv2/objdetect.hpp"
+
+#include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/utility.hpp"
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+#  include "opencv2/cudalegacy/private.hpp"
+#endif
+
+#endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cudaobjdetect/test/test_main.cpp b/modules/cudaobjdetect/test/test_main.cpp
new file mode 100644
index 0000000000..04f4fcf6e6
--- /dev/null
+++ b/modules/cudaobjdetect/test/test_main.cpp
@@ -0,0 +1,45 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+CV_CUDA_TEST_MAIN("gpu")
diff --git a/modules/cuda/test/test_objdetect.cpp b/modules/cudaobjdetect/test/test_objdetect.cpp
similarity index 64%
rename from modules/cuda/test/test_objdetect.cpp
rename to modules/cudaobjdetect/test/test_objdetect.cpp
index 8c7b5ec918..336d6e0718 100644
--- a/modules/cuda/test/test_objdetect.cpp
+++ b/modules/cudaobjdetect/test/test_objdetect.cpp
@@ -48,9 +48,10 @@ using namespace cvtest;
 
 //#define DUMP
 
-struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
+struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
     cv::cuda::DeviceInfo devInfo;
+    cv::Ptr<cv::cuda::HOG> hog;
 
 #ifdef DUMP
     std::ofstream f;
@@ -69,23 +70,20 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
         devInfo = GetParam();
 
         cv::cuda::setDevice(devInfo.deviceID());
+
+        hog = cv::cuda::HOG::create();
     }
 
 #ifdef DUMP
-    void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    // Writes the detected locations to the dump stream in the layout compare() reads back.
+    void dump(const std::vector<cv::Point>& locations)
     {
-        f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
-        f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
-
-        for (int i = 0; i < blockHists.rows; ++i)
-        {
-            for (int j = 0; j < blockHists.cols; ++j)
-            {
-                float val = blockHists.at<float>(i, j);
-                f.write((char*)&val, sizeof(val));
-            }
-        }
-
+        // block_hists are no longer dumped, but compare() still reads a rows/cols
+        // header first: write an empty 0 x 0 block so dump and compare stay in sync
+        int rows = 0, cols = 0;
+        f.write((char*)&rows, sizeof(rows));
+        f.write((char*)&cols, sizeof(cols));
+
         int nlocations = locations.size();
         f.write((char*)&nlocations, sizeof(nlocations));
 
@@ -93,21 +84,18 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
             f.write((char*)&locations[i], sizeof(locations[i]));
     }
 #else
-    void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    void compare(const std::vector<cv::Point>& locations)
     {
-        int rows, cols;
+        // block_hists are no longer checked: read their stored size and skip the values
+        int rows = 0, cols = 0; // zero-init: a failed read must not leave the loop bounds indeterminate
         f.read((char*)&rows, sizeof(rows));
         f.read((char*)&cols, sizeof(cols));
-        ASSERT_EQ(rows, blockHists.rows);
-        ASSERT_EQ(cols, blockHists.cols);
-
-        for (int i = 0; i < blockHists.rows; ++i)
+        for (int i = 0; i < rows; ++i)
         {
-            for (int j = 0; j < blockHists.cols; ++j)
+            for (int j = 0; j < cols; ++j)
             {
                 float val;
                 f.read((char*)&val, sizeof(val));
-                ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
             }
         }
 
@@ -126,54 +114,41 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
 
     void testDetect(const cv::Mat& img)
     {
-        gamma_correction = false;
-        setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog->setGammaCorrection(false);
+        hog->setSVMDetector(hog->getDefaultPeopleDetector());
 
         std::vector<cv::Point> locations;
 
         // Test detect
-        detect(loadMat(img), locations, 0);
+        hog->detect(loadMat(img), locations);
 
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
 
         // Test detect on smaller image
         cv::Mat img2;
         cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);
 
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
 
         // Test detect on greater image
         cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);
 
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
     }
-
-    // Does not compare border value, as interpolation leads to delta
-    void compare_inner_parts(cv::Mat d1, cv::Mat d2)
-    {
-        for (int i = 1; i < blocks_per_win_y - 1; ++i)
-            for (int j = 1; j < blocks_per_win_x - 1; ++j)
-                for (int k = 0; k < block_hist_size; ++k)
-                {
-                    float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
-                    float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
-                    ASSERT_FLOAT_EQ(a, b);
-                }
-    }
 };
 
-// desabled while resize does not fixed
+// disabled while resize is not fixed
@@ -182,13 +157,8 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
     cv::Mat img_rgb = readImage("hog/road.png");
     ASSERT_FALSE(img_rgb.empty());
 
-#ifdef DUMP
     f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
     ASSERT_TRUE(f.is_open());
-#else
-    f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
-    ASSERT_TRUE(f.is_open());
-#endif
 
     // Test on color image
     cv::Mat img;
@@ -198,8 +168,6 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
     // Test on gray image
     cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
     testDetect(img);
-
-    f.close();
 }
 
 CUDA_TEST_P(HOG, GetDescriptors)
@@ -216,8 +184,14 @@ CUDA_TEST_P(HOG, GetDescriptors)
 
     // Convert train images into feature vectors (train table)
     cv::cuda::GpuMat descriptors, descriptors_by_cols;
-    getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
-    getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
+
+        hog->setWinStride(cv::Size(64, 128)); // takes the place of the win_size argument of getDescriptors()
+
+        hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
+        hog->compute(d_img, descriptors);
+
+        hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_COL_BY_COL);
+        hog->compute(d_img, descriptors_by_cols);
 
     // Check size of the result train table
     wins_per_img_x = 3;
@@ -242,48 +216,6 @@ CUDA_TEST_P(HOG, GetDescriptors)
                     ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
                               r[(x * blocks_per_win_y + y) * block_hist_size + k]);
     }
-
-    /* Now we want to extract the same feature vectors, but from single images. NOTE: results will
-    be defferent, due to border values interpolation. Using of many small images is slower, however we
-    wont't call getDescriptors and will use computeBlockHistograms instead of. computeBlockHistograms
-    works good, it can be checked in the gpu_hog sample */
-
-    img_rgb = readImage("hog/positive1.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    // Everything is fine with interpolation for left top subimage
-    ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
-
-    img_rgb = readImage("hog/positive2.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
-
-    img_rgb = readImage("hog/negative1.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
-
-    img_rgb = readImage("hog/negative2.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
-
-    img_rgb = readImage("hog/positive3.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
-
-    img_rgb = readImage("hog/negative3.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
@@ -310,12 +242,12 @@ CUDA_TEST_P(CalTech, HOG)
     cv::cuda::GpuMat d_img(img);
     cv::Mat markedImage(img.clone());
 
-    cv::cuda::HOGDescriptor d_hog;
-    d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
-    d_hog.nlevels = d_hog.nlevels + 32;
+    cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+    d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
+    d_hog->setNumLevels(d_hog->getNumLevels() + 32);
 
     std::vector<cv::Rect> found_locations;
-    d_hog.detectMultiScale(d_img, found_locations);
+    d_hog->detectMultiScale(d_img, found_locations);
 
 #if defined (LOG_CASCADE_STATISTIC)
     for (int i = 0; i < (int)found_locations.size(); i++)
@@ -326,7 +258,8 @@ CUDA_TEST_P(CalTech, HOG)
         cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
     }
 
-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage);
+    cv::waitKey();
 #endif
 }
 
@@ -354,9 +287,15 @@ PARAM_TEST_CASE(LBP_Read_classifier, cv::cuda::DeviceInfo, int)
 
 CUDA_TEST_P(LBP_Read_classifier, Accuracy)
 {
-    cv::cuda::CascadeClassifier_CUDA classifier;
     std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
-    ASSERT_TRUE(classifier.load(classifierXmlPath));
+
+    cv::Ptr<cv::cuda::CascadeClassifier> d_cascade;
+
+    ASSERT_NO_THROW(
+        d_cascade = cv::cuda::CascadeClassifier::create(classifierXmlPath);
+    );
+
+    ASSERT_FALSE(d_cascade.empty());
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_Read_classifier,
@@ -396,29 +335,28 @@ CUDA_TEST_P(LBP_classify, Accuracy)
     for (; it != rects.end(); ++it)
         cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));
 
-    cv::cuda::CascadeClassifier_CUDA gpuClassifier;
-    ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
+    cv::Ptr<cv::cuda::CascadeClassifier> gpuClassifier =
+            cv::cuda::CascadeClassifier::create(classifierXmlPath);
 
-    cv::cuda::GpuMat gpu_rects;
     cv::cuda::GpuMat tested(grey);
-    int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
+    cv::cuda::GpuMat gpu_rects_buf;
+    gpuClassifier->detectMultiScale(tested, gpu_rects_buf);
+
+    std::vector<cv::Rect> gpu_rects;
+    gpuClassifier->convert(gpu_rects_buf, gpu_rects);
 
 #if defined (LOG_CASCADE_STATISTIC)
-    cv::Mat downloaded(gpu_rects);
-    const cv::Rect* faces = downloaded.ptr<cv::Rect>();
-    for (int i = 0; i < count; i++)
+    for (size_t i = 0; i < gpu_rects.size(); i++)
     {
-        cv::Rect r = faces[i];
+        cv::Rect r = gpu_rects[i];
 
         std::cout << r.x << " " << r.y  << " " << r.width << " " << r.height << std::endl;
         cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
     }
-#endif
 
-#if defined (LOG_CASCADE_STATISTIC)
-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage);
+    cv::waitKey();
 #endif
-    (void)count;
 }
 
 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_classify,
diff --git a/modules/cudaobjdetect/test/test_precomp.hpp b/modules/cudaobjdetect/test/test_precomp.hpp
new file mode 100644
index 0000000000..a2d16c8105
--- /dev/null
+++ b/modules/cudaobjdetect/test/test_precomp.hpp
@@ -0,0 +1,64 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_TEST_PRECOMP_HPP__
+#define __OPENCV_TEST_PRECOMP_HPP__
+
+#include <fstream>
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/objdetect.hpp"
+
+#include "cvconfig.h"
+
+#endif
diff --git a/modules/cudawarping/CMakeLists.txt b/modules/cudawarping/CMakeLists.txt
index 231e24e695..fa99e9d04b 100644
--- a/modules/cudawarping/CMakeLists.txt
+++ b/modules/cudawarping/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Image Warping")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
 
-ocv_define_module(cudawarping opencv_imgproc OPTIONAL opencv_cudalegacy)
+ocv_define_module(cudawarping opencv_core opencv_imgproc OPTIONAL opencv_cudev)
diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp
index ca877d50c9..66c41ccefb 100644
--- a/modules/cudawarping/include/opencv2/cudawarping.hpp
+++ b/modules/cudawarping/include/opencv2/cudawarping.hpp
@@ -171,21 +171,6 @@ CV_EXPORTS void warpPerspective(InputArray src, OutputArray dst, InputArray M, S
  */
 CV_EXPORTS void buildWarpPerspectiveMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
 
-/** @brief Builds plane warping maps.
- */
-CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, InputArray T, float scale,
-                                   OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
-
-/** @brief Builds cylindrical warping maps.
- */
-CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
-                                         OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
-
-/** @brief Builds spherical warping maps.
- */
-CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
-                                       OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
-
 /** @brief Rotates an image around the origin (0,0) and then shifts it.
 
 @param src Source image. Supports 1, 3 or 4 channels images with CV_8U , CV_16U or CV_32F
@@ -224,14 +209,6 @@ src .
  */
 CV_EXPORTS void pyrUp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-class CV_EXPORTS ImagePyramid : public Algorithm
-{
-public:
-    virtual void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const = 0;
-};
-
-CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
-
 //! @}
 
 }} // namespace cv { namespace cuda {
diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp
index dfb11075a7..36662418c3 100644
--- a/modules/cudawarping/perf/perf_warping.cpp
+++ b/modules/cudawarping/perf/perf_warping.cpp
@@ -325,88 +325,6 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpPerspective,
     }
 }
 
-//////////////////////////////////////////////////////////////////////
-// BuildWarpPlaneMaps
-
-PERF_TEST_P(Sz, BuildWarpPlaneMaps,
-            CUDA_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-    const cv::Mat T = cv::Mat::zeros(1, 3, CV_32F);
-
-    if (PERF_RUN_CUDA())
-    {
-        cv::cuda::GpuMat map_x;
-        cv::cuda::GpuMat map_y;
-
-        TEST_CYCLE() cv::cuda::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, T, 1.0, map_x, map_y);
-
-        CUDA_SANITY_CHECK(map_x);
-        CUDA_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BuildWarpCylindricalMaps
-
-PERF_TEST_P(Sz, BuildWarpCylindricalMaps,
-            CUDA_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-
-    if (PERF_RUN_CUDA())
-    {
-        cv::cuda::GpuMat map_x;
-        cv::cuda::GpuMat map_y;
-
-        TEST_CYCLE() cv::cuda::buildWarpCylindricalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
-
-        CUDA_SANITY_CHECK(map_x);
-        CUDA_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// BuildWarpSphericalMaps
-
-PERF_TEST_P(Sz, BuildWarpSphericalMaps,
-            CUDA_TYPICAL_MAT_SIZES)
-{
-    const cv::Size size = GetParam();
-
-    const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
-    const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
-
-    if (PERF_RUN_CUDA())
-    {
-        cv::cuda::GpuMat map_x;
-        cv::cuda::GpuMat map_y;
-
-        TEST_CYCLE() cv::cuda::buildWarpSphericalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
-
-        CUDA_SANITY_CHECK(map_x);
-        CUDA_SANITY_CHECK(map_y);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // Rotate
 
@@ -514,40 +432,3 @@ PERF_TEST_P(Sz_Depth_Cn, PyrUp,
         CPU_SANITY_CHECK(dst);
     }
 }
-
-//////////////////////////////////////////////////////////////////////
-// ImagePyramidGetLayer
-
-PERF_TEST_P(Sz_Depth_Cn, ImagePyramidGetLayer,
-            Combine(CUDA_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    CUDA_CHANNELS_1_3_4))
-{
-    const cv::Size size = GET_PARAM(0);
-    const int depth = GET_PARAM(1);
-    const int channels = GET_PARAM(2);
-
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    const int nLayers = 3;
-    const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
-
-    if (PERF_RUN_CUDA())
-    {
-        const cv::cuda::GpuMat d_src(src);
-        cv::cuda::GpuMat dst;
-
-        cv::Ptr<cv::cuda::ImagePyramid> d_pyr = cv::cuda::createImagePyramid(d_src, nLayers);
-
-        TEST_CYCLE() d_pyr->getLayer(dst, dstSize);
-
-        CUDA_SANITY_CHECK(dst);
-    }
-    else
-    {
-        FAIL_NO_CPU();
-    }
-}
diff --git a/modules/cudawarping/src/precomp.hpp b/modules/cudawarping/src/precomp.hpp
index 9f5b0c1529..a59a4e9257 100644
--- a/modules/cudawarping/src/precomp.hpp
+++ b/modules/cudawarping/src/precomp.hpp
@@ -47,11 +47,4 @@
 
 #include "opencv2/core/private.cuda.hpp"
 
-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_CUDALEGACY
-#  include "opencv2/cudalegacy.hpp"
-#  include "opencv2/cudalegacy/private.hpp"
-#endif
-
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cudawarping/src/pyramids.cpp b/modules/cudawarping/src/pyramids.cpp
index 3d942fc6a3..0cb0f5de57 100644
--- a/modules/cudawarping/src/pyramids.cpp
+++ b/modules/cudawarping/src/pyramids.cpp
@@ -50,8 +50,6 @@ using namespace cv::cuda;
 void cv::cuda::pyrDown(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 void cv::cuda::pyrUp(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray, int, Stream&) { throw_no_cuda(); return Ptr<ImagePyramid>(); }
-
 #else // HAVE_CUDA
 
 //////////////////////////////////////////////////////////////////////////////
@@ -133,112 +131,4 @@ void cv::cuda::pyrUp(InputArray _src, OutputArray _dst, Stream& stream)
     func(src, dst, StreamAccessor::getStream(stream));
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// ImagePyramid
-
-#ifdef HAVE_OPENCV_CUDALEGACY
-
-namespace
-{
-    class ImagePyramidImpl : public ImagePyramid
-    {
-    public:
-        ImagePyramidImpl(InputArray img, int nLayers, Stream& stream);
-
-        void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const;
-
-    private:
-        GpuMat layer0_;
-        std::vector<GpuMat> pyramid_;
-        int nLayers_;
-    };
-
-    ImagePyramidImpl::ImagePyramidImpl(InputArray _img, int numLayers, Stream& stream)
-    {
-        GpuMat img = _img.getGpuMat();
-
-        CV_Assert( img.depth() <= CV_32F && img.channels() <= 4 );
-
-        img.copyTo(layer0_, stream);
-
-        Size szLastLayer = img.size();
-        nLayers_ = 1;
-
-        if (numLayers <= 0)
-            numLayers = 255; // it will cut-off when any of the dimensions goes 1
-
-        pyramid_.resize(numLayers);
-
-        for (int i = 0; i < numLayers - 1; ++i)
-        {
-            Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
-
-            if (szCurLayer.width == 0 || szCurLayer.height == 0)
-                break;
-
-            ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
-            nLayers_++;
-
-            const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
-
-            cv::cuda::device::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
-
-            szLastLayer = szCurLayer;
-        }
-    }
-
-    void ImagePyramidImpl::getLayer(OutputArray _outImg, Size outRoi, Stream& stream) const
-    {
-        CV_Assert( outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0 );
-
-        ensureSizeIsEnough(outRoi, layer0_.type(), _outImg);
-        GpuMat outImg = _outImg.getGpuMat();
-
-        if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
-        {
-            layer0_.copyTo(outImg, stream);
-            return;
-        }
-
-        float lastScale = 1.0f;
-        float curScale;
-        GpuMat lastLayer = layer0_;
-        GpuMat curLayer;
-
-        for (int i = 0; i < nLayers_ - 1; ++i)
-        {
-            curScale = lastScale * 0.5f;
-            curLayer = pyramid_[i];
-
-            if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
-            {
-                curLayer.copyTo(outImg, stream);
-            }
-
-            if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
-                break;
-
-            lastScale = curScale;
-            lastLayer = curLayer;
-        }
-
-        cv::cuda::device::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
-    }
-}
-
 #endif
-
-Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray img, int nLayers, Stream& stream)
-{
-#ifndef HAVE_OPENCV_CUDALEGACY
-    (void) img;
-    (void) nLayers;
-    (void) stream;
-    throw_no_cuda();
-    return Ptr<ImagePyramid>();
-#else
-    return Ptr<ImagePyramid>(new ImagePyramidImpl(img, nLayers, stream));
-#endif
-}
-
-#endif // HAVE_CUDA
diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
index 121ea5c018..99554e3122 100644
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -53,10 +53,6 @@ void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputAr
 void cv::cuda::warpPerspective(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
 void cv::cuda::buildWarpPerspectiveMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::cuda::buildWarpPlaneMaps(Size, Rect, InputArray, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
-void cv::cuda::buildWarpCylindricalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
-void cv::cuda::buildWarpSphericalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
-
 void cv::cuda::rotate(InputArray, OutputArray, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
 
 #else // HAVE_CUDA
@@ -462,124 +458,6 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M,
     }
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpPlaneMaps
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
-                                cudaStream_t stream);
-    }
-}}}
-
-void cv::cuda::buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, InputArray _T,
-                                 float scale, OutputArray _map_x, OutputArray _map_y, Stream& stream)
-{
-    (void) src_size;
-
-    Mat K = _K.getMat();
-    Mat R = _R.getMat();
-    Mat T = _T.getMat();
-
-    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
-    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
-    CV_Assert( (T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32FC1 && T.isContinuous() );
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert( K_Rinv.isContinuous() );
-    CV_Assert( R_Kinv.isContinuous() );
-
-    _map_x.create(dst_roi.size(), CV_32FC1);
-    _map_y.create(dst_roi.size(), CV_32FC1);
-
-    GpuMat map_x = _map_x.getGpuMat();
-    GpuMat map_y = _map_y.getGpuMat();
-
-    device::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
-                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpCylyndricalMaps
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                      const float k_rinv[9], const float r_kinv[9], float scale,
-                                      cudaStream_t stream);
-    }
-}}}
-
-void cv::cuda::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
-                                       OutputArray _map_x, OutputArray _map_y, Stream& stream)
-{
-    (void) src_size;
-
-    Mat K = _K.getMat();
-    Mat R = _R.getMat();
-
-    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
-    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert( K_Rinv.isContinuous() );
-    CV_Assert( R_Kinv.isContinuous() );
-
-    _map_x.create(dst_roi.size(), CV_32FC1);
-    _map_y.create(dst_roi.size(), CV_32FC1);
-
-    GpuMat map_x = _map_x.getGpuMat();
-    GpuMat map_y = _map_y.getGpuMat();
-
-    device::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpSphericalMaps
-
-namespace cv { namespace cuda { namespace device
-{
-    namespace imgproc
-    {
-        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
-                                    const float k_rinv[9], const float r_kinv[9], float scale,
-                                    cudaStream_t stream);
-    }
-}}}
-
-void cv::cuda::buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
-                                     OutputArray _map_x, OutputArray _map_y, Stream& stream)
-{
-    (void) src_size;
-
-    Mat K = _K.getMat();
-    Mat R = _R.getMat();
-
-    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
-    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
-
-    Mat K_Rinv = K * R.t();
-    Mat R_Kinv = R * K.inv();
-    CV_Assert( K_Rinv.isContinuous() );
-    CV_Assert( R_Kinv.isContinuous() );
-
-    _map_x.create(dst_roi.size(), CV_32FC1);
-    _map_y.create(dst_roi.size(), CV_32FC1);
-
-    GpuMat map_x = _map_x.getGpuMat();
-    GpuMat map_y = _map_y.getGpuMat();
-
-    device::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
-}
-
 ////////////////////////////////////////////////////////////////////////
 // rotate
 
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index c6223fb6bb..3d70172284 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -337,7 +337,7 @@ public:
           double _min_margin=0.003, int _edge_blur_size=5 );
 
     CV_WRAP virtual void detectRegions( InputArray image,
-                                        std::vector<std::vector<Point> >& msers,
+                                        CV_OUT std::vector<std::vector<Point> >& msers,
                                         std::vector<Rect>& bboxes ) = 0;
 
     CV_WRAP virtual void setDelta(int delta) = 0;
diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp
index 7988584030..fd15345b29 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -818,7 +818,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
     ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
     sumX = sumY = 0.f;
 
-    for (size_t k = 0; k < ang_size; ++k) {
+    for (int k = 0; k < ang_size; ++k) {
       // Get angle from the x-axis of the sample point
       const float & ang = Ang[k];
 
diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp
index 66e6846573..0878bff60b 100644
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -48,6 +48,11 @@
 #  pragma GCC diagnostic ignored "-Wmissing-declarations"
 #endif
 
+#if (_WIN32_IE < 0x0500)
+#pragma message("WARNING: Win32 UI needs to be compiled with _WIN32_IE >= 0x0500 (_WIN32_IE_IE50)")
+#define _WIN32_IE 0x0500
+#endif
+
 #include <commctrl.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index a22d3dca76..b0c942172c 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -90,6 +90,8 @@ enum { IMWRITE_PNG_STRATEGY_DEFAULT      = 0,
 
 /** @brief Loads an image from a file.
 
+@anchor imread
+
 @param filename Name of file to be loaded.
 @param flags Flags specifying the color type of a loaded image:
 -   CV_LOAD_IMAGE_ANYDEPTH - If set, return 16-bit/32-bit image when the input has the
diff --git a/modules/imgcodecs/src/grfmt_gdal.cpp b/modules/imgcodecs/src/grfmt_gdal.cpp
index f172f6f9aa..0311630950 100644
--- a/modules/imgcodecs/src/grfmt_gdal.cpp
+++ b/modules/imgcodecs/src/grfmt_gdal.cpp
@@ -38,10 +38,17 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include "grfmt_gdal.hpp"
+#include "precomp.hpp"
+
+// GDAL Macros
+#include "cvconfig.h"
 
 #ifdef HAVE_GDAL
 
+// Our Header
+#include "grfmt_gdal.hpp"
+
+
 /// C++ Standard Libraries
 #include <iostream>
 #include <stdexcept>
@@ -195,7 +202,10 @@ GdalDecoder::~GdalDecoder(){
 /**
  * Convert data range
 */
-double range_cast( const GDALDataType& gdalType, const int& cvDepth, const double& value ){
+double range_cast( const GDALDataType& gdalType,
+                   const int& cvDepth,
+                   const double& value )
+{
 
     // uint8 -> uint8
     if( gdalType == GDT_Byte && cvDepth == CV_8U ){
diff --git a/modules/imgcodecs/src/grfmt_gdal.hpp b/modules/imgcodecs/src/grfmt_gdal.hpp
index b2cd224467..73d39c9470 100644
--- a/modules/imgcodecs/src/grfmt_gdal.hpp
+++ b/modules/imgcodecs/src/grfmt_gdal.hpp
@@ -42,16 +42,15 @@
 #ifndef __GRFMT_GDAL_HPP__
 #define __GRFMT_GDAL_HPP__
 
+/// OpenCV FMT Base Type
+#include "grfmt_base.hpp"
+
 /// Macro to make sure we specified GDAL in CMake
 #ifdef HAVE_GDAL
 
 /// C++ Libraries
 #include <iostream>
 
-/// OpenCV Libraries
-#include "grfmt_base.hpp"
-#include "precomp.hpp"
-
 /// Geospatial Data Abstraction Library
 #include <gdal/cpl_conv.h>
 #include <gdal/gdal_priv.h>
@@ -61,6 +60,13 @@
 /// Start of CV Namespace
 namespace cv {
 
+/**
+ * Convert GDAL Pixel Range to OpenCV Pixel Range
+*/
+double range_cast( const GDALDataType& gdalType,
+                   const int& cvDepth,
+                   const double& value );
+
 /**
  * Convert GDAL Palette Interpretation to OpenCV Pixel Type
 */
diff --git a/modules/imgcodecs/test/test_grfmt.cpp b/modules/imgcodecs/test/test_grfmt.cpp
index d3f21f16b3..d1610ae7fc 100644
--- a/modules/imgcodecs/test/test_grfmt.cpp
+++ b/modules/imgcodecs/test/test_grfmt.cpp
@@ -664,7 +664,7 @@ private:
         vector<Mat> pages;
         bool res = imreadmulti(folder + "multipage.tif", pages, flags);
         ASSERT_TRUE(res == true);
-        ASSERT_TRUE(pages.size() == page_count);
+        ASSERT_EQ(static_cast<size_t>(page_count), pages.size());
 
         for (int i = 0; i < page_count; i++)
         {
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 28025c197d..fad801b947 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -3332,9 +3332,11 @@ data type.
 @param result Map of comparison results. It must be single-channel 32-bit floating-point. If image
 is \f$W \times H\f$ and templ is \f$w \times h\f$ , then result is \f$(W-w+1) \times (H-h+1)\f$ .
 @param method Parameter specifying the comparison method, see cv::TemplateMatchModes
+@param mask Mask of searched template. It must have the same data type and size as templ. It is
+not set by default.
  */
 CV_EXPORTS_W void matchTemplate( InputArray image, InputArray templ,
-                                 OutputArray result, int method );
+                                 OutputArray result, int method, InputArray mask = noArray() );
 
 //! @}
 
diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp
index b0a81ed32d..758ccb02bc 100644
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -193,7 +193,9 @@ cvStartFindContours( void* _img, CvMemStorage* storage,
 
     if( !((CV_IS_MASK_ARR( mat ) && mode < CV_RETR_FLOODFILL) ||
           (CV_MAT_TYPE(mat->type) == CV_32SC1 && mode == CV_RETR_FLOODFILL)) )
-        CV_Error( CV_StsUnsupportedFormat, "[Start]FindContours support only 8uC1 and 32sC1 images" );
+        CV_Error( CV_StsUnsupportedFormat,
+                  "[Start]FindContours supports only CV_8UC1 images when mode != CV_RETR_FLOODFILL "
+                  "otherwise supports CV_32SC1 images only" );
 
     CvSize size = cvSize( mat->width, mat->height );
     int step = mat->step;
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index f376507255..63a1005ae7 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -2231,9 +2231,8 @@ struct SymmRowSmallVec_8u32s
 
     int operator()(const uchar* src, uchar* _dst, int width, int cn) const
     {
-        //Uncomment the two following lines when runtime support for neon is implemented.
-        // if( !checkHardwareSupport(CV_CPU_NEON) )
-        //     return 0;
+         if( !checkHardwareSupport(CV_CPU_NEON) )
+             return 0;
 
         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
         int* dst = (int*)_dst;
@@ -2459,9 +2458,8 @@ struct SymmColumnVec_32s8u
 
     int operator()(const uchar** _src, uchar* dst, int width) const
     {
-        //Uncomment the two following lines when runtime support for neon is implemented.
-        // if( !checkHardwareSupport(CV_CPU_NEON) )
-        //     return 0;
+         if( !checkHardwareSupport(CV_CPU_NEON) )
+             return 0;
 
         int _ksize = kernel.rows + kernel.cols - 1;
         int ksize2 = _ksize / 2;
@@ -2612,9 +2610,8 @@ struct SymmColumnSmallVec_32s16s
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        //Uncomment the two following lines when runtime support for neon is implemented.
-        // if( !checkHardwareSupport(CV_CPU_NEON) )
-        //     return 0;
+         if( !checkHardwareSupport(CV_CPU_NEON) )
+             return 0;
 
         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
         const float* ky = kernel.ptr<float>() + ksize2;
@@ -2788,15 +2785,13 @@ struct SymmColumnVec_32f16s
         kernel = _kernel;
         delta = (float)_delta;
         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-        //Uncomment the following line when runtime support for neon is implemented.
-        // neon_supported = checkHardwareSupport(CV_CPU_NEON);
+         neon_supported = checkHardwareSupport(CV_CPU_NEON);
     }
 
     int operator()(const uchar** _src, uchar* _dst, int width) const
     {
-        //Uncomment the two following lines when runtime support for neon is implemented.
-        // if( !neon_supported )
-        //     return 0;
+         if( !neon_supported )
+             return 0;
 
         int _ksize = kernel.rows + kernel.cols - 1;
         int ksize2 = _ksize / 2;
@@ -2943,9 +2938,8 @@ struct SymmRowSmallVec_32f
 
     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
     {
-        //Uncomment the two following lines when runtime support for neon is implemented.
-        // if( !checkHardwareSupport(CV_CPU_NEON) )
-        //     return 0;
+         if( !checkHardwareSupport(CV_CPU_NEON) )
+             return 0;
 
         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
         float* dst = (float*)_dst;
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 5ab70d9a26..2a69003641 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -1497,7 +1497,9 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
     }
 
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(sigma1 == 0 && sigma2 == 0 && tegra::gaussian(_src.getMat(), _dst.getMat(), ksize, borderType))
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+    if(sigma1 == 0 && sigma2 == 0 && tegra::gaussian(src, dst, ksize, borderType))
         return;
 #endif
 
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 416917a2fb..8afdba7d10 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -814,12 +814,114 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
         }
     }
 }
+
+// Masked template matching: slides templ over img and writes one CV_32F score per
+// placement into _result, weighting the template by the per-pixel values of _mask.
+//   _img, _templ - 8-bit or 32-bit float arrays of identical type; 8-bit data is
+//                  rescaled by 1/255 into 32-bit float before matching.
+//   _result      - allocated here as CV_32F, (img.cols - templ.cols + 1) columns by
+//                  (img.rows - templ.rows + 1) rows.
+//   method       - only CV_TM_SQDIFF and CV_TM_CCORR_NORMED are implemented; any other
+//                  accepted method raises Error::StsNotImplemented.
+//   _mask        - multiplied element-wise with templ; an 8-bit mask is binarized
+//                  (non-zero -> 1.0f), masks of other depths are used unchanged.
+static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )
+{
+    int type = _img.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
+    CV_Assert( (depth == CV_8U || depth == CV_32F) && type == _templ.type() && _img.dims() <= 2 );
+
+    Mat img = _img.getMat(), templ = _templ.getMat(), mask = _mask.getMat();
+    int ttype = templ.type(), tdepth = CV_MAT_DEPTH(ttype), tcn = CV_MAT_CN(ttype);
+    // The mask's own type decides whether it must be converted; deriving it from the
+    // image type left an 8-bit mask unconverted whenever the image was 32-bit float.
+    int mtype = mask.type(), mdepth = CV_MAT_DEPTH(mtype), mcn = CV_MAT_CN(mtype);
+
+    if (depth == CV_8U)
+    {
+        type = CV_MAKETYPE(CV_32F, cn);
+        img.convertTo(img, type, 1.0 / 255);
+    }
+
+    if (tdepth == CV_8U)
+    {
+        ttype = CV_MAKETYPE(CV_32F, tcn);
+        templ.convertTo(templ, ttype, 1.0 / 255);
+    }
+
+    if (mdepth == CV_8U)
+    {
+        mtype = CV_MAKETYPE(CV_32F, mcn);
+        // Binarize into a temporary: an in-place compare() would write 0/255 into the
+        // caller's mask buffer, which this function only receives as an input.
+        Mat maskNonZero;
+        compare(mask, Scalar::all(0), maskNonZero, CMP_NE);
+        maskNonZero.convertTo(mask, mtype, 1.0 / 255);
+    }
+
+    Size corrSize(img.cols - templ.cols + 1, img.rows - templ.rows + 1);
+    _result.create(corrSize, CV_32F);
+    Mat result = _result.getMat();
+
+    Mat img2 = img.mul(img);
+    Mat mask2 = mask.mul(mask);
+    Mat mask_templ = templ.mul(mask);
+    Scalar templMean, templSdv;
+
+    // Sum of squares of the masked template over all channels, recovered from the
+    // per-channel statistics: sum(x^2) = N * (stddev^2 + mean^2).
+    double templSum2 = 0;
+    meanStdDev( mask_templ, templMean, templSdv );
+
+    templSum2 = templSdv[0]*templSdv[0] + templSdv[1]*templSdv[1] + templSdv[2]*templSdv[2] + templSdv[3]*templSdv[3];
+    templSum2 += templMean[0]*templMean[0] + templMean[1]*templMean[1] + templMean[2]*templMean[2] + templMean[3]*templMean[3];
+    templSum2 *= ((double)templ.rows * templ.cols);
+
+    if (method == CV_TM_SQDIFF)
+    {
+        // result = corr(img^2, mask) - 2 * corr(img, templ * mask^2) + templSum2
+        Mat mask2_templ = templ.mul(mask2);
+
+        Mat corr(corrSize, CV_32F);
+        crossCorr( img, mask2_templ, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
+        crossCorr( img2, mask, result, result.size(), result.type(), Point(0,0), 0, 0 );
+
+        result -= corr * 2;
+        result += templSum2;
+    }
+    else if (method == CV_TM_CCORR_NORMED)
+    {
+        // A masked template without energy would make the normalization divide by zero.
+        if (templSum2 < DBL_EPSILON)
+        {
+            result = Scalar::all(1);
+            return;
+        }
+
+        // result = corr(img, templ * mask) / (sqrt(corr(img^2, mask^2)) * sqrt(templSum2))
+        Mat corr(corrSize, CV_32F);
+        crossCorr( img2, mask2, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
+        crossCorr( img, mask_templ, result, result.size(), result.type(), Point(0,0), 0, 0 );
+
+        sqrt(corr, corr);
+        result = result.mul(1/corr);
+        result /= std::sqrt(templSum2);
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "");
+}
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method )
+void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )
 {
+    if (!_mask.empty())
+    {
+        cv::matchTemplateMask(_img, _templ, _result, method, _mask);
+        return;
+    }
+
     int type = _img.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
     CV_Assert( (depth == CV_8U || depth == CV_32F) && type == _templ.type() && _img.dims() <= 2 );
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 4e25a5ccf1..841cfa2725 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -931,7 +931,7 @@ Ptr<CascadeClassifierImpl::MaskGenerator> CascadeClassifierImpl::getMaskGenerato
 Ptr<BaseCascadeClassifier::MaskGenerator> createFaceDetectionMaskGenerator()
 {
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    return tegra::getCascadeClassifierMaskGenerator(*this);
+    return tegra::getCascadeClassifierMaskGenerator();
 #else
     return Ptr<BaseCascadeClassifier::MaskGenerator>();
 #endif
@@ -1072,10 +1072,10 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
         {
             String opts;
             if (lbufSize.area())
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D HAAR",
                               localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
             else
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D HAAR",
                               localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
             haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
             if( haarKernel.empty() )
@@ -1112,10 +1112,10 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
         {
             String opts;
             if (lbufSize.area())
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D LBP",
                               localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
             else
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D LBP",
                               localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
             lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
             if( lbpKernel.empty() )
diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp
index 17eeccd53b..4cbf3e9bf0 100644
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "opencv2/core/ocl.hpp"
+
 namespace cv
 {
 
diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl
index dfebc28dd3..13cb1aa389 100644
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -12,19 +12,22 @@
 //    Erping Pang, erping@multicorewareinc.com
 //
 
-
+#ifdef HAAR
 typedef struct __attribute__((aligned(4))) OptHaarFeature
 {
     int4 ofs[3] __attribute__((aligned (4)));
     float4 weight __attribute__((aligned (4)));
 }
 OptHaarFeature;
+#endif
 
+#ifdef LBP
 typedef struct __attribute__((aligned(4))) OptLBPFeature
 {
     int16 ofs __attribute__((aligned (4)));
 }
 OptLBPFeature;
+#endif
 
 typedef struct __attribute__((aligned(4))) Stump
 {
@@ -64,6 +67,7 @@ ScaleData;
 #define NODE_COUNT 1
 #endif
 
+#ifdef HAAR
 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
 void runHaarClassifier(
     int nscales, __global const ScaleData* scaleData,
@@ -352,7 +356,9 @@ void runHaarClassifier(
         }
     }
 }
+#endif
 
+#ifdef LBP
 #undef CALC_SUM_OFS_
 #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
     ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
@@ -651,3 +657,4 @@ void runLBPClassifierStump(
         }
     }
 }
+#endif
diff --git a/modules/photo/include/opencv2/photo/cuda.hpp b/modules/photo/include/opencv2/photo/cuda.hpp
index 4b69afa7be..a5c83f7717 100644
--- a/modules/photo/include/opencv2/photo/cuda.hpp
+++ b/modules/photo/include/opencv2/photo/cuda.hpp
@@ -59,69 +59,71 @@ namespace cv { namespace cuda {
 @param block_size Size of block used for computing weights.
 @param borderMode Border type. See borderInterpolate for details. BORDER_REFLECT101 ,
 BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supported for now.
-@param s Stream for the asynchronous version.
+@param stream Stream for the asynchronous version.
 
 @sa
    fastNlMeansDenoising
  */
-CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null());
+CV_EXPORTS void nonLocalMeans(InputArray src, OutputArray dst,
+                              float h,
+                              int search_window = 21,
+                              int block_size = 7,
+                              int borderMode = BORDER_DEFAULT,
+                              Stream& stream = Stream::Null());
 
-/** @brief The class implements fast approximate Non Local Means Denoising algorithm.
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
+optimizations. Noise expected to be a gaussian white noise
+
+@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
+removes image details, smaller h value preserves details but also preserves some noise
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater search_window - greater
+denoising time. Recommended value 21 pixels
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value 7 pixels
+@param stream Stream for the asynchronous invocations.
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored.
+
+@sa
+   fastNlMeansDenoising
  */
-class CV_EXPORTS FastNonLocalMeansDenoising
-{
-public:
-    /** @brief Perform image denoising using Non-local Means Denoising algorithm
-    <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
-    optimizations. Noise expected to be a gaussian white noise
+CV_EXPORTS void fastNlMeansDenoising(InputArray src, OutputArray dst,
+                                     float h,
+                                     int search_window = 21,
+                                     int block_size = 7,
+                                     Stream& stream = Stream::Null());
 
-    @param src Input 8-bit 1-channel, 2-channel or 3-channel image.
-    @param dst Output image with the same size and type as src .
-    @param h Parameter regulating filter strength. Big h value perfectly removes noise but also
-    removes image details, smaller h value preserves details but also preserves some noise
-    @param search_window Size in pixels of the window that is used to compute weighted average for
-    given pixel. Should be odd. Affect performance linearly: greater search_window - greater
-    denoising time. Recommended value 21 pixels
-    @param block_size Size in pixels of the template patch that is used to compute weights. Should be
-    odd. Recommended value 7 pixels
-    @param s Stream for the asynchronous invocations.
+/** @brief Modification of fastNlMeansDenoising function for colored images
 
-    This function expected to be applied to grayscale images. For colored images look at
-    FastNonLocalMeansDenoising::labMethod.
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h_luminance Parameter regulating filter strength. Big h value perfectly removes noise but
+also removes image details, smaller h value preserves details but also preserves some noise
+@param photo_render The same as h_luminance but for color components. For most images a value of 10 will be
+enough to remove colored noise without distorting colors
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater search_window - greater
+denoising time. Recommended value 21 pixels
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value 7 pixels
+@param stream Stream for the asynchronous invocations.
 
-    @sa
-       fastNlMeansDenoising
-     */
-    void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
+The function converts the image to CIELAB colorspace and then separately denoises the L and AB components
+with the given h parameters using the fastNlMeansDenoising function.
 
-    /** @brief Modification of FastNonLocalMeansDenoising::simpleMethod for color images
-
-    @param src Input 8-bit 3-channel image.
-    @param dst Output image with the same size and type as src .
-    @param h_luminance Parameter regulating filter strength. Big h value perfectly removes noise but
-    also removes image details, smaller h value preserves details but also preserves some noise
-    @param photo_render float The same as h but for color components. For most images value equals 10 will be
-    enought to remove colored noise and do not distort colors
-    @param search_window Size in pixels of the window that is used to compute weighted average for
-    given pixel. Should be odd. Affect performance linearly: greater search_window - greater
-    denoising time. Recommended value 21 pixels
-    @param block_size Size in pixels of the template patch that is used to compute weights. Should be
-    odd. Recommended value 7 pixels
-    @param s Stream for the asynchronous invocations.
-
-    The function converts image to CIELAB colorspace and then separately denoise L and AB components
-    with given h parameters using FastNonLocalMeansDenoising::simpleMethod function.
-
-    @sa
-       fastNlMeansDenoisingColored
-     */
-    void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float photo_render, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
-
-private:
-
-    GpuMat buffer, extended_src_buffer;
-    GpuMat lab, l, ab;
-};
+@sa
+   fastNlMeansDenoisingColored
+ */
+CV_EXPORTS void fastNlMeansDenoisingColored(InputArray src, OutputArray dst,
+                                            float h_luminance, float photo_render,
+                                            int search_window = 21,
+                                            int block_size = 7,
+                                            Stream& stream = Stream::Null());
 
 //! @} photo
 
diff --git a/modules/photo/perf/perf_cuda.cpp b/modules/photo/perf/perf_cuda.cpp
index 318ec17dfd..4496599d1b 100644
--- a/modules/photo/perf/perf_cuda.cpp
+++ b/modules/photo/perf/perf_cuda.cpp
@@ -126,12 +126,10 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, CUDA_FastNonLocalMeans,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::FastNonLocalMeansDenoising fnlmd;
-
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
 
-        TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
+        TEST_CYCLE() cv::cuda::fastNlMeansDenoising(d_src, dst, h, search_widow_size, block_size);
 
         CUDA_SANITY_CHECK(dst);
     }
@@ -171,12 +169,10 @@ PERF_TEST_P(Sz_Depth_WinSz_BlockSz, CUDA_FastNonLocalMeansColored,
 
     if (PERF_RUN_CUDA())
     {
-        cv::cuda::FastNonLocalMeansDenoising fnlmd;
-
         const cv::cuda::GpuMat d_src(src);
         cv::cuda::GpuMat dst;
 
-        TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
+        TEST_CYCLE() cv::cuda::fastNlMeansDenoisingColored(d_src, dst, h, h, search_widow_size, block_size);
 
         CUDA_SANITY_CHECK(dst);
     }
diff --git a/modules/photo/src/denoising.cuda.cpp b/modules/photo/src/denoising.cuda.cpp
index 76b870fe58..7ea37f6951 100644
--- a/modules/photo/src/denoising.cuda.cpp
+++ b/modules/photo/src/denoising.cuda.cpp
@@ -60,9 +60,9 @@ using namespace cv::cuda;
 
 #if !defined (HAVE_CUDA) || !defined(HAVE_OPENCV_CUDAARITHM) || !defined(HAVE_OPENCV_CUDAIMGPROC)
 
-void cv::cuda::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
-void cv::cuda::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
-void cv::cuda::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::nonLocalMeans(InputArray, OutputArray, float, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::fastNlMeansDenoising(InputArray, OutputArray, float, int, int, Stream&) { throw_no_cuda(); }
+void cv::cuda::fastNlMeansDenoisingColored(InputArray, OutputArray, float, float, int, int, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -78,13 +78,15 @@ namespace cv { namespace cuda { namespace device
     }
 }}}
 
-void cv::cuda::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
+void cv::cuda::nonLocalMeans(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, int borderMode, Stream& stream)
 {
     using cv::cuda::device::imgproc::nlm_bruteforce_gpu;
     typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
 
     static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
 
+    const GpuMat src = _src.getGpuMat();
+
     CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
 
     const func_t func = funcs[src.channels() - 1];
@@ -93,8 +95,10 @@ void cv::cuda::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search
     int b = borderMode;
     CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
 
-    dst.create(src.size(), src.type());
-    func(src, dst, search_window/2, block_window/2, h, borderMode, StreamAccessor::getStream(s));
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    func(src, dst, search_window/2, block_window/2, h, borderMode, StreamAccessor::getStream(stream));
 }
 
 namespace cv { namespace cuda { namespace device
@@ -112,47 +116,55 @@ namespace cv { namespace cuda { namespace device
      }
 }}}
 
-void cv::cuda::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
+void cv::cuda::fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, Stream& stream)
 {
+    const GpuMat src = _src.getGpuMat();
+
     CV_Assert(src.depth() == CV_8U && src.channels() < 4);
 
     int border_size = search_window/2 + block_window/2;
     Size esize = src.size() + Size(border_size, border_size) * 2;
 
-    cv::cuda::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
-    GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
+    BufferPool pool(stream);
 
-    cv::cuda::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
+    GpuMat extended_src = pool.getBuffer(esize, src.type());
+    cv::cuda::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
     GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
 
     int bcols, brows;
     device::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
-    buffer.create(brows, bcols, CV_32S);
+    GpuMat buffer = pool.getBuffer(brows, bcols, CV_32S);
 
     using namespace cv::cuda::device::imgproc;
     typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
     static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
 
-    dst.create(src.size(), src.type());
-    funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(stream));
 }
 
-void cv::cuda::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
+void cv::cuda::fastNlMeansDenoisingColored(InputArray _src, OutputArray _dst, float h_luminance, float h_color, int search_window, int block_window, Stream& stream)
 {
+    const GpuMat src = _src.getGpuMat();
+
     CV_Assert(src.type() == CV_8UC3);
 
-    lab.create(src.size(), src.type());
-    cv::cuda::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
+    BufferPool pool(stream);
 
-    l.create(src.size(), CV_8U);
-    ab.create(src.size(), CV_8UC2);
-    device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
+    GpuMat lab = pool.getBuffer(src.size(), src.type());
+    cv::cuda::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, stream);
 
-    simpleMethod(l, l, h_luminance, search_window, block_window, s);
-    simpleMethod(ab, ab, h_color, search_window, block_window, s);
+    GpuMat l = pool.getBuffer(src.size(), CV_8U);
+    GpuMat ab = pool.getBuffer(src.size(), CV_8UC2);
+    device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(stream));
 
-    device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
-    cv::cuda::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
+    fastNlMeansDenoising(l, l, h_luminance, search_window, block_window, stream);
+    fastNlMeansDenoising(ab, ab, h_color, search_window, block_window, stream);
+
+    device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(stream));
+    cv::cuda::cvtColor(lab, _dst, cv::COLOR_Lab2BGR, 0, stream);
 }
 
 #endif
diff --git a/modules/photo/test/test_denoising.cuda.cpp b/modules/photo/test/test_denoising.cuda.cpp
index dce20b9f51..209bac3328 100644
--- a/modules/photo/test/test_denoising.cuda.cpp
+++ b/modules/photo/test/test_denoising.cuda.cpp
@@ -99,10 +99,9 @@ TEST(CUDA_FastNonLocalMeans, Regression)
     cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);
 
     GpuMat dbgr, dgray;
-    cv::cuda::FastNonLocalMeansDenoising fnlmd;
 
-    fnlmd.simpleMethod(GpuMat(gray),  dgray, 20);
-    fnlmd.labMethod(GpuMat(bgr),  dbgr, 20, 10);
+    cv::cuda::fastNlMeansDenoising(GpuMat(gray),  dgray, 20);
+    cv::cuda::fastNlMeansDenoisingColored(GpuMat(bgr),  dbgr, 20, 10);
 
 #if 0
     dumpImage("../gpu/denoising/fnlm_denoised_lena_bgr.png", cv::Mat(dbgr));
diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index cf60ea9785..55a79484ce 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -861,7 +861,7 @@ class PythonWrapperGenerator(object):
             decls = self.parser.parse(hdr)
             if len(decls) == 0:
                 continue
-            self.code_include.write( '#include "{}"\n'.format(hdr[hdr.rindex('opencv2/'):]) )
+            self.code_include.write( '#include "{0}"\n'.format(hdr[hdr.rindex('opencv2/'):]) )
             for decl in decls:
                 name = decl[0]
                 if name.startswith("struct") or name.startswith("class"):
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 73db4a0310..36d4452c7f 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -1,3 +1,8 @@
 set(the_description "Images stitching")
+
+if(HAVE_CUDA)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow)
+endif()
+
 ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect
                   OPTIONAL opencv_cuda opencv_cudaarithm opencv_cudafilters opencv_cudafeatures2d opencv_xfeatures2d)
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
index 79f387cc2f..19dff8e1f0 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -398,7 +398,6 @@ public:
 };
 
 
-#ifdef HAVE_OPENCV_CUDAWARPING
 class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
 {
 public:
@@ -515,7 +514,6 @@ public:
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
 };
-#endif
 
 
 struct SphericalPortraitProjector : ProjectorBase
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index c36e6877bc..015ceb025f 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -476,7 +476,11 @@ static bool ocl_normalizeUsingWeightMap(InputArray _weight, InputOutputArray _ma
 
 void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src)
 {
+    Mat src;
+    Mat weight;
 #ifdef HAVE_TEGRA_OPTIMIZATION
+    src = _src.getMat();
+    weight = _weight.getMat();
     if(tegra::normalizeUsingWeightMap(weight, src))
         return;
 #endif
@@ -486,12 +490,12 @@ void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src)
             !ocl_normalizeUsingWeightMap(_weight, _src) )
 #endif
     {
-        Mat weight = _weight.getMat();
-        Mat src = _src.getMat();
+        src = _src.getMat();
+        weight = _weight.getMat();
 
         CV_Assert(src.type() == CV_16SC3);
 
-        if(weight.type() == CV_32FC1)
+        if (weight.type() == CV_32FC1)
         {
             for (int y = 0; y < src.rows; ++y)
             {
@@ -547,7 +551,8 @@ void createWeightMap(InputArray mask, float sharpness, InputOutputArray weight)
 void createLaplacePyr(InputArray img, int num_levels, std::vector<UMat> &pyr)
 {
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(tegra::createLaplacePyr(img, num_levels, pyr))
+    cv::Mat imgMat = img.getMat();
+    if(tegra::createLaplacePyr(imgMat, num_levels, pyr))
         return;
 #endif
 
diff --git a/modules/cudawarping/src/cuda/build_warp_maps.cu b/modules/stitching/src/cuda/build_warp_maps.cu
similarity index 100%
rename from modules/cudawarping/src/cuda/build_warp_maps.cu
rename to modules/stitching/src/cuda/build_warp_maps.cu
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 49ee0f4744..ee05268d78 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -154,7 +154,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
 
     matches_info.matches.clear();
 
-    Ptr<DescriptorMatcher> matcher;
+    Ptr<cv::DescriptorMatcher> matcher;
 #if 0 // TODO check this
     if (ocl::useOpenCL())
     {
@@ -220,13 +220,13 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     descriptors1_.upload(features1.descriptors);
     descriptors2_.upload(features2.descriptors);
 
-    BFMatcher_CUDA matcher(NORM_L2);
+    Ptr<cuda::DescriptorMatcher> matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2);
+
     MatchesSet matches;
 
     // Find 1->2 matches
     pair_matches.clear();
-    matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
-    matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
+    matcher->knnMatch(descriptors1_, descriptors2_, pair_matches, 2);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
         if (pair_matches[i].size() < 2)
@@ -242,8 +242,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
 
     // Find 2->1 matches
     pair_matches.clear();
-    matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
-    matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
+    matcher->knnMatch(descriptors2_, descriptors1_, pair_matches, 2);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
         if (pair_matches[i].size() < 2)
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 744474ba6e..4b6185f4e6 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -242,91 +242,6 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
     dst_br.y = static_cast<int>(br_vf);
 }
 
-
-#ifdef HAVE_OPENCV_CUDAWARPING
-Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
-{
-    return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
-}
-
-Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
-{
-    projector_.setCameraParams(K, R, T);
-
-    Point dst_tl, dst_br;
-    detectResultRoi(src_size, dst_tl, dst_br);
-
-    cuda::buildWarpPlaneMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
-                            K, R, T, projector_.scale, xmap, ymap);
-
-    return Rect(dst_tl, dst_br);
-}
-
-Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
-                           cuda::GpuMat & dst)
-{
-    return warp(src, K, R, Mat::zeros(3, 1, CV_32F), interp_mode, border_mode, dst);
-}
-
-
-Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
-                           cuda::GpuMat & dst)
-{
-    Rect dst_roi = buildMaps(src.size(), K, R, T, d_xmap_, d_ymap_);
-    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
-    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
-    return dst_roi.tl();
-}
-
-
-Rect SphericalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
-{
-    projector_.setCameraParams(K, R);
-
-    Point dst_tl, dst_br;
-    detectResultRoi(src_size, dst_tl, dst_br);
-
-    cuda::buildWarpSphericalMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
-                                K, R, projector_.scale, xmap, ymap);
-
-    return Rect(dst_tl, dst_br);
-}
-
-
-Point SphericalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
-                               cuda::GpuMat & dst)
-{
-    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
-    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
-    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
-    return dst_roi.tl();
-}
-
-
-Rect CylindricalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
-{
-    projector_.setCameraParams(K, R);
-
-    Point dst_tl, dst_br;
-    detectResultRoi(src_size, dst_tl, dst_br);
-
-    cuda::buildWarpCylindricalMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
-                                  K, R, projector_.scale, xmap, ymap);
-
-    return Rect(dst_tl, dst_br);
-}
-
-
-Point CylindricalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
-                                 cuda::GpuMat & dst)
-{
-    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
-    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
-    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
-    return dst_roi.tl();
-}
-#endif
-
 void SphericalPortraitWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br)
 {
     detectResultRoiByBorder(src_size, dst_tl, dst_br);
diff --git a/modules/stitching/src/warpers_cuda.cpp b/modules/stitching/src/warpers_cuda.cpp
new file mode 100644
index 0000000000..d1fe8739b4
--- /dev/null
+++ b/modules/stitching/src/warpers_cuda.cpp
@@ -0,0 +1,298 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#ifdef HAVE_CUDA
+
+namespace cv { namespace cuda { namespace device
+{   // Forward declarations only; the definitions are not part of this file.
+    namespace imgproc
+    {   // Callers in this file pass dst_roi.tl().x / .y as tl_u / tl_v and 3x3 float matrices as k_rinv / r_kinv.
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
+                                cudaStream_t stream);
+        // Same parameter list as buildWarpPlaneMaps minus the translation vector t.
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                    const float k_rinv[9], const float r_kinv[9], float scale,
+                                    cudaStream_t stream);
+        // Same parameter list as buildWarpSphericalMaps.
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                      const float k_rinv[9], const float r_kinv[9], float scale,
+                                      cudaStream_t stream);
+    }
+}}}
+
+static void buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, InputArray _T,
+                               float scale, OutputArray _map_x, OutputArray _map_y, Stream& stream = Stream::Null())
+{   // Validates K, R and T, allocates both maps as CV_32FC1 of dst_roi's size and calls the device-side plane builder.
+    (void) src_size; // not used by this implementation
+    // Host-side views of the camera matrices and the translation vector.
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+    Mat T = _T.getMat();
+    // The device function takes raw float[9] / float[3] arrays, hence the strict size, type and continuity checks.
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+    CV_Assert( (T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32FC1 && T.isContinuous() );
+    // NOTE(review): R.t() stands in for R's inverse, so R is presumably a rotation matrix — confirm with callers.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+    // One float per destination pixel in each output map.
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+    // Hand dst_roi's top-left corner, the maps, the matrix data and the raw CUDA stream handle to the device-side builder.
+    device::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
+                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+static void buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
+                                   OutputArray _map_x, OutputArray _map_y, Stream& stream = Stream::Null())
+{   // Validates K and R, allocates both maps as CV_32FC1 of dst_roi's size and calls the device-side spherical builder.
+    (void) src_size; // not used by this implementation
+    // Host-side views of the camera matrices.
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+    // The device function takes raw float[9] arrays, hence the strict size and type checks.
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+    // NOTE(review): R.t() stands in for R's inverse, so R is presumably a rotation matrix — confirm with callers.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+    // One float per destination pixel in each output map.
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+    // Hand dst_roi's top-left corner, the maps, the matrix data and the raw CUDA stream handle to the device-side builder.
+    device::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+static void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
+                                     OutputArray _map_x, OutputArray _map_y, Stream& stream = Stream::Null())
+{   // Validates K and R, allocates both maps as CV_32FC1 of dst_roi's size and calls the device-side cylindrical builder.
+    (void) src_size; // not used by this implementation
+    // Host-side views of the camera matrices.
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+    // The device function takes raw float[9] arrays, hence the strict size and type checks.
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+    // NOTE(review): R.t() stands in for R's inverse, so R is presumably a rotation matrix — confirm with callers.
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+    // One float per destination pixel in each output map.
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+    // Hand dst_roi's top-left corner, the maps, the matrix data and the raw CUDA stream handle to the device-side builder.
+    device::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+#endif
+
+Rect cv::detail::PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R,
+                                           cuda::GpuMat & xmap, cuda::GpuMat & ymap)
+{   // Convenience overload without translation: forwards to the T-taking overload with a zero 3x1 CV_32F vector.
+    return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
+}
+
+Rect cv::detail::PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T,
+                                           cuda::GpuMat & xmap, cuda::GpuMat & ymap)
+{   // Determines the destination ROI for src_size and fills xmap/ymap for it; without HAVE_CUDA it only calls throw_no_cuda().
+#ifndef HAVE_CUDA
+    (void)src_size; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)T;
+    (void)xmap;
+    (void)ymap;
+    throw_no_cuda();
+    return Rect(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    projector_.setCameraParams(K, R, T);
+    // detectResultRoi receives dst_tl / dst_br and fills in the destination corners.
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+    // The maps are built for a rectangle that extends one pixel past dst_br in x and y.
+    ::buildWarpPlaneMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
+                         K, R, T, projector_.scale, xmap, ymap);
+    // The returned Rect is therefore one pixel smaller in each dimension than the maps built above.
+    return Rect(dst_tl, dst_br);
+#endif
+}
+
+Point cv::detail::PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R,
+                                       int interp_mode, int border_mode,
+                                       cuda::GpuMat & dst)
+{   // Convenience overload without translation: forwards to the T-taking overload with a zero 3x1 CV_32F vector.
+    return warp(src, K, R, Mat::zeros(3, 1, CV_32F), interp_mode, border_mode, dst);
+}
+
+
+Point cv::detail::PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T,
+                                       int interp_mode, int border_mode,
+                                       cuda::GpuMat & dst)
+{   // Builds the maps for src into d_xmap_/d_ymap_, remaps src into dst and returns the destination ROI's top-left corner.
+#ifndef HAVE_OPENCV_CUDAWARPING // NOTE(review): presumably required because cuda::remap lives in the cudawarping module — confirm
+    (void)src; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)T;
+    (void)interp_mode;
+    (void)border_mode;
+    (void)dst;
+    throw_no_cuda();
+    return Point(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    Rect dst_roi = buildMaps(src.size(), K, R, T, d_xmap_, d_ymap_);
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); // +1: buildMaps sizes the maps one pixel larger than the Rect it returns
+    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
+    return dst_roi.tl();
+#endif
+}
+
+Rect cv::detail::SphericalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
+{   // Determines the destination ROI for src_size and fills xmap/ymap for it; without HAVE_CUDA it only calls throw_no_cuda().
+#ifndef HAVE_CUDA
+    (void)src_size; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)xmap;
+    (void)ymap;
+    throw_no_cuda();
+    return Rect(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    projector_.setCameraParams(K, R);
+    // detectResultRoi receives dst_tl / dst_br and fills in the destination corners.
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+    // The maps are built for a rectangle that extends one pixel past dst_br in x and y.
+    ::buildWarpSphericalMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
+                             K, R, projector_.scale, xmap, ymap);
+    // The returned Rect is therefore one pixel smaller in each dimension than the maps built above.
+    return Rect(dst_tl, dst_br);
+#endif
+}
+
+Point cv::detail::SphericalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R,
+                                           int interp_mode, int border_mode,
+                                           cuda::GpuMat & dst)
+{   // Builds the maps for src into d_xmap_/d_ymap_, remaps src into dst and returns the destination ROI's top-left corner.
+#ifndef HAVE_OPENCV_CUDAWARPING // NOTE(review): presumably required because cuda::remap lives in the cudawarping module — confirm
+    (void)src; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)interp_mode;
+    (void)border_mode;
+    (void)dst;
+    throw_no_cuda();
+    return Point(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); // +1: buildMaps sizes the maps one pixel larger than the Rect it returns
+    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
+    return dst_roi.tl();
+#endif
+}
+
+
+Rect cv::detail::CylindricalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R,
+                                                 cuda::GpuMat & xmap, cuda::GpuMat & ymap)
+{   // Determines the destination ROI for src_size and fills xmap/ymap for it; without HAVE_CUDA it only calls throw_no_cuda().
+#ifndef HAVE_CUDA
+    (void)src_size; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)xmap;
+    (void)ymap;
+    throw_no_cuda();
+    return Rect(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    projector_.setCameraParams(K, R);
+    // detectResultRoi receives dst_tl / dst_br and fills in the destination corners.
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+    // The maps are built for a rectangle that extends one pixel past dst_br in x and y.
+    ::buildWarpCylindricalMaps(src_size, Rect(dst_tl, Point(dst_br.x + 1, dst_br.y + 1)),
+                               K, R, projector_.scale, xmap, ymap);
+    // The returned Rect is therefore one pixel smaller in each dimension than the maps built above.
+    return Rect(dst_tl, dst_br);
+#endif
+}
+
+Point cv::detail::CylindricalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R,
+                                             int interp_mode, int border_mode,
+                                             cuda::GpuMat & dst)
+{   // Builds the maps for src into d_xmap_/d_ymap_, remaps src into dst and returns the destination ROI's top-left corner.
+#ifndef HAVE_OPENCV_CUDAWARPING // NOTE(review): presumably required because cuda::remap lives in the cudawarping module — confirm
+    (void)src; // the casts below mark the parameters as intentionally unused in this stub
+    (void)K;
+    (void)R;
+    (void)interp_mode;
+    (void)border_mode;
+    (void)dst;
+    throw_no_cuda();
+    return Point(); // NOTE(review): presumably unreachable if throw_no_cuda() throws — confirm
+#else
+    Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); // +1: buildMaps sizes the maps one pixel larger than the Rect it returns
+    cuda::remap(src, dst, d_xmap_, d_ymap_, interp_mode, border_mode);
+    return dst_roi.tl();
+#endif
+}
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 7745c86c5c..03877c0910 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -3020,7 +3020,7 @@ void printVersionInfo(bool useStdOut)
     if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
 #endif
 #if CV_NEON
-    cpu_features += " neon"; // NEON is currently not checked at runtime
+    if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
 #endif
 
     cpu_features.erase(0, 1); // erase initial space
diff --git a/modules/videoio/src/cap_dshow.cpp b/modules/videoio/src/cap_dshow.cpp
index 82e74878be..013d08e54a 100644
--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
@@ -134,8 +134,6 @@ public:
 
     virtual HRESULT STDMETHODCALLTYPE Clone(
         /* [out] */ IEnumPIDMap **ppIEnumPIDMap) = 0;
-
-    virtual ~IEnumPIDMap() {}
 };
 
 interface IMPEG2PIDMap : public IUnknown
@@ -151,8 +149,6 @@ interface IMPEG2PIDMap : public IUnknown
 
     virtual HRESULT STDMETHODCALLTYPE EnumPIDMap(
         /* [out] */ IEnumPIDMap **pIEnumPIDMap) = 0;
-
-    virtual ~IMPEG2PIDMap() {}
 };
 
 #endif
@@ -238,8 +234,6 @@ interface ISampleGrabberCB : public IUnknown
         double SampleTime,
         BYTE *pBuffer,
         LONG BufferLen) = 0;
-
-    virtual ~ISampleGrabberCB() {}
 };
 
 interface ISampleGrabber : public IUnknown
@@ -266,8 +260,6 @@ interface ISampleGrabber : public IUnknown
     virtual HRESULT STDMETHODCALLTYPE SetCallback(
         ISampleGrabberCB *pCallback,
         LONG WhichMethodToCallback) = 0;
-
-    virtual ~ISampleGrabber() {}
 };
 
 #ifndef HEADER
diff --git a/modules/videoio/src/cap_images.cpp b/modules/videoio/src/cap_images.cpp
index a92211ca36..253261adc1 100644
--- a/modules/videoio/src/cap_images.cpp
+++ b/modules/videoio/src/cap_images.cpp
@@ -135,6 +135,8 @@ double CvCapture_Images::getProperty(int id) const
         return 0;
     case CV_CAP_PROP_POS_FRAMES:
         return currentframe;
+    case CV_CAP_PROP_FRAME_COUNT:
+        return length;
     case CV_CAP_PROP_POS_AVI_RATIO:
         return (double)currentframe / (double)(length - 1);
     case CV_CAP_PROP_FRAME_WIDTH:
diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake
index 2c5b7406d8..448dfa6b1c 100644
--- a/platforms/linux/arm-gnueabi.toolchain.cmake
+++ b/platforms/linux/arm-gnueabi.toolchain.cmake
@@ -5,13 +5,12 @@ set(CMAKE_SYSTEM_PROCESSOR arm)
 set(GCC_COMPILER_VERSION "4.6" CACHE STRING "GCC Compiler version")
 
 set(FLOAT_ABI_SUFFIX "")
-
 if (NOT SOFTFP)
   set(FLOAT_ABI_SUFFIX "hf")
 endif()
 
-set(CMAKE_C_COMPILER    arm-linux-gnueabi${FLOAT_ABI_SUFFIX}-gcc-${GCC_COMPILER_VERSION})
-set(CMAKE_CXX_COMPILER  arm-linux-gnueabi${FLOAT_ABI_SUFFIX}-g++-${GCC_COMPILER_VERSION})
+find_program(CMAKE_C_COMPILER NAMES arm-linux-gnueabi${FLOAT_ABI_SUFFIX}-gcc-${GCC_COMPILER_VERSION})
+find_program(CMAKE_CXX_COMPILER NAMES arm-linux-gnueabi${FLOAT_ABI_SUFFIX}-g++-${GCC_COMPILER_VERSION})
 set(ARM_LINUX_SYSROOT /usr/arm-linux-gnueabi${FLOAT_ABI_SUFFIX} CACHE PATH "ARM cross compilation system root")
 
 set(CMAKE_CXX_FLAGS           ""                    CACHE STRING "c++ flags")
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index e1021de513..467ca162a7 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -22,6 +22,10 @@ if(ANDROID AND BUILD_ANDROID_EXAMPLES)
   add_subdirectory(android)
 endif()
 
+if(INSTALL_PYTHON_EXAMPLES)
+  add_subdirectory(python2)
+endif()
+
 #
 # END OF BUILD CASE 1: Build samples with library sources
 #
diff --git a/samples/cpp/grabcut.cpp b/samples/cpp/grabcut.cpp
index 110e0ff770..7ab28f66b0 100644
--- a/samples/cpp/grabcut.cpp
+++ b/samples/cpp/grabcut.cpp
@@ -22,10 +22,10 @@ static void help()
         "\tleft mouse button - set rectangle\n"
         "\n"
         "\tCTRL+left mouse button - set GC_BGD pixels\n"
-        "\tSHIFT+left mouse button - set CG_FGD pixels\n"
+        "\tSHIFT+left mouse button - set GC_FGD pixels\n"
         "\n"
         "\tCTRL+right mouse button - set GC_PR_BGD pixels\n"
-        "\tSHIFT+right mouse button - set CG_PR_FGD pixels\n" << endl;
+        "\tSHIFT+right mouse button - set GC_PR_FGD pixels\n" << endl;
 }
 
 const Scalar RED = Scalar(0,0,255);
diff --git a/samples/cpp/mask_tmpl.cpp b/samples/cpp/mask_tmpl.cpp
new file mode 100644
index 0000000000..2b6bb77bf1
--- /dev/null
+++ b/samples/cpp/mask_tmpl.cpp
@@ -0,0 +1,72 @@
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+
+#include <cctype>
+#include <iostream>
+#include <iterator>
+#include <stdio.h>
+
+using namespace std;
+using namespace cv;
+
+static void help()
+{   // Prints a short description and the usage line to stdout; the listed defaults mirror the fallback paths used by main().
+    cout << "\nThis program demonstrates template match with mask.\n"
+            "Usage:\n"
+            "./mask_tmpl <image_name> <template_name> <mask_name>, Defaults are ../data/lena_tmpl.jpg ../data/tmpl.png ../data/mask.png\n"
+            << endl;
+}
+
+int main( int argc, const char** argv )
+{   // Command-line paths are used only when exactly three are given (argc == 4); otherwise all three defaults apply.
+    const char* filename = argc == 4 ? argv[1] : "../data/lena_tmpl.jpg";
+    const char* tmplname = argc == 4 ? argv[2] : "../data/tmpl.png";
+    const char* maskname = argc == 4 ? argv[3] : "../data/mask.png";
+    // Load the image, the template and the mask; an empty Mat is treated as a load failure below.
+    Mat img = imread(filename);
+    Mat tmpl = imread(tmplname);
+    Mat mask = imread(maskname);
+    Mat res;
+
+    if(img.empty())
+    {
+        help();
+        cout << "can not open " << filename << endl;
+        return -1;
+    }
+
+    if(tmpl.empty())
+    {
+        help();
+        cout << "can not open " << tmplname << endl;
+        return -1;
+    }
+
+    if(mask.empty())
+    {
+        help();
+        cout << "can not open " << maskname << endl;
+        return -1;
+    }
+
+    //int method = CV_TM_SQDIFF;  // alternative method; the best match is then the minimum of res (handled below)
+    int method = CV_TM_CCORR_NORMED;
+    matchTemplate(img, tmpl, res, method, mask);
+    // Find the extreme values of the score map and where they occur.
+    double minVal, maxVal;
+    Point minLoc, maxLoc;
+    Rect rect;
+    minMaxLoc(res, &minVal, &maxVal, &minLoc, &maxLoc);
+    // Squared-difference methods take the minimum as the match location; every other method takes the maximum.
+    if(method == CV_TM_SQDIFF || method == CV_TM_SQDIFF_NORMED)
+        rect = Rect(minLoc, tmpl.size());
+    else
+        rect = Rect(maxLoc, tmpl.size());
+    // Outline the matched region (template-sized) with a 2-pixel Scalar(0, 255, 0) rectangle.
+    rectangle(img, rect, Scalar(0, 255, 0), 2);
+
+    imshow("detected template", img);
+    waitKey(); // no delay argument: wait for a key press before exiting
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/HighGUI/GDAL_IO/gdal-image.cpp b/samples/cpp/tutorial_code/HighGUI/GDAL_IO/gdal-image.cpp
index 48ef254406..6e7c950a26 100644
--- a/samples/cpp/tutorial_code/HighGUI/GDAL_IO/gdal-image.cpp
+++ b/samples/cpp/tutorial_code/HighGUI/GDAL_IO/gdal-image.cpp
@@ -1,13 +1,13 @@
-/**
+/*
  * gdal_image.cpp -- Load GIS data into OpenCV Containers using the Geospatial Data Abstraction Library
 */
 
-/// OpenCV Headers
+// OpenCV Headers
 #include "opencv2/core/core.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
 
-/// C++ Standard Libraries
+// C++ Standard Libraries
 #include <cmath>
 #include <iostream>
 #include <stdexcept>
@@ -15,22 +15,22 @@
 
 using namespace std;
 
-/// define the corner points
-///    Note that GDAL can natively determine this
+// define the corner points
+//    Note that the GDAL library can natively determine this
 cv::Point2d tl( -122.441017, 37.815664 );
 cv::Point2d tr( -122.370919, 37.815311 );
 cv::Point2d bl( -122.441533, 37.747167 );
 cv::Point2d br( -122.3715,   37.746814 );
 
-/// determine dem corners
+// determine dem corners
 cv::Point2d dem_bl( -122.0, 38);
 cv::Point2d dem_tr( -123.0, 37);
 
-/// range of the heat map colors
+// range of the heat map colors
 std::vector<std::pair<cv::Vec3b,double> > color_range;
 
 
-/// List of all function prototypes
+// List of all function prototypes
 cv::Point2d lerp( const cv::Point2d&, const cv::Point2d&, const double& );
 
 cv::Vec3b get_dem_color( const double& );
@@ -43,7 +43,7 @@ void add_color( cv::Vec3b& pix, const uchar& b, const uchar& g, const uchar& r )
 
 
 
-/**
+/*
  * Linear Interpolation
  * p1 - Point 1
  * p2 - Point 2
@@ -54,7 +54,7 @@ cv::Point2d lerp( cv::Point2d const& p1, cv::Point2d const& p2, const double& t
                         ((1-t)*p1.y) + (t*p2.y));
 }
 
-/**
+/*
  * Interpolate Colors
 */
 template <typename DATATYPE, int N>
@@ -69,7 +69,7 @@ cv::Vec<DATATYPE,N> lerp( cv::Vec<DATATYPE,N> const& minColor,
     return output;
 }
 
-/**
+/*
  * Compute the dem color
 */
 cv::Vec3b get_dem_color( const double& elevation ){
@@ -103,7 +103,7 @@ cv::Vec3b get_dem_color( const double& elevation ){
     return lerp( color_range[idx].first, color_range[idx+1].first, t);
 }
 
-/**
+/*
  * Given a pixel coordinate and the size of the input image, compute the pixel location
  * on the DEM image.
 */
@@ -122,7 +122,7 @@ cv::Point2d world2dem( cv::Point2d const& coordinate, const cv::Size& dem_size
     return output;
 }
 
-/**
+/*
  * Convert a pixel coordinate to world coordinates
 */
 cv::Point2d pixel2world( const int& x, const int& y, const cv::Size& size ){
@@ -139,7 +139,7 @@ cv::Point2d pixel2world( const int& x, const int& y, const cv::Size& size ){
     return lerp( leftSide, rightSide, rx );
 }
 
-/**
+/*
  * Add color to a specific pixel color value
 */
 void add_color( cv::Vec3b& pix, const uchar& b, const uchar& g, const uchar& r ){
@@ -150,12 +150,12 @@ void add_color( cv::Vec3b& pix, const uchar& b, const uchar& g, const uchar& r )
 }
 
 
-/**
+/*
  * Main Function
 */
 int main( int argc, char* argv[] ){
 
-    /**
+    /*
      * Check input arguments
     */
     if( argc < 3 ){
@@ -163,22 +163,22 @@ int main( int argc, char* argv[] ){
         return 1;
     }
 
-    /// load the image (note that we don't have the projection information.  You will
-    /// need to load that yourself or use the full GDAL driver.  The values are pre-defined
-    /// at the top of this file
+    // load the image (note that we don't have the projection information.  You will
+    // need to load that yourself or use the full GDAL driver.  The values are pre-defined
+    // at the top of this file
     cv::Mat image = cv::imread(argv[1], cv::IMREAD_LOAD_GDAL | cv::IMREAD_COLOR );
 
-    /// load the dem model
+    // load the dem model
     cv::Mat dem = cv::imread(argv[2], cv::IMREAD_LOAD_GDAL | cv::IMREAD_ANYDEPTH );
 
-    /// create our output products
+    // create our output products
     cv::Mat output_dem(   image.size(), CV_8UC3 );
     cv::Mat output_dem_flood(   image.size(), CV_8UC3 );
 
-    /// for sanity sake, make sure GDAL Loads it as a signed short
+    // for sanity's sake, make sure GDAL loads it as a signed short
     if( dem.type() != CV_16SC1 ){ throw std::runtime_error("DEM image type must be CV_16SC1"); }
 
-    /// define the color range to create our output DEM heat map
+    // define the color range to create our output DEM heat map
     //  Pair format ( Color, elevation );  Push from low to high
     //  Note:  This would be perfect for a configuration file, but is here for a working demo.
     color_range.push_back( std::pair<cv::Vec3b,double>(cv::Vec3b( 188, 154,  46),   -1));
diff --git a/samples/data/lena_tmpl.jpg b/samples/data/lena_tmpl.jpg
new file mode 100644
index 0000000000..0c9fc20de8
Binary files /dev/null and b/samples/data/lena_tmpl.jpg differ
diff --git a/samples/data/mask.png b/samples/data/mask.png
new file mode 100644
index 0000000000..0666232d49
Binary files /dev/null and b/samples/data/mask.png differ
diff --git a/samples/data/tmpl.png b/samples/data/tmpl.png
new file mode 100644
index 0000000000..999ac704cc
Binary files /dev/null and b/samples/data/tmpl.png differ
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 65fe4ef8cf..10c91991c9 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -3,7 +3,7 @@ SET(OPENCV_CUDA_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc op
                                      opencv_calib3d opencv_cuda opencv_superres
                                      opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc
                                      opencv_cudafeatures2d opencv_cudaoptflow opencv_cudabgsegm
-                                     opencv_cudastereo opencv_cudalegacy)
+                                     opencv_cudastereo opencv_cudalegacy opencv_cudaobjdetect)
 
 ocv_check_dependencies(${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
 
diff --git a/samples/gpu/cascadeclassifier.cpp b/samples/gpu/cascadeclassifier.cpp
index dbb2895e96..f6209f9fa3 100644
--- a/samples/gpu/cascadeclassifier.cpp
+++ b/samples/gpu/cascadeclassifier.cpp
@@ -9,7 +9,7 @@
 #include "opencv2/objdetect/objdetect.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/cuda.hpp"
+#include "opencv2/cudaobjdetect.hpp"
 #include "opencv2/cudaimgproc.hpp"
 #include "opencv2/cudawarping.hpp"
 
@@ -173,13 +173,9 @@ int main(int argc, const char *argv[])
         }
     }
 
-    CascadeClassifier_CUDA cascade_gpu;
-    if (!cascade_gpu.load(cascadeName))
-    {
-        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
-    }
+    Ptr<cuda::CascadeClassifier> cascade_gpu = cuda::CascadeClassifier::create(cascadeName);
 
-    CascadeClassifier cascade_cpu;
+    cv::CascadeClassifier cascade_cpu;
     if (!cascade_cpu.load(cascadeName))
     {
         return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
@@ -206,8 +202,8 @@ int main(int argc, const char *argv[])
 
     namedWindow("result", 1);
 
-    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
-    vector<Rect> facesBuf_cpu;
+    Mat frame, frame_cpu, gray_cpu, resized_cpu, frameDisp;
+    vector<Rect> faces;
 
     GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;
 
@@ -218,7 +214,6 @@ int main(int argc, const char *argv[])
     bool filterRects = true;
     bool helpScreen = false;
 
-    int detections_num;
     for (;;)
     {
         if (isInputCamera || isInputVideo)
@@ -241,40 +236,26 @@ int main(int argc, const char *argv[])
 
         if (useGPU)
         {
-            //cascade_gpu.visualizeInPlace = true;
-            cascade_gpu.findLargestObject = findLargestObject;
+            cascade_gpu->setFindLargestObject(findLargestObject);
+            cascade_gpu->setScaleFactor(1.2);
+            cascade_gpu->setMinNeighbors((filterRects || findLargestObject) ? 4 : 0);
 
-            detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
-                                                          (filterRects || findLargestObject) ? 4 : 0);
-            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
+            cascade_gpu->detectMultiScale(resized_gpu, facesBuf_gpu);
+            cascade_gpu->convert(facesBuf_gpu, faces);
         }
         else
         {
-            Size minSize = cascade_gpu.getClassifierSize();
-            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2,
+            Size minSize = cascade_gpu->getClassifierSize();
+            cascade_cpu.detectMultiScale(resized_cpu, faces, 1.2,
                                          (filterRects || findLargestObject) ? 4 : 0,
                                          (findLargestObject ? CASCADE_FIND_BIGGEST_OBJECT : 0)
                                             | CASCADE_SCALE_IMAGE,
                                          minSize);
-            detections_num = (int)facesBuf_cpu.size();
         }
 
-        if (!useGPU && detections_num)
+        for (size_t i = 0; i < faces.size(); ++i)
         {
-            for (int i = 0; i < detections_num; ++i)
-            {
-                rectangle(resized_cpu, facesBuf_cpu[i], Scalar(255));
-            }
-        }
-
-        if (useGPU)
-        {
-            resized_gpu.download(resized_cpu);
-
-             for (int i = 0; i < detections_num; ++i)
-             {
-                rectangle(resized_cpu, faces_downloaded.ptr<cv::Rect>()[i], Scalar(255));
-             }
+            rectangle(resized_cpu, faces[i], Scalar(255));
         }
 
         tm.stop();
@@ -283,16 +264,15 @@ int main(int argc, const char *argv[])
 
         //print detections to console
         cout << setfill(' ') << setprecision(2);
-        cout << setw(6) << fixed << fps << " FPS, " << detections_num << " det";
-        if ((filterRects || findLargestObject) && detections_num > 0)
+        cout << setw(6) << fixed << fps << " FPS, " << faces.size() << " det";
+        if ((filterRects || findLargestObject) && !faces.empty())
         {
-            Rect *faceRects = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
-            for (int i = 0; i < min(detections_num, 2); ++i)
+            for (size_t i = 0; i < faces.size(); ++i)
             {
-                cout << ", [" << setw(4) << faceRects[i].x
-                     << ", " << setw(4) << faceRects[i].y
-                     << ", " << setw(4) << faceRects[i].width
-                     << ", " << setw(4) << faceRects[i].height << "]";
+                cout << ", [" << setw(4) << faces[i].x
+                     << ", " << setw(4) << faces[i].y
+                     << ", " << setw(4) << faces[i].width
+                     << ", " << setw(4) << faces[i].height << "]";
             }
         }
         cout << endl;
diff --git a/samples/gpu/farneback_optical_flow.cpp b/samples/gpu/farneback_optical_flow.cpp
index 6fc3f931fe..b8ed55ea6c 100644
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -44,8 +44,8 @@ static void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
 int main(int argc, char **argv)
 {
     CommandLineParser cmd(argc, argv,
-            "{ l left  | | specify left image }"
-            "{ r right | | specify right image }"
+            "{ l left  | ../data/basketball1.png | specify left image }"
+            "{ r right | ../data/basketball2.png | specify right image }"
             "{ h help  | | print help message }");
 
     cmd.about("Farneback's optical flow sample.");
diff --git a/samples/gpu/hog.cpp b/samples/gpu/hog.cpp
index 59ea44f31b..8b57c89008 100644
--- a/samples/gpu/hog.cpp
+++ b/samples/gpu/hog.cpp
@@ -5,7 +5,7 @@
 #include <iomanip>
 #include <stdexcept>
 #include <opencv2/core/utility.hpp>
-#include "opencv2/cuda.hpp"
+#include "opencv2/cudaobjdetect.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/imgproc.hpp"
@@ -115,11 +115,19 @@ int main(int argc, char** argv)
 {
     try
     {
+        Args args;
         if (argc < 2)
+        {
             printHelp();
-        Args args = Args::read(argc, argv);
-        if (help_showed)
-            return -1;
+            args.camera_id = 0;
+            args.src_is_camera = true;
+        }
+        else
+        {
+            args = Args::read(argc, argv);
+            if (help_showed)
+                return -1;
+        }
         App app(args);
         app.run();
     }
@@ -244,19 +252,13 @@ void App::run()
     Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
     Size win_stride(args.win_stride_width, args.win_stride_height);
 
-    // Create HOG descriptors and detectors here
-    vector<float> detector;
-    if (win_size == Size(64, 128))
-        detector = cv::cuda::HOGDescriptor::getPeopleDetector64x128();
-    else
-        detector = cv::cuda::HOGDescriptor::getPeopleDetector48x96();
+    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size);
+    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9);
 
-    cv::cuda::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::cuda::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::cuda::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
-    gpu_hog.setSVMDetector(detector);
+    // Create HOG descriptors and detectors here
+    Mat detector = gpu_hog->getDefaultPeopleDetector();
+
+    gpu_hog->setSVMDetector(detector);
     cpu_hog.setSVMDetector(detector);
 
     while (running)
@@ -307,9 +309,6 @@ void App::run()
             else img = img_aux;
             img_to_show = img;
 
-            gpu_hog.nlevels = nlevels;
-            cpu_hog.nlevels = nlevels;
-
             vector<Rect> found;
 
             // Perform HOG classification
@@ -317,11 +316,19 @@ void App::run()
             if (use_gpu)
             {
                 gpu_img.upload(img);
-                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
-                                         Size(0, 0), scale, gr_threshold);
+                gpu_hog->setNumLevels(nlevels);
+                gpu_hog->setHitThreshold(hit_threshold);
+                gpu_hog->setWinStride(win_stride);
+                gpu_hog->setScaleFactor(scale);
+                gpu_hog->setGroupThreshold(gr_threshold);
+                gpu_hog->detectMultiScale(gpu_img, found);
             }
-            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
+            else
+            {
+                cpu_hog.nlevels = nlevels;
+                cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
                                           Size(0, 0), scale, gr_threshold);
+            }
             hogWorkEnd();
 
             // Draw positive classified windows
diff --git a/samples/gpu/opengl.cpp b/samples/gpu/opengl.cpp
index eef8e1a94c..e3e3ddc687 100644
--- a/samples/gpu/opengl.cpp
+++ b/samples/gpu/opengl.cpp
@@ -14,6 +14,9 @@ int main()
     #define NOMINMAX 1
     #include <windows.h>
 #endif
+#if defined(_WIN64)
+    #include <windows.h>
+#endif
 
 #if defined(__APPLE__)
     #include <OpenGL/gl.h>
@@ -55,16 +58,19 @@ void draw(void* userdata)
 
 int main(int argc, char* argv[])
 {
+    string filename;
     if (argc < 2)
     {
         cout << "Usage: " << argv[0] << " image" << endl;
-        return -1;
+        filename = "../data/lena.jpg";
     }
+    else
+        filename = argv[1];
 
-    Mat img = imread(argv[1]);
+    Mat img = imread(filename);
     if (img.empty())
     {
-        cerr << "Can't open image " << argv[1] << endl;
+        cerr << "Can't open image " << filename << endl;
         return -1;
     }
 
diff --git a/samples/gpu/optical_flow.cpp b/samples/gpu/optical_flow.cpp
index 8c6bc74a40..7d625de85b 100644
--- a/samples/gpu/optical_flow.cpp
+++ b/samples/gpu/optical_flow.cpp
@@ -135,23 +135,30 @@ static void showFlow(const char* name, const GpuMat& d_flowx, const GpuMat& d_fl
 
 int main(int argc, const char* argv[])
 {
+    string filename1, filename2;
     if (argc < 3)
     {
-        cerr << "Usage : " << argv[0] << "<frame0> <frame1>" << endl;
-        return -1;
+        cerr << "Usage : " << argv[0] << " <frame0> <frame1>" << endl;
+        filename1 = "../data/basketball1.png";
+        filename2 = "../data/basketball2.png";
+    }
+    else
+    {
+        filename1 = argv[1];
+        filename2 = argv[2];
     }
 
-    Mat frame0 = imread(argv[1], IMREAD_GRAYSCALE);
-    Mat frame1 = imread(argv[2], IMREAD_GRAYSCALE);
+    Mat frame0 = imread(filename1, IMREAD_GRAYSCALE);
+    Mat frame1 = imread(filename2, IMREAD_GRAYSCALE);
 
     if (frame0.empty())
     {
-        cerr << "Can't open image ["  << argv[1] << "]" << endl;
+        cerr << "Can't open image ["  << filename1 << "]" << endl;
         return -1;
     }
     if (frame1.empty())
     {
-        cerr << "Can't open image ["  << argv[2] << "]" << endl;
+        cerr << "Can't open image ["  << filename2 << "]" << endl;
         return -1;
     }
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 2e7faa3341..14910f9a38 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -193,7 +193,7 @@ TEST(cornerHarris)
 TEST(integral)
 {
     Mat src, sum;
-    cuda::GpuMat d_src, d_sum, d_buf;
+    cuda::GpuMat d_src, d_sum;
 
     for (int size = 1000; size <= 4000; size *= 2)
     {
@@ -209,10 +209,10 @@ TEST(integral)
 
         d_src.upload(src);
 
-        cuda::integralBuffered(d_src, d_sum, d_buf);
+        cuda::integral(d_src, d_sum);
 
         CUDA_ON;
-        cuda::integralBuffered(d_src, d_sum, d_buf);
+        cuda::integral(d_src, d_sum);
         CUDA_OFF;
     }
 }
@@ -322,14 +322,14 @@ TEST(FAST)
     FAST(src, keypoints, 20);
     CPU_OFF;
 
-    cuda::FAST_CUDA d_FAST(20);
+    cv::Ptr<cv::cuda::FastFeatureDetector> d_FAST = cv::cuda::FastFeatureDetector::create(20);
     cuda::GpuMat d_src(src);
     cuda::GpuMat d_keypoints;
 
-    d_FAST(d_src, cuda::GpuMat(), d_keypoints);
+    d_FAST->detectAsync(d_src, d_keypoints);
 
     CUDA_ON;
-    d_FAST(d_src, cuda::GpuMat(), d_keypoints);
+    d_FAST->detectAsync(d_src, d_keypoints);
     CUDA_OFF;
 }
 
@@ -350,15 +350,15 @@ TEST(ORB)
     orb->detectAndCompute(src, Mat(), keypoints, descriptors);
     CPU_OFF;
 
-    cuda::ORB_CUDA d_orb;
+    Ptr<cuda::ORB> d_orb = cuda::ORB::create();
     cuda::GpuMat d_src(src);
     cuda::GpuMat d_keypoints;
     cuda::GpuMat d_descriptors;
 
-    d_orb(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);
+    d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);
 
     CUDA_ON;
-    d_orb(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);
+    d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);
     CUDA_OFF;
 }
 
@@ -379,14 +379,14 @@ TEST(BruteForceMatcher)
 
     // Init CUDA matcher
 
-    cuda::BFMatcher_CUDA d_matcher(NORM_L2);
+    Ptr<cuda::DescriptorMatcher> d_matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2);
 
     cuda::GpuMat d_query(query);
     cuda::GpuMat d_train(train);
 
     // Output
     vector< vector<DMatch> > matches(2);
-    cuda::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;
+    cuda::GpuMat d_matches;
 
     SUBTEST << "match";
 
@@ -396,10 +396,10 @@ TEST(BruteForceMatcher)
     matcher.match(query, train, matches[0]);
     CPU_OFF;
 
-    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+    d_matcher->matchAsync(d_query, d_train, d_matches);
 
     CUDA_ON;
-    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+    d_matcher->matchAsync(d_query, d_train, d_matches);
     CUDA_OFF;
 
     SUBTEST << "knnMatch";
@@ -410,10 +410,10 @@ TEST(BruteForceMatcher)
     matcher.knnMatch(query, train, matches, 2);
     CPU_OFF;
 
-    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+    d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2);
 
     CUDA_ON;
-    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+    d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2);
     CUDA_OFF;
 
     SUBTEST << "radiusMatch";
@@ -426,12 +426,10 @@ TEST(BruteForceMatcher)
     matcher.radiusMatch(query, train, matches, max_distance);
     CPU_OFF;
 
-    d_trainIdx.release();
-
-    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+    d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance);
 
     CUDA_ON;
-    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+    d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance);
     CUDA_OFF;
 }
 
diff --git a/samples/gpu/pyrlk_optical_flow.cpp b/samples/gpu/pyrlk_optical_flow.cpp
index dc5de6c0aa..febc28f28d 100644
--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@@ -119,8 +119,8 @@ int main(int argc, const char* argv[])
 {
     const char* keys =
         "{ h             help   |       | print help message }"
-        "{ l             left   |       | specify left image }"
-        "{ r             right  |       | specify right image }"
+        "{ l             left   | ../data/pic1.png       | specify left image }"
+        "{ r             right  | ../data/pic2.png       | specify right image }"
         "{ gray                 |       | use grayscale sources [PyrLK Sparse] }"
         "{ win_size             | 21    | specify windows size [PyrLK] }"
         "{ max_level            | 3     | specify max level [PyrLK] }"
diff --git a/samples/python2/CMakeLists.txt b/samples/python2/CMakeLists.txt
new file mode 100644
index 0000000000..7fa245447c
--- /dev/null
+++ b/samples/python2/CMakeLists.txt
@@ -0,0 +1,6 @@
+if(INSTALL_PYTHON_EXAMPLES)
+  file(GLOB install_list *.py )
+  install(FILES ${install_list}
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/python2
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
+endif()
diff --git a/samples/python2/calibrate.py b/samples/python2/calibrate.py
index 2c759ff972..9f6f60cb73 100755
--- a/samples/python2/calibrate.py
+++ b/samples/python2/calibrate.py
@@ -26,7 +26,7 @@ if __name__ == '__main__':
     try:
         img_mask = img_mask[0]
     except:
-        img_mask = '../cpp/left*.jpg'
+        img_mask = '../data/left*.jpg'
 
     img_names = glob(img_mask)
     debug_dir = args.get('--debug')
diff --git a/samples/python2/deconvolution.py b/samples/python2/deconvolution.py
index 218efe883b..bbb1567bd4 100755
--- a/samples/python2/deconvolution.py
+++ b/samples/python2/deconvolution.py
@@ -119,7 +119,7 @@ if __name__ == '__main__':
     update(None)
 
     while True:
-        ch = cv2.waitKey()
+        ch = cv2.waitKey() & 0xFF
         if ch == 27:
             break
         if ch == ord(' '):
diff --git a/samples/python2/digits_video.py b/samples/python2/digits_video.py
index ca72a93501..bb142eb687 100755
--- a/samples/python2/digits_video.py
+++ b/samples/python2/digits_video.py
@@ -86,7 +86,7 @@ def main():
 
         cv2.imshow('frame', frame)
         cv2.imshow('bin', bin)
-        ch = cv2.waitKey(1)
+        ch = cv2.waitKey(1) & 0xFF
         if ch == 27:
             break
 
diff --git a/samples/python2/edge.py b/samples/python2/edge.py
index bd0c8bde79..413bf8859b 100755
--- a/samples/python2/edge.py
+++ b/samples/python2/edge.py
@@ -45,7 +45,7 @@ if __name__ == '__main__':
         vis /= 2
         vis[edge != 0] = (0, 255, 0)
         cv2.imshow('edge', vis)
-        ch = cv2.waitKey(5)
+        ch = cv2.waitKey(5) & 0xFF
         if ch == 27:
             break
     cv2.destroyAllWindows()
diff --git a/samples/python2/find_obj.py b/samples/python2/find_obj.py
index fb1e0730d9..35bce86fde 100755
--- a/samples/python2/find_obj.py
+++ b/samples/python2/find_obj.py
@@ -3,6 +3,8 @@
 '''
 Feature-based image matching sample.
 
+Note that you will need the https://github.com/Itseez/opencv_contrib repo for SIFT and SURF.
+
 USAGE
   find_obj.py [--feature=<sift|surf|orb|akaze|brisk>[-flann]] [ <image1> <image2> ]
 
@@ -23,19 +25,19 @@ FLANN_INDEX_LSH    = 6
 def init_feature(name):
     chunks = name.split('-')
     if chunks[0] == 'sift':
-        detector = cv2.xfeatures2d.SIFT()
+        detector = cv2.xfeatures2d.SIFT_create()
         norm = cv2.NORM_L2
     elif chunks[0] == 'surf':
-        detector = cv2.xfeatures2d.SURF(800)
+        detector = cv2.xfeatures2d.SURF_create(800)
         norm = cv2.NORM_L2
     elif chunks[0] == 'orb':
-        detector = cv2.ORB(400)
+        detector = cv2.ORB_create(400)
         norm = cv2.NORM_HAMMING
     elif chunks[0] == 'akaze':
-        detector = cv2.AKAZE()
+        detector = cv2.AKAZE_create()
         norm = cv2.NORM_HAMMING
     elif chunks[0] == 'brisk':
-        detector = cv2.BRISK()
+        detector = cv2.BRISK_create()
         norm = cv2.NORM_HAMMING
     else:
         return None, None
diff --git a/samples/python2/fitline.py b/samples/python2/fitline.py
index 08b94d75bc..1c0d9e7709 100755
--- a/samples/python2/fitline.py
+++ b/samples/python2/fitline.py
@@ -79,7 +79,7 @@ if __name__ == '__main__':
     cv2.createTrackbar('outlier %', 'fit line', 30, 100, update)
     while True:
         update()
-        ch = cv2.waitKey(0)
+        ch = cv2.waitKey(0) & 0xFF
         if ch == ord('f'):
             cur_func_name = dist_func_names.next()
         if ch == 27:
diff --git a/samples/python2/lappyr.py b/samples/python2/lappyr.py
index 0c08484de9..3cf2679b08 100755
--- a/samples/python2/lappyr.py
+++ b/samples/python2/lappyr.py
@@ -62,5 +62,5 @@ if __name__ == '__main__':
 
         cv2.imshow('laplacian pyramid filter', res)
 
-        if cv2.waitKey(1) == 27:
+        if cv2.waitKey(1) & 0xFF == 27:
             break
diff --git a/samples/python2/mosse.py b/samples/python2/mosse.py
index 0e2e7eed98..81196dcc36 100755
--- a/samples/python2/mosse.py
+++ b/samples/python2/mosse.py
@@ -168,7 +168,7 @@ class App:
             self.rect_sel.draw(vis)
 
             cv2.imshow('frame', vis)
-            ch = cv2.waitKey(10)
+            ch = cv2.waitKey(10) & 0xFF
             if ch == 27:
                 break
             if ch == ord(' '):
diff --git a/samples/python2/mser.py b/samples/python2/mser.py
index beaa6e7dcb..9d7a65c10f 100755
--- a/samples/python2/mser.py
+++ b/samples/python2/mser.py
@@ -26,13 +26,13 @@ if __name__ == '__main__':
         video_src = 0
 
     cam = video.create_capture(video_src)
-    mser = cv2.MSER()
+    mser = cv2.MSER_create()
     while True:
         ret, img = cam.read()
         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         vis = img.copy()
 
-        regions = mser.detect(gray, None)
+        regions = mser.detectRegions(gray, None)
         hulls = [cv2.convexHull(p.reshape(-1, 1, 2)) for p in regions]
         cv2.polylines(vis, hulls, 1, (0, 255, 0))
 
diff --git a/samples/python2/plane_ar.py b/samples/python2/plane_ar.py
index dcb5559cdf..6580be7d05 100755
--- a/samples/python2/plane_ar.py
+++ b/samples/python2/plane_ar.py
@@ -71,7 +71,7 @@ class App:
 
             self.rect_sel.draw(vis)
             cv2.imshow('plane', vis)
-            ch = cv2.waitKey(1)
+            ch = cv2.waitKey(1) & 0xFF
             if ch == ord(' '):
                 self.paused = not self.paused
             if ch == ord('c'):
diff --git a/samples/python2/plane_tracker.py b/samples/python2/plane_tracker.py
index de5d7a0ec9..c32f65a442 100755
--- a/samples/python2/plane_tracker.py
+++ b/samples/python2/plane_tracker.py
@@ -61,7 +61,7 @@ TrackedTarget = namedtuple('TrackedTarget', 'target, p0, p1, H, quad')
 
 class PlaneTracker:
     def __init__(self):
-        self.detector = cv2.ORB( nfeatures = 1000 )
+        self.detector = cv2.ORB_create( nfeatures = 1000 )
         self.matcher = cv2.FlannBasedMatcher(flann_params, {})  # bug : need to pass empty dict (#1329)
         self.targets = []
 
@@ -77,7 +77,7 @@ class PlaneTracker:
                 descs.append(desc)
         descs = np.uint8(descs)
         self.matcher.add([descs])
-        target = PlanarTarget(image = image, rect=rect, keypoints = points, descrs=descs, data=None)
+        target = PlanarTarget(image = image, rect=rect, keypoints = points, descrs=descs, data=data)
         self.targets.append(target)
 
     def clear(self):
@@ -87,10 +87,10 @@ class PlaneTracker:
 
     def track(self, frame):
         '''Returns a list of detected TrackedTarget objects'''
-        self.frame_points, self.frame_descrs = self.detect_features(frame)
-        if len(self.frame_points) < MIN_MATCH_COUNT:
+        frame_points, frame_descrs = self.detect_features(frame)
+        if len(frame_points) < MIN_MATCH_COUNT:
             return []
-        matches = self.matcher.knnMatch(self.frame_descrs, k = 2)
+        matches = self.matcher.knnMatch(frame_descrs, k = 2)
         matches = [m[0] for m in matches if len(m) == 2 and m[0].distance < m[1].distance * 0.75]
         if len(matches) < MIN_MATCH_COUNT:
             return []
@@ -103,7 +103,7 @@ class PlaneTracker:
                 continue
             target = self.targets[imgIdx]
             p0 = [target.keypoints[m.trainIdx].pt for m in matches]
-            p1 = [self.frame_points[m.queryIdx].pt for m in matches]
+            p1 = [frame_points[m.queryIdx].pt for m in matches]
             p0, p1 = np.float32((p0, p1))
             H, status = cv2.findHomography(p0, p1, cv2.RANSAC, 3.0)
             status = status.ravel() != 0
@@ -160,7 +160,7 @@ class App:
 
             self.rect_sel.draw(vis)
             cv2.imshow('plane', vis)
-            ch = cv2.waitKey(1)
+            ch = cv2.waitKey(1) & 0xFF
             if ch == ord(' '):
                 self.paused = not self.paused
             if ch == ord('c'):
diff --git a/samples/python2/squares.py b/samples/python2/squares.py
index c12b884011..84160a2919 100755
--- a/samples/python2/squares.py
+++ b/samples/python2/squares.py
@@ -37,7 +37,7 @@ def find_squares(img):
 
 if __name__ == '__main__':
     from glob import glob
-    for fn in glob('../cpp/pic*.png'):
+    for fn in glob('../data/pic*.png'):
         img = cv2.imread(fn)
         squares = find_squares(img)
         cv2.drawContours( img, squares, -1, (0, 255, 0), 3 )
diff --git a/samples/python2/stereo_match.py b/samples/python2/stereo_match.py
index 5b21617cca..e53ae77025 100755
--- a/samples/python2/stereo_match.py
+++ b/samples/python2/stereo_match.py
@@ -39,16 +39,15 @@ if __name__ == '__main__':
     window_size = 3
     min_disp = 16
     num_disp = 112-min_disp
-    stereo = cv2.StereoSGBM(minDisparity = min_disp,
+    stereo = cv2.StereoSGBM_create(minDisparity = min_disp,
         numDisparities = num_disp,
-        SADWindowSize = window_size,
-        uniquenessRatio = 10,
-        speckleWindowSize = 100,
-        speckleRange = 32,
-        disp12MaxDiff = 1,
+        blockSize = 16,
         P1 = 8*3*window_size**2,
         P2 = 32*3*window_size**2,
-        fullDP = False
+        disp12MaxDiff = 1,
+        uniquenessRatio = 10,
+        speckleWindowSize = 100,
+        speckleRange = 32
     )
 
     print 'computing disparity...'