diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1fadce559d..9cb3be0c33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,19 @@ if(UNIX AND NOT ANDROID)
   endif()
 endif()
 
+# On MinGW, append the standard MinGW root directories (those that exist)
+# to CMAKE_INCLUDE_PATH so that find_path() also searches these locations
+if(MINGW)
+  if(EXISTS /mingw)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw)  # the root itself, not <root>/include: find_path() callers need PATH_SUFFIXES include
+  endif()
+  if(EXISTS /mingw32)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw32)
+  endif()
+  if(EXISTS /mingw64)
+      list(APPEND CMAKE_INCLUDE_PATH /mingw64)
+  endif()
+endif()
 
 # ----------------------------------------------------------------------------
 # OpenCV cmake options
@@ -110,7 +123,7 @@ endif()
 
 # Optional 3rd party components
 # ===================================================
-OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (UNIX AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"         ON   IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
diff --git a/android/android.toolchain.cmake b/android/android.toolchain.cmake
index df365fc2c0..9db174a138 100644
--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
@@ -1,6 +1,7 @@
 message(STATUS "Android toolchain was moved to platfroms/android!")
 message(STATUS "This file is depricated and will be removed!")
 
+# Copyright (c) 2010-2011, Ethan Rublee
 # Copyright (c) 2011-2013, Andrey Kamaev
 # All rights reserved.
 #
@@ -291,6 +292,9 @@ message(STATUS "This file is depricated and will be removed!")
 #   - March 2013
 #     [+] updated for NDK r8e (x86 version)
 #     [+] support x86_64 version of NDK
+#   - April 2013
+#     [+] support non-release NDK layouts (from Linaro git and Android git)
+#     [~] automatically detect if explicit link to crtbegin_*.o is needed
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -518,24 +522,19 @@ if( NOT ANDROID_NDK )
   endif( ANDROID_NDK )
  endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )
+
 # remember found paths
 if( ANDROID_NDK )
  get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- # try to detect change
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_NDK}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
-  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK )
-   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
-   " )
-  endif()
-  unset( __androidNdkPreviousPath )
-  unset( __length )
- endif()
  set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
  set( BUILD_WITH_ANDROID_NDK True )
- file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
- string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
+  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+  string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ else()
+  set( ANDROID_NDK_RELEASE "r1x" )
+  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+ endif()
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
  get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
  # try to detect change
@@ -562,6 +561,51 @@ else()
       sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()
 
+# android NDK layout: decide where the toolchains live (RELEASE, LINARO or ANDROID tree)
+if( BUILD_WITH_ANDROID_NDK )
+ if( NOT DEFINED ANDROID_NDK_LAYOUT )
+  # try to automatically detect the layout
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+   set( ANDROID_NDK_LAYOUT "RELEASE" )
+  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" ) # NOTE(review): probes only the linux-x86 host directory
+   set( ANDROID_NDK_LAYOUT "LINARO" )
+  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
+   set( ANDROID_NDK_LAYOUT "ANDROID" )
+  endif()
+ endif()
+ set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" ) # stays empty when nothing was detected
+ mark_as_advanced( ANDROID_NDK_LAYOUT )
+ if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" ) # no prebuilt/<host> level below the toolchain directory
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" ) # NOTE(review): path is hard-coded to the arm directory
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" ) # no prebuilt/<host> level below the toolchain directory
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ else() # ANDROID_NDK_LAYOUT STREQUAL "RELEASE" (also reached for an empty or unrecognized value)
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" ) # alternative subpath built from the second host system name
+ endif()
+ get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE ) # normalize to an absolute path
+
+ # try to detect change of NDK: CMAKE_AR from a previous run must start with the toolchains path
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath ) # leading part of CMAKE_AR, same length as the toolchains path
+  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
+   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+   " )
+  endif()
+  unset( __androidNdkPreviousPath )
+  unset( __length )
+ endif()
+endif()
+
+
 # get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
@@ -589,17 +633,23 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  endif()
 endif()
 
-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __host_system_name )
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
  foreach( __toolchain ${${__availableToolchainsLst}} )
-  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/" )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
   else()
    set( __gcc_toolchain "${__toolchain}" )
   endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${__host_system_name}" )
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
   if( __machine )
-   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?$" __version "${__gcc_toolchain}" )
-   string( REGEX MATCH "^[^-]+" __arch "${__gcc_toolchain}" )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
+   if( __machine MATCHES i686 )
+    set( __arch "x86" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mipsel" )
+   endif()
    list( APPEND __availableToolchainMachines "${__machine}" )
    list( APPEND __availableToolchainArchs "${__arch}" )
    list( APPEND __availableToolchainCompilerVersions "${__version}" )
@@ -617,29 +667,29 @@ if( BUILD_WITH_ANDROID_NDK )
  set( __availableToolchainMachines "" )
  set( __availableToolchainArchs "" )
  set( __availableToolchainCompilerVersions "" )
- if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_NAME}/" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
   # do not go through all toolchains if we know the name
   set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
  if( NOT __availableToolchains )
-  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
   if( __availableToolchains )
    list(SORT __availableToolchainsLst) # we need clang to go after gcc
   endif()
   __LIST_FILTER( __availableToolchainsLst "^[.]" )
   __LIST_FILTER( __availableToolchainsLst "llvm" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
@@ -770,6 +820,7 @@ else()
   list( GET __availableToolchainArchs ${__idx} __toolchainArch )
   if( __toolchainArch STREQUAL ANDROID_ARCH_FULLNAME )
    list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}")
    if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
     set( __toolchainMaxVersion "${__toolchainVersion}" )
     set( __toolchainIdx ${__idx} )
@@ -973,11 +1024,11 @@ if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
 elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
  string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
  string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-4.6" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- if( NOT EXISTS "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}/bin/clang${TOOL_OS_SUFFIX}" )
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
   message( FATAL_ERROR "Could not find the Clang compiler driver" )
  endif()
  set( ANDROID_COMPILER_IS_CLANG 1 )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
 else()
  set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
  unset( ANDROID_COMPILER_IS_CLANG CACHE )
@@ -991,7 +1042,7 @@ endif()
 
 # setup paths and STL for NDK
 if( BUILD_WITH_ANDROID_NDK )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
  set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
 
  if( ANDROID_STL STREQUAL "none" )
@@ -1050,11 +1101,11 @@ if( BUILD_WITH_ANDROID_NDK )
  endif()
  # find libsupc++.a - rtti & exceptions
  if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
-  if( ANDROID_NDK_RELEASE STRGREATER "r8" ) # r8b
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  elseif( NOT ANDROID_NDK_RELEASE STRLESS "r7" AND ANDROID_NDK_RELEASE STRLESS "r8b")
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  else( ANDROID_NDK_RELEASE STRLESS "r7" )
+  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+  if( NOT EXISTS "${__libsupcxx}" )
+   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" ) # before r7
    if( ARMEABI_V7A )
     if( ANDROID_FORCE_ARM_BUILD )
      set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
@@ -1104,7 +1155,7 @@ unset( _ndk_ccache )
 
 # setup the cross-compiler
 if( NOT CMAKE_C_COMPILER )
- if( NDK_CCACHE )
+ if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
   set( CMAKE_C_COMPILER   "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
   set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
   if( ANDROID_COMPILER_IS_CLANG )
@@ -1176,11 +1227,25 @@ set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
 remove_definitions( -DANDROID )
 add_definitions( -DANDROID )
 
-if(ANDROID_SYSROOT MATCHES "[ ;\"]")
- set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ if( CMAKE_HOST_WIN32 )
+  # try to convert path to 8.3 form
+  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+  execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+                   OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+                   RESULT_VARIABLE __result ERROR_QUIET )
+  if( __result EQUAL 0 )
+   file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+   set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+  else()
+   set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+  endif()
+ else()
+  set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+ endif()
  if( NOT _CMAKE_IN_TRY_COMPILE )
-  # quotes will break try_compile and compiler identification
-  message(WARNING "Your Android system root has non-alphanumeric symbols. It can break compiler features detection and the whole build.")
+  # quotes can break try_compile and compiler identification
+  message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
  endif()
 else()
  set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
@@ -1251,22 +1316,18 @@ elseif( ARMEABI )
  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
 endif()
 
+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+else()
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+endif()
+
 # STL
 if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
- if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- else()
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- endif()
- if ( X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" )
-  # workaround "undefined reference to `__dso_handle'" problem
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
- endif()
  if( EXISTS "${__libstl}" )
   set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
   set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
@@ -1285,9 +1346,12 @@ if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
   set( CMAKE_C_LINK_EXECUTABLE       "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
  endif()
  if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} -lm" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} -lm" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} -lm" )
+  if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
+   set( ANDROID_LIBM_PATH -lm )
+  endif()
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
  endif()
 endif()
 
@@ -1323,7 +1387,14 @@ if( ARMEABI_V7A )
 endif()
 
 if( ANDROID_NO_UNDEFINED )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ if( MIPS )
+  # there is some sysroot-related problem in mips linker...
+  if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+  endif()
+ else()
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ endif()
 endif()
 
 if( ANDROID_SO_UNDEFINED )
@@ -1403,9 +1474,9 @@ set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FL
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
 
 if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
- set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
- set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
- set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()
 
 # configure rtti
@@ -1432,6 +1503,43 @@ endif()
 include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
 link_directories( "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 
+# detect whether crtbegin_so.o has to be linked explicitly (runs a test link at configure time)
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK ) # an already defined value skips the probe
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" ) # start from the shared-library link rule and fill in its placeholders
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" ) # crtbegin_so.o is the only object in the test link
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ separate_arguments( __cmd ) # turn the command string into a list; spaces become list separators
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN )
+  if( ${__var} )
+   set( __tmp "${${__var}}" )
+   separate_arguments( __tmp ) # the form this path took after the split above
+   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}") # put the original path (with its spaces) back
+  endif()
+ endforeach()
+ string( REPLACE "'" "" __cmd "${__cmd}" ) # drop quote characters before the list is handed to execute_process
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET )
+ if( __cmd_result EQUAL 0 ) # the test link succeeded
+  set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+  set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+if( ANDROID_EXPLICIT_CRT_LINK )
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
 # setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
@@ -1523,6 +1631,7 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
  foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN  ANDROID_SET_OBSOLETE_VARIABLES
                 ANDROID_NDK_HOST_X64
                 ANDROID_NDK
+                ANDROID_NDK_LAYOUT
                 ANDROID_STANDALONE_TOOLCHAIN
                 ANDROID_TOOLCHAIN_NAME
                 ANDROID_ABI
@@ -1536,6 +1645,8 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
                 ANDROID_GOLD_LINKER
                 ANDROID_NOEXECSTACK
                 ANDROID_RELRO
+                ANDROID_LIBM_PATH
+                ANDROID_EXPLICIT_CRT_LINK
                 )
   if( DEFINED ${__var} )
    if( "${__var}" MATCHES " ")
@@ -1579,6 +1690,7 @@ endif()
 #   ANDROID_STANDALONE_TOOLCHAIN
 #   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
 #   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
 #   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
 #   NDK_CCACHE : <path to your ccache executable>
 # Obsolete:
@@ -1624,6 +1736,7 @@ endif()
 #   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
 #   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
 #   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to workaround unresolved `sincos`
 #
 # Defaults:
 #   ANDROID_DEFAULT_NDK_API_LEVEL
diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake
index 014066bc7e..2c96274a8c 100644
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@@ -44,12 +44,18 @@ if(OPENCL_FOUND)
   set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
   set(OPENCL_LIBRARIES    ${OPENCL_LIBRARY})
 
-  if (X86_64)
+  if(WIN32 AND X86_64)
     set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import)
-  elseif (X86)
+  elseif(WIN32)
     set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
   endif()
 
+  if(X86_64 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64)
+  elseif(X86 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32)
+  endif()
+
   if(WITH_OPENCLAMDFFT)
     find_path(CLAMDFFT_ROOT_DIR
               NAMES include/clAmdFft.h
@@ -80,7 +86,7 @@ if(OPENCL_FOUND)
   if(WITH_OPENCLAMDBLAS)
     find_path(CLAMDBLAS_ROOT_DIR
               NAMES include/clAmdBlas.h
-              PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
+              PATHS ENV CLAMDBLAS_PATH ENV ProgramFiles
               PATH_SUFFIXES clAmdBlas AMD/clAmdBlas
               DOC "AMD FFT root directory"
               NO_DEFAULT_PATH)
diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index d606a650a2..f27176d663 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -49,7 +49,7 @@ if(PYTHON_EXECUTABLE)
 
   if(NOT ANDROID AND NOT IOS)
     if(CMAKE_HOST_UNIX)
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import *; print get_python_lib()"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import *; print(get_python_lib())"
                       RESULT_VARIABLE PYTHON_CVPY_PROCESS
                       OUTPUT_VARIABLE PYTHON_STD_PACKAGES_PATH
                       OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -80,7 +80,7 @@ if(PYTHON_EXECUTABLE)
 
     if(NOT PYTHON_NUMPY_INCLUDE_DIR)
       # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print numpy.distutils.misc_util.get_numpy_include_dirs()[0]"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(numpy.distutils.misc_util.get_numpy_include_dirs()[0])"
                       RESULT_VARIABLE PYTHON_NUMPY_PROCESS
                       OUTPUT_VARIABLE PYTHON_NUMPY_INCLUDE_DIR
                       OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -92,7 +92,7 @@ if(PYTHON_EXECUTABLE)
     endif()
 
     if(PYTHON_NUMPY_INCLUDE_DIR)
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print numpy.version.version"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.version.version)"
                         RESULT_VARIABLE PYTHON_NUMPY_PROCESS
                         OUTPUT_VARIABLE PYTHON_NUMPY_VERSION
                         OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index dbf2a2b7ea..96d480584e 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -110,10 +110,33 @@ endif(WITH_GIGEAPI)
 # --- Dc1394 ---
 ocv_clear_vars(HAVE_DC1394 HAVE_DC1394_2)
 if(WITH_1394)
-  CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
-  if(NOT HAVE_DC1394_2)
-    CHECK_MODULE(libdc1394 HAVE_DC1394)
-  endif()
+  if(WIN32 AND MINGW)  # MinGW: locate headers and static libraries directly instead of using CHECK_MODULE
+      find_path(CMU1394_INCLUDE_PATH "1394common.h"  # header name relative to each searched directory
+                PATH_SUFFIXES include
+                DOC "The path to cmu1394 headers")
+      find_path(DC1394_2_INCLUDE_PATH "dc1394/dc1394.h"  # header name relative to each searched directory
+                PATH_SUFFIXES include
+                DOC "The path to DC1394 2.x headers")
+      if(CMU1394_INCLUDE_PATH AND DC1394_2_INCLUDE_PATH)
+        set(CMU1394_LIB_DIR  "${CMU1394_INCLUDE_PATH}/../lib"  CACHE PATH "Full path of CMU1394 library directory")  # lib/ is taken to sit next to include/
+        set(DC1394_2_LIB_DIR "${DC1394_2_INCLUDE_PATH}/../lib" CACHE PATH "Full path of DC1394 2.x library directory")
+        if(EXISTS "${CMU1394_LIB_DIR}/lib1394camera.a" AND EXISTS "${DC1394_2_LIB_DIR}/libdc1394.a")  # both static archives are required
+          set(HAVE_DC1394_2 TRUE)
+        endif()
+      endif()
+      if(HAVE_DC1394_2)
+        ocv_parse_pkg("libdc1394-2" "${DC1394_2_LIB_DIR}/pkgconfig" "")  # reads version info from libdc1394-2.pc when that file exists
+        ocv_include_directories(${DC1394_2_INCLUDE_PATH})
+        set(HIGHGUI_LIBRARIES ${HIGHGUI_LIBRARIES}
+            "${DC1394_2_LIB_DIR}/libdc1394.a"
+            "${CMU1394_LIB_DIR}/lib1394camera.a")
+      endif(HAVE_DC1394_2)
+  else(WIN32 AND MINGW)  # every other platform keeps the CHECK_MODULE detection
+    CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
+    if(NOT HAVE_DC1394_2)
+      CHECK_MODULE(libdc1394 HAVE_DC1394)
+    endif()
+  endif(WIN32 AND MINGW)
 endif(WITH_1394)
 
 # --- xine ---
@@ -226,7 +249,7 @@ endif(WITH_MSMF)
 
 # --- Extra HighGUI libs on Windows ---
 if(WIN32)
-  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 vfw32)
+  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 setupapi ws2_32 vfw32)
   if(MINGW64)
     list(APPEND HIGHGUI_LIBRARIES avifil32 avicap32 winmm msvfw32)
     list(REMOVE_ITEM HIGHGUI_LIBRARIES vfw32)
diff --git a/cmake/OpenCVFindXimea.cmake b/cmake/OpenCVFindXimea.cmake
index 5600275f47..27e2a78ad4 100644
--- a/cmake/OpenCVFindXimea.cmake
+++ b/cmake/OpenCVFindXimea.cmake
@@ -9,6 +9,7 @@
 #
 # Created: 5 Aug 2011 by Marian Zajko (marian.zajko@ximea.com)
 # Updated: 25 June 2012 by Igor Kuzmin (parafin@ximea.com)
+# Updated: 22 October 2012 by Marian Zajko (marian.zajko@ximea.com)
 #
 
 set(XIMEA_FOUND)
@@ -18,11 +19,15 @@ set(XIMEA_LIBRARY_DIR)
 if(WIN32)
   # Try to find the XIMEA API path in registry.
   GET_FILENAME_COMPONENT(XIMEA_PATH "[HKEY_CURRENT_USER\\Software\\XIMEA\\CamSupport\\API;Path]" ABSOLUTE)
-
-  if(EXISTS XIMEA_PATH)
+ 
+  if(EXISTS "${XIMEA_PATH}")  # quoted so an empty or ';'-containing registry value stays a single argument
     set(XIMEA_FOUND 1)
     # set LIB folders
-    set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    if(CMAKE_CL_64)  # true only for 64-bit MSVC (cl) builds; every other toolchain takes the x86 branch
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x64")
+    else()
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    endif()
   else()
     set(XIMEA_FOUND 0)
   endif()
@@ -38,5 +43,4 @@ endif()
 
 mark_as_advanced(FORCE XIMEA_FOUND)
 mark_as_advanced(FORCE XIMEA_PATH)
-mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
-
+mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index f44e3df8dc..00eb3cfa4a 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -470,7 +470,8 @@ endmacro()
 #   ocv_create_module(<extra link dependencies>)
 #   ocv_create_module(SKIP_LINK)
 macro(ocv_create_module)
-  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES})
+  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES}
+    "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp")
   if(NOT the_module STREQUAL opencv_ts)
     set_target_properties(${the_module} PROPERTIES COMPILE_DEFINITIONS OPENCV_NOSTL)
   endif()
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 5547a113c7..59366eb03b 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -501,6 +501,13 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
   endif()
 endmacro()
 
+# ocv_parse_pkg(<lib> <dir> <scope>): if <dir>/<lib>.pc exists, set ALIASOF_<lib>_VERSION to the first token after "Version:" in it
+macro(ocv_parse_pkg LIBNAME PKG_PATH SCOPE)  # SCOPE is accepted from callers but not used by this macro
+  if(EXISTS "${PKG_PATH}/${LIBNAME}.pc")
+    file(STRINGS "${PKG_PATH}/${LIBNAME}.pc" line_to_parse REGEX "^Version:[ \t]+[0-9.]*.*$" LIMIT_COUNT 1)
+    string(REGEX REPLACE "^Version:[ \t]+([^ \t]+).*$" "\\1" ALIASOF_${LIBNAME}_VERSION "${line_to_parse}")  # same separator class as the filter above
+  endif()
+endmacro()
 
 ################################################################################################
 # short command to setup source group
diff --git a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
index 9309b05c1d..4b3ffbcae2 100644
--- a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
+++ b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
@@ -85,7 +85,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
      std::vector< DMatch > good_matches;
 
      for( int i = 0; i < descriptors_1.rows; i++ )
-     { if( matches[i].distance < 2*min_dist )
+     { if( matches[i].distance <= 2*min_dist )
        { good_matches.push_back( matches[i]); }
      }
 
@@ -127,6 +127,3 @@ Result
    .. image:: images/Feature_FlannMatcher_Keypoints_Result.jpg
       :align: center
       :height: 250pt
-
-
-
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 64462eea8a..2486eb19a8 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -278,8 +278,8 @@ CV_EXPORTS int recoverPose( InputArray E, InputArray points1, InputArray points2
 
 
 //! finds coordinates of epipolar lines corresponding the specified points
-CV_EXPORTS void computeCorrespondEpilines( InputArray points, int whichImage,
-                                           InputArray F, OutputArray lines );
+CV_EXPORTS_W void computeCorrespondEpilines( InputArray points, int whichImage,
+                                             InputArray F, OutputArray lines );
 
 CV_EXPORTS_W void triangulatePoints( InputArray projMatr1, InputArray projMatr2,
                                      InputArray projPoints1, InputArray projPoints2,
diff --git a/modules/core/doc/basic_structures.rst b/modules/core/doc/basic_structures.rst
index fe13a3462b..70c7c0ebe2 100644
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -1741,7 +1741,7 @@ Returns the depth of a matrix element.
 
 .. ocv:function:: int Mat::depth() const
 
-The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed 3-channel array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:
+The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed element array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:
 
 * ``CV_8U``     - 8-bit unsigned integers ( ``0..255``     )
 
diff --git a/modules/core/include/opencv2/core/cuda/limits.hpp b/modules/core/include/opencv2/core/cuda/limits.hpp
index 4b265da0e5..0439de795c 100644
--- a/modules/core/include/opencv2/core/cuda/limits.hpp
+++ b/modules/core/include/opencv2/core/cuda/limits.hpp
@@ -43,193 +43,80 @@
 #ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
 #define __OPENCV_GPU_LIMITS_GPU_HPP__
 
-#include <limits>
+#include <limits.h>
+#include <float.h>
 #include "common.hpp"
 
 namespace cv { namespace gpu { namespace cudev
 {
-    template<class T> struct numeric_limits
-    {
-        typedef T type;
-        __device__ __forceinline__ static type min()  { return type(); };
-        __device__ __forceinline__ static type max() { return type(); };
-        __device__ __forceinline__ static type epsilon() { return type(); }
-        __device__ __forceinline__ static type round_error() { return type(); }
-        __device__ __forceinline__ static type denorm_min()  { return type(); }
-        __device__ __forceinline__ static type infinity() { return type(); }
-        __device__ __forceinline__ static type quiet_NaN() { return type(); }
-        __device__ __forceinline__ static type signaling_NaN() { return T(); }
-        static const bool is_signed;
-    };
 
-    template<> struct numeric_limits<bool>
-    {
-        typedef bool type;
-        __device__ __forceinline__ static type min() { return false; };
-        __device__ __forceinline__ static type max() { return true;  };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <class T> struct numeric_limits;
 
-    template<> struct numeric_limits<char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return CHAR_MIN; };
-        __device__ __forceinline__ static type max() { return CHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (char)-1 == -1;
-    };
+template <> struct numeric_limits<bool>
+{
+    __device__ __forceinline__ static bool min() { return false; }
+    __device__ __forceinline__ static bool max() { return true;  }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<signed char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return SCHAR_MIN; };
-        __device__ __forceinline__ static type max() { return SCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (signed char)-1 == -1;
-    };
+template <> struct numeric_limits<signed char>
+{
+    __device__ __forceinline__ static signed char min() { return SCHAR_MIN; }
+    __device__ __forceinline__ static signed char max() { return SCHAR_MAX; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned char>
-    {
-        typedef unsigned char type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned char>
+{
+    __device__ __forceinline__ static unsigned char min() { return 0; }
+    __device__ __forceinline__ static unsigned char max() { return UCHAR_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<short>
-    {
-        typedef short type;
-        __device__ __forceinline__ static type min() { return SHRT_MIN; };
-        __device__ __forceinline__ static type max() { return SHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<short>
+{
+    __device__ __forceinline__ static short min() { return SHRT_MIN; }
+    __device__ __forceinline__ static short max() { return SHRT_MAX; }
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned short>
-    {
-        typedef unsigned short type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return USHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned short>
+{
+    __device__ __forceinline__ static unsigned short min() { return 0; }
+    __device__ __forceinline__ static unsigned short max() { return USHRT_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<int>
-    {
-        typedef int type;
-        __device__ __forceinline__ static type min() { return INT_MIN; };
-        __device__ __forceinline__ static type max() { return INT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<int>
+{
+    __device__ __forceinline__ static int min() { return INT_MIN; }
+    __device__ __forceinline__ static int max() { return INT_MAX; }
+    static const bool is_signed = true;
+};
 
+template <> struct numeric_limits<unsigned int>
+{
+    __device__ __forceinline__ static unsigned int min() { return 0; }
+    __device__ __forceinline__ static unsigned int max() { return UINT_MAX; }
+    static const bool is_signed = false;
+};
 
-    template<> struct numeric_limits<unsigned int>
-    {
-        typedef unsigned int type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UINT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<float> // float limits callable from device code, values taken from <float.h>
+{
+    __device__ __forceinline__ static float min() { return FLT_MIN; } // smallest positive normalized float, not the most negative value
+    __device__ __forceinline__ static float max() { return FLT_MAX; } // largest finite float
+    __device__ __forceinline__ static float epsilon() { return FLT_EPSILON; } // gap between 1.0f and the next representable float
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<long>
-    {
-        typedef long type;
-        __device__ __forceinline__ static type min() { return LONG_MIN; };
-        __device__ __forceinline__ static type max() { return LONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<double> // double limits callable from device code, values taken from <float.h>
+{
+    __device__ __forceinline__ static double min() { return DBL_MIN; } // smallest positive normalized double, not the most negative value
+    __device__ __forceinline__ static double max() { return DBL_MAX; } // largest finite double
+    __device__ __forceinline__ static double epsilon() { return DBL_EPSILON; } // gap between 1.0 and the next representable double
+    static const bool is_signed = true;
+};
 
-    template<> struct numeric_limits<unsigned long>
-    {
-        typedef unsigned long type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return ULONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
-
-    template<> struct numeric_limits<float>
-    {
-        typedef float type;
-        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
-        __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
-        __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
-
-    template<> struct numeric_limits<double>
-    {
-        typedef double type;
-        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
-        __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
 }}} // namespace cv { namespace gpu { namespace cudev {
 
 #endif // __OPENCV_GPU_LIMITS_GPU_HPP__
diff --git a/modules/core/include/opencv2/core/gpu.hpp b/modules/core/include/opencv2/core/gpu.hpp
index 775a9d0267..9bacfc1a26 100644
--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@@ -375,19 +375,6 @@ public:
     //! returns true if stream object is not default (!= 0)
     operator bool_type() const;
 
-    // obsolete methods
-
-    void enqueueDownload(const GpuMat& src, OutputArray dst);
-
-    void enqueueUpload(InputArray src, GpuMat& dst);
-
-    void enqueueCopy(const GpuMat& src, OutputArray dst);
-
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, InputArray mask);
-
-    void enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha = 1.0, double beta = 0.0);
-
     class Impl;
 
 private:
@@ -529,10 +516,10 @@ public:
     size_t totalConstMem() const;
 
     //! major compute capability
-    int major() const;
+    int majorVersion() const;
 
     //! minor compute capability
-    int minor() const;
+    int minorVersion() const;
 
     //! alignment requirement for textures
     size_t textureAlignment() const;
diff --git a/modules/core/include/opencv2/core/gpu.inl.hpp b/modules/core/include/opencv2/core/gpu.inl.hpp
index b44c2b1511..13861170cb 100644
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@@ -525,42 +525,6 @@ void swap(CudaMem& a, CudaMem& b)
 
 //////////////////////////////// Stream ///////////////////////////////
 
-inline
-void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)
-{
-    src.download(dst, *this);
-}
-
-inline
-void Stream::enqueueUpload(InputArray src, GpuMat& dst)
-{
-    dst.upload(src, *this);
-}
-
-inline
-void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
-{
-    src.copyTo(dst, *this);
-}
-
-inline
-void Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    src.setTo(val, *this);
-}
-
-inline
-void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
-{
-    src.setTo(val, mask, *this);
-}
-
-inline
-void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)
-{
-    src.convertTo(dst, dtype, alpha, beta, *this);
-}
-
 inline
 Stream::Stream(const Ptr<Impl>& impl)
     : impl_(impl)
@@ -619,7 +583,7 @@ size_t DeviceInfo::totalMemory() const
 inline
 bool DeviceInfo::supports(FeatureSet feature_set) const
 {
-    int version = major() * 10 + minor();
+    int version = majorVersion() * 10 + minorVersion();
     return version >= feature_set;
 }
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 8c9b10ceaa..b1162a1426 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -78,7 +78,8 @@ public:
         EXPR              = 6 << KIND_SHIFT,
         OPENGL_BUFFER     = 7 << KIND_SHIFT,
         CUDA_MEM          = 8 << KIND_SHIFT,
-        GPU_MAT           = 9 << KIND_SHIFT
+        GPU_MAT           = 9 << KIND_SHIFT,
+        OCL_MAT           =10 << KIND_SHIFT
     };
 
     _InputArray();
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 6115e3de1e..52c63d4ca5 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -77,6 +77,7 @@ struct CV_EXPORTS Matx_AddOp {};
 struct CV_EXPORTS Matx_SubOp {};
 struct CV_EXPORTS Matx_ScaleOp {};
 struct CV_EXPORTS Matx_MulOp {};
+struct CV_EXPORTS Matx_DivOp {};
 struct CV_EXPORTS Matx_MatMulOp {};
 struct CV_EXPORTS Matx_TOp {};
 
@@ -174,6 +175,7 @@ public:
     Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp);
     template<typename _T2> Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp);
     Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp);
     template<int l> Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp);
     Matx(const Matx<_Tp, n, m>& a, Matx_TOp);
 
@@ -746,6 +748,13 @@ Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_Mul
         val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]);
 }
 
+template<typename _Tp, int m, int n> inline // element-wise division constructor, selected by the Matx_DivOp tag argument
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp)
+{
+    for( int i = 0; i < channels; i++ )
+        val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]); // no zero-divisor check; each quotient goes through saturate_cast<_Tp>
+}
+
 template<typename _Tp, int m, int n> template<int l> inline
 Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp)
 {
@@ -1162,6 +1171,12 @@ Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b)
     return (const Vec<_Tp, m>&)(c);
 }
 
+template<typename _Tp, int m, int n> static inline // a / b for two matrices of identical element type and shape
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_DivOp()); // delegates to the Matx_DivOp-tagged constructor
+}
+
 template<typename _Tp, int m, int n> static inline
 bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
 {
@@ -1337,4 +1352,4 @@ template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const V
 
 } // cv
 
-#endif // __OPENCV_CORE_MATX_HPP__
\ No newline at end of file
+#endif // __OPENCV_CORE_MATX_HPP__
diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp
index 12961b32c4..6d3cd9b41d 100644
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -71,6 +71,30 @@
 #  endif
 #endif
 
+#ifdef _OPENMP // predefined by the compiler when OpenMP support is switched on
+#  define HAVE_OPENMP
+#endif
+
+#ifdef __APPLE__ // Apple targets: mark the "gcd" backend as available
+#  define HAVE_GCD
+#endif
+
+#if defined _MSC_VER && _MSC_VER >= 1600 // _MSC_VER 1600 corresponds to Visual C++ 2010
+#  define HAVE_CONCURRENCY
+#endif
+
+#if defined HAVE_TBB // first matching branch wins; when none matches the macro stays undefined
+#  define CV_PARALLEL_FRAMEWORK "tbb"
+#elif defined HAVE_CSTRIPES
+#  define CV_PARALLEL_FRAMEWORK "cstripes"
+#elif defined HAVE_OPENMP
+#  define CV_PARALLEL_FRAMEWORK "openmp"
+#elif defined HAVE_GCD
+#  define CV_PARALLEL_FRAMEWORK "gcd"
+#elif defined HAVE_CONCURRENCY
+#  define CV_PARALLEL_FRAMEWORK "ms-concurrency"
+#endif
+
 namespace cv
 {
 #ifdef HAVE_TBB
diff --git a/modules/core/perf/perf_reduce.cpp b/modules/core/perf/perf_reduce.cpp
index 93d3a14166..7b74b0e7e3 100644
--- a/modules/core/perf/perf_reduce.cpp
+++ b/modules/core/perf/perf_reduce.cpp
@@ -34,7 +34,8 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
     declare.in(src, WARMUP_RNG).out(vec);
     declare.time(100);
 
-    TEST_CYCLE() reduce(src, vec, 0, reduceOp, ddepth);
+    int runs = 15;
+    TEST_CYCLE_MULTIRUN(runs) reduce(src, vec, 0, reduceOp, ddepth);
 
     SANITY_CHECK(vec, 1);
 }
@@ -65,4 +66,3 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,
 
     SANITY_CHECK(vec, 1);
 }
-
diff --git a/modules/core/src/gpu_info.cpp b/modules/core/src/gpu_info.cpp
index 7520380caa..5a1e567463 100644
--- a/modules/core/src/gpu_info.cpp
+++ b/modules/core/src/gpu_info.cpp
@@ -119,7 +119,7 @@ bool cv::gpu::deviceSupports(FeatureSet feature_set)
     else
     {
         DeviceInfo dev(devId);
-        version = dev.major() * 10 + dev.minor();
+        version = dev.majorVersion() * 10 + dev.minorVersion();
         if (devId < cache_size)
             versions[devId] = version;
     }
@@ -455,7 +455,7 @@ size_t cv::gpu::DeviceInfo::totalConstMem() const
 #endif
 }
 
-int cv::gpu::DeviceInfo::major() const
+int cv::gpu::DeviceInfo::majorVersion() const
 {
 #ifndef HAVE_CUDA
     throw_no_cuda();
@@ -465,7 +465,7 @@ int cv::gpu::DeviceInfo::major() const
 #endif
 }
 
-int cv::gpu::DeviceInfo::minor() const
+int cv::gpu::DeviceInfo::minorVersion() const
 {
 #ifndef HAVE_CUDA
     throw_no_cuda();
@@ -569,7 +569,12 @@ int cv::gpu::DeviceInfo::maxTexture1DMipmap() const
     throw_no_cuda();
     return 0;
 #else
-    return deviceProps().get(device_id_)->maxTexture1DMipmap;
+    #if CUDA_VERSION >= 5000
+        return deviceProps().get(device_id_)->maxTexture1DMipmap;
+    #else
+        CV_Error(Error::StsNotImplemented, "This function requires CUDA 5.0");
+        return 0;
+    #endif
 #endif
 }
 
@@ -599,7 +604,12 @@ Vec2i cv::gpu::DeviceInfo::maxTexture2DMipmap() const
     throw_no_cuda();
     return Vec2i();
 #else
-    return Vec2i(deviceProps().get(device_id_)->maxTexture2DMipmap);
+    #if CUDA_VERSION >= 5000
+        return Vec2i(deviceProps().get(device_id_)->maxTexture2DMipmap);
+    #else
+        CV_Error(Error::StsNotImplemented, "This function requires CUDA 5.0");
+        return Vec2i();
+    #endif
 #endif
 }
 
@@ -898,12 +908,12 @@ bool cv::gpu::DeviceInfo::isCompatible() const
     return false;
 #else
     // Check PTX compatibility
-    if (TargetArchs::hasEqualOrLessPtx(major(), minor()))
+    if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
         return true;
 
     // Check BIN compatibility
-    for (int i = minor(); i >= 0; --i)
-        if (TargetArchs::hasBin(major(), i))
+    for (int i = minorVersion(); i >= 0; --i)
+        if (TargetArchs::hasBin(majorVersion(), i))
             return true;
 
     return false;
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 158ff8e45c..404c5b4341 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2850,9 +2850,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
 
     if( _mean.data )
     {
-        CV_Assert( _mean.size() == mean_sz );        
+        CV_Assert( _mean.size() == mean_sz );
         _mean.convertTo(mean, ctype);
-        covar_flags |= CV_COVAR_USE_AVG; 
+        covar_flags |= CV_COVAR_USE_AVG;
     }
 
     calcCovarMatrix( data, covar, mean, covar_flags, ctype );
@@ -2896,6 +2896,36 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
     return *this;
 }
 
+// Returns the first row index at which the cumulative share of the eigenvalue total
+// exceeds retainedVariance (eigenvalues.rows if none does), clamped to at least 2.
+template <typename T>
+int computeCumulativeEnergy(const Mat& eigenvalues, double retainedVariance)
+{
+    CV_DbgAssert( eigenvalues.type() == DataType<T>::type );
+
+    // g(i, 0) holds eigenvalues(0, 0) + ... + eigenvalues(i, 0); a running total in T
+    // performs the same additions as re-summing every prefix, in linear time
+    Mat g(eigenvalues.size(), DataType<T>::type);
+    T runningSum = 0;
+    for(int ig = 0; ig < g.rows; ig++)
+    {
+        runningSum += eigenvalues.at<T>(ig, 0);
+        g.at<T>(ig, 0) = runningSum;
+    }
+
+    // stop at the first prefix whose share of the total is strictly above the target;
+    // L ends up equal to eigenvalues.rows when no prefix qualifies
+    int L;
+    for(L = 0; L < eigenvalues.rows; L++)
+    {
+        double energy = g.at<T>(L, 0) / g.at<T>(g.rows - 1, 0);
+        if(energy > retainedVariance)
+            break;
+    }
+
+    return std::max(2, L); // the result is never smaller than 2
+}
+
 PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, double retainedVariance)
 {
     Mat data = _data.getMat(), _mean = __mean.getMat();
@@ -2972,26 +3002,11 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, double reta
     }
 
     // compute the cumulative energy content for each eigenvector
-    Mat g(eigenvalues.size(), ctype);
-
-    for(int ig = 0; ig < g.rows; ig++)
-    {
-        g.at<float>(ig,0) = 0;
-        for(int im = 0; im <= ig; im++)
-        {
-            g.at<float>(ig,0) += eigenvalues.at<float>(im,0);
-        }
-    }
-
     int L;
-    for(L = 0; L < eigenvalues.rows; L++)
-    {
-        double energy = g.at<float>(L, 0) / g.at<float>(g.rows - 1, 0);
-        if(energy > retainedVariance)
-            break;
-    }
-
-    L = std::max(2, L);
+    if (ctype == CV_32F)
+        L = computeCumulativeEnergy<float>(eigenvalues, retainedVariance);
+    else
+        L = computeCumulativeEnergy<double>(eigenvalues, retainedVariance);
 
     // use clone() to physically copy the data and thus deallocate the original matrices
     eigenvalues = eigenvalues.rowRange(0,L).clone();
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 053dd1cef0..d2032b2e5c 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -995,6 +995,11 @@ Mat _InputArray::getMat(int i) const
         return !v.empty() ? Mat(size(i), t, (void*)&v[0]) : Mat();
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     if( k == STD_VECTOR_MAT )
     {
         const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
@@ -1100,6 +1105,11 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == STD_VECTOR_MAT );
     //if( k == STD_VECTOR_MAT )
     {
@@ -1224,6 +1234,11 @@ Size _InputArray::size(int i) const
         return d_mat->size();
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == CUDA_MEM );
     //if( k == CUDA_MEM )
     {
@@ -1338,6 +1353,11 @@ bool _InputArray::empty() const
     if( k == OPENGL_BUFFER )
         return ((const ogl::Buffer*)obj)->empty();
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     if( k == GPU_MAT )
         return ((const gpu::GpuMat*)obj)->empty();
 
@@ -1573,6 +1593,11 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i, bool all
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     if( k == NONE )
     {
         CV_Error(CV_StsNullPtr, "create() called for the missing output array" );
@@ -1684,6 +1709,11 @@ void _OutputArray::release() const
         return;
     }
 
+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
     CV_Assert( k == STD_VECTOR_MAT );
     //if( k == STD_VECTOR_MAT )
     {
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 0b2a845ac1..51b165275f 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -61,17 +61,6 @@
     #endif
 #endif
 
-#ifdef _OPENMP
-    #define HAVE_OPENMP
-#endif
-
-#ifdef __APPLE__
-    #define HAVE_GCD
-#endif
-
-#if defined _MSC_VER && _MSC_VER >= 1600
-    #define HAVE_CONCURRENCY
-#endif
 
 /* IMPORTANT: always use the same order of defines
    1. HAVE_TBB         - 3rdparty library, should be explicitly enabled
@@ -110,10 +99,6 @@
     #endif
 #endif
 
-#if defined HAVE_TBB || defined HAVE_CSTRIPES || defined HAVE_OPENMP || defined HAVE_GCD || defined HAVE_CONCURRENCY
-   #define HAVE_PARALLEL_FRAMEWORK
-#endif
-
 namespace cv
 {
     ParallelLoopBody::~ParallelLoopBody() {}
@@ -121,7 +106,7 @@ namespace cv
 
 namespace
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
     class ParallelLoopBodyWrapper
     {
     public:
@@ -218,7 +203,7 @@ public:
 static SchedPtr pplScheduler;
 #endif
 
-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK
 
 } //namespace
 
@@ -226,7 +211,7 @@ static SchedPtr pplScheduler;
 
 void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
 
     if(numThreads != 0)
     {
@@ -281,7 +266,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,
     }
     else
 
-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK
     {
         (void)nstripes;
         body(range);
@@ -290,7 +275,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,
 
 int cv::getNumThreads(void)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
 
     if(numThreads == 0)
         return 1;
@@ -333,7 +318,7 @@ int cv::getNumThreads(void)
 void cv::setNumThreads( int threads )
 {
     (void)threads;
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
     numThreads = threads;
 #endif
 
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index 6b36883cfe..1fbc4242fb 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -75,6 +75,7 @@ protected:
     bool TestSparseMat();
     bool TestVec();
     bool TestMatxMultiplication();
+    bool TestMatxElementwiseDivison();
     bool TestSubMatAccess();
     bool TestExp();
     bool TestSVD();
@@ -891,6 +892,28 @@ bool CV_OperationsTest::TestMatxMultiplication()
     return true;
 }
 
+bool CV_OperationsTest::TestMatxElementwiseDivison()
+{
+    // Checks that Matx / Matx yields the per-element quotients (2/2, 4/2, 6/2, 8/2).
+    try
+    {
+        Matx22f numer(2, 4, 6, 8);
+        Matx22f denom(2, 2, 2, 2);
+        Matx22f quot = numer / denom;
+
+        const double expected[2][2] = { {1.0, 2.0}, {3.0, 4.0} };
+        for (int r = 0; r < 2; ++r)
+            for (int c = 0; c < 2; ++c)
+                if (quot(r, c) != expected[r][c]) throw test_excep();
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
+
 
 bool CV_OperationsTest::TestVec()
 {
@@ -1109,6 +1132,9 @@ void CV_OperationsTest::run( int /* start_from */)
     if (!TestMatxMultiplication())
         return;
 
+    if (!TestMatxElementwiseDivison())
+        return;
+
     if (!TestSubMatAccess())
         return;
 
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
index 552bf8491b..4c49cad61a 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@@ -189,7 +189,7 @@ For each query descriptor, finds the training descriptors not farther than the s
 
     :param compactResult: Parameter used when the mask (or masks) is not empty. If  ``compactResult``  is false, the  ``matches``  vector has the same size as  ``queryDescriptors``  rows. If  ``compactResult``  is true, the  ``matches``  vector does not contain matches for fully masked-out query descriptors.
 
-    :param maxDistance: Threshold for the distance between matched descriptors.
+    :param maxDistance: Threshold for the distance between matched descriptors. Distance here means the metric distance between descriptors (e.g. Hamming distance), not the distance between coordinates (which is measured in pixels).
 
 For each query descriptor, the methods find such training descriptors that the distance between the query descriptor and the training descriptor is equal or smaller than ``maxDistance``. Found matches are returned in the distance increasing order.
 
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 301b216d98..4b21ddb09a 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -206,6 +206,8 @@ public:
                                      OutputArray descriptors,
                                      bool useProvidedKeypoints=false ) const = 0;
 
+    CV_WRAP void compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const;
+
     // Create feature detector and descriptor extractor by name.
     CV_WRAP static Ptr<Feature2D> create( const String& name );
 };
diff --git a/modules/features2d/src/descriptors.cpp b/modules/features2d/src/descriptors.cpp
index f8aba8958b..4f434032ec 100644
--- a/modules/features2d/src/descriptors.cpp
+++ b/modules/features2d/src/descriptors.cpp
@@ -105,6 +105,12 @@ Ptr<DescriptorExtractor> DescriptorExtractor::create(const String& descriptorExt
     return Algorithm::create<DescriptorExtractor>("Feature2D." + descriptorExtractorType);
 }
 
+
+CV_WRAP void Feature2D::compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const
+{
+   DescriptorExtractor::compute(image, keypoints, descriptors); // forwards all three arguments unchanged to DescriptorExtractor::compute
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /****************************************************************************************\
diff --git a/modules/gpu/doc/initalization_and_information.rst b/modules/gpu/doc/initalization_and_information.rst
index ad4b29d420..abfc0860cf 100644
--- a/modules/gpu/doc/initalization_and_information.rst
+++ b/modules/gpu/doc/initalization_and_information.rst
@@ -147,10 +147,10 @@ Class providing functionality for querying the specified GPU properties. ::
         size_t totalConstMem() const;
 
         //! major compute capability
-        int major() const;
+        int majorVersion() const;
 
         //! minor compute capability
-        int minor() const;
+        int minorVersion() const;
 
         //! alignment requirement for textures
         size_t textureAlignment() const;
@@ -313,19 +313,19 @@ Returns the device name.
 
 
 
-gpu::DeviceInfo::major
-----------------------
+gpu::DeviceInfo::majorVersion
+-----------------------------
 Returns the major compute capability version.
 
-.. ocv:function:: int gpu::DeviceInfo::major()
+.. ocv:function:: int gpu::DeviceInfo::majorVersion()
 
 
 
-gpu::DeviceInfo::minor
-----------------------
+gpu::DeviceInfo::minorVersion
+-----------------------------
 Returns the minor compute capability version.
 
-.. ocv:function:: int gpu::DeviceInfo::minor()
+.. ocv:function:: int gpu::DeviceInfo::minorVersion()
 
 
 
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 0f1da83cef..74867b48dd 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -458,7 +458,7 @@ public:
 
                 // generate integral for scale
                 gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-                gpu::integralBuffered(src, sint, buff);
+                gpu::integral(src, sint, buff);
 
                 // calculate job
                 int totalWidth = level.workArea.width / step;
diff --git a/modules/gpuarithm/doc/arithm.rst b/modules/gpuarithm/doc/arithm.rst
index 8a051bc49c..09b7220c09 100644
--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@@ -6,10 +6,10 @@ Arithm Operations on Matrices
 
 
 gpu::gemm
-------------------
+---------
 Performs generalized matrix multiplication.
 
-.. ocv:function:: void gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())
 
     :param src1: First multiplied input matrix that should have  ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or  ``CV_64FC2``  type.
 
@@ -44,38 +44,40 @@ The function performs generalized matrix multiplication similar to the ``gemm``
 
 
 gpu::mulSpectrums
----------------------
+-----------------
 Performs a per-element multiplication of two Fourier spectrums.
 
-.. ocv:function:: void gpu::mulSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())
 
-    :param a: First spectrum.
+    :param src1: First spectrum.
 
-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as ``src1`` .
 
-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.
 
     :param flags: Mock parameter used for CPU/GPU interfaces similarity.
 
     :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
 
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+    :param stream: Stream for the asynchronous version.
+
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
 
 .. seealso:: :ocv:func:`mulSpectrums`
 
 
 
 gpu::mulAndScaleSpectrums
------------------------------
+-------------------------
 Performs a per-element multiplication of two Fourier spectrums and scales the result.
 
-.. ocv:function:: void gpu::mulAndScaleSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())
 
-    :param a: First spectrum.
+    :param src1: First spectrum.
 
-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as ``src1`` .
 
-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.
 
     :param flags: Mock parameter used for CPU/GPU interfaces similarity.
 
@@ -83,17 +85,17 @@ Performs a per-element multiplication of two Fourier spectrums and scales the re
 
     :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
 
-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
 
 .. seealso:: :ocv:func:`mulSpectrums`
 
 
 
 gpu::dft
-------------
+--------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
 
-.. ocv:function:: void gpu::dft( const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())
 
     :param src: Source matrix (real or complex).
 
@@ -125,46 +127,25 @@ The source matrix should be continuous, otherwise reallocation and data copying
 
 
 
-gpu::ConvolveBuf
+gpu::Convolution
 ----------------
-.. ocv:struct:: gpu::ConvolveBuf
+.. ocv:class:: gpu::Convolution : public Algorithm
 
-Class providing a memory buffer for :ocv:func:`gpu::convolve` function, plus it allows to adjust some specific parameters. ::
+Base class for convolution (or cross-correlation) operator. ::
 
-    struct CV_EXPORTS ConvolveBuf
+    class CV_EXPORTS Convolution : public Algorithm
     {
-        Size result_size;
-        Size block_size;
-        Size user_block_size;
-        Size dft_size;
-        int spect_len;
-
-        GpuMat image_spect, templ_spect, result_spect;
-        GpuMat image_block, templ_block, result_data;
-
-        void create(Size image_size, Size templ_size);
-        static Size estimateBlockSize(Size result_size, Size templ_size);
+    public:
+        virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
     };
 
-You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
 
 
-
-gpu::ConvolveBuf::create
-------------------------
-.. ocv:function:: gpu::ConvolveBuf::create(Size image_size, Size templ_size)
-
-Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments.
-
-
-
-gpu::convolve
------------------
+gpu::Convolution::convolve
+--------------------------
 Computes a convolution (or cross-correlation) of two images.
 
-.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false)
-
-.. ocv:function:: void gpu::convolve( const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())
 
     :param image: Source image. Only  ``CV_32FC1`` images are supported for now.
 
@@ -174,38 +155,14 @@ Computes a convolution (or cross-correlation) of two images.
 
     :param ccorr: Flags to evaluate cross-correlation instead of convolution.
 
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::ConvolveBuf`.
-
     :param stream: Stream for the asynchronous version.
 
-.. seealso:: :ocv:func:`gpu::filter2D`
 
 
+gpu::createConvolution
+----------------------
+Creates implementation for :ocv:class:`gpu::Convolution` .
 
-gpu::integral
------------------
-Computes an integral image.
+.. ocv:function:: Ptr<Convolution> gpu::createConvolution(Size user_block_size = Size())
 
-.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`integral`
-
-
-
-gpu::sqrIntegral
---------------------
-Computes a squared integral image.
-
-.. ocv:function:: void gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
-
-    :param stream: Stream for the asynchronous version.
+    :param user_block_size: Block size. If you leave the default value ``Size(0,0)``, automatic estimation of the block size is used (which is optimized for speed). By varying ``user_block_size`` you can reduce memory requirements at the cost of speed.
diff --git a/modules/gpuarithm/doc/core.rst b/modules/gpuarithm/doc/core.rst
index 50599bcf2a..624ea3e7b3 100644
--- a/modules/gpuarithm/doc/core.rst
+++ b/modules/gpuarithm/doc/core.rst
@@ -6,12 +6,12 @@ Core Operations on Matrices
 
 
 gpu::merge
---------------
+----------
 Makes a multi-channel matrix out of several single-channel matrices.
 
-.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())
 
     :param src: Array/vector of source matrices.
 
@@ -26,12 +26,12 @@ Makes a multi-channel matrix out of several single-channel matrices.
 
 
 gpu::split
---------------
+----------
 Copies each plane of a multi-channel matrix into an array.
 
-.. ocv:function:: void gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null())
 
     :param src: Source matrix.
 
@@ -43,15 +43,95 @@ Copies each plane of a multi-channel matrix into an array.
 
 
 
+gpu::transpose
+--------------
+Transposes a matrix.
+
+.. ocv:function:: void gpu::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now.
+
+    :param dst: Destination matrix.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`transpose`
+
+
+
+gpu::flip
+---------
+Flips a 2D matrix around vertical, horizontal, or both axes.
+
+.. ocv:function:: void gpu::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
+
+    :param dst: Destination matrix.
+
+    :param flipCode: Flip mode for the source:
+
+        * ``0`` Flips around x-axis.
+
+        * ``> 0`` Flips around y-axis.
+
+        * ``< 0`` Flips around both axes.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`flip`
+
+
+
+gpu::LookUpTable
+----------------
+.. ocv:class:: gpu::LookUpTable : public Algorithm
+
+Base class for transform using lookup table. ::
+
+    class CV_EXPORTS LookUpTable : public Algorithm
+    {
+    public:
+        virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+    };
+
+.. seealso:: :ocv:func:`LUT`
+
+
+
+gpu::LookUpTable::transform
+---------------------------
+Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))`` .
+
+.. ocv:function:: void gpu::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
+
+    :param dst: Destination matrix.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::createLookUpTable
+----------------------
+Creates implementation for :ocv:class:`gpu::LookUpTable` .
+
+.. ocv:function:: Ptr<LookUpTable> gpu::createLookUpTable(InputArray lut)
+
+    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
+
+
+
 gpu::copyMakeBorder
 -----------------------
 Forms a border around an image.
 
-.. ocv:function:: void gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())
 
-    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and  ``CV_32FC1`` types are supported.
+    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and ``CV_32FC1`` types are supported.
 
-    :param dst: Destination image with the same type as  ``src``. The size is  ``Size(src.cols+left+right, src.rows+top+bottom)`` .
+    :param dst: Destination image with the same type as  ``src``. The size is ``Size(src.cols+left+right, src.rows+top+bottom)`` .
 
     :param top:
 
@@ -68,61 +148,3 @@ Forms a border around an image.
     :param stream: Stream for the asynchronous version.
 
 .. seealso:: :ocv:func:`copyMakeBorder`
-
-
-
-gpu::transpose
-------------------
-Transposes a matrix.
-
-.. ocv:function:: void gpu::transpose( const GpuMat& src1, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc).
-
-    :param dst: Destination matrix.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`transpose`
-
-
-
-gpu::flip
--------------
-Flips a 2D matrix around vertical, horizontal, or both axes.
-
-.. ocv:function:: void gpu::flip( const GpuMat& a, GpuMat& b, int flipCode, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
-
-    :param b: Destination matrix.
-
-    :param flipCode: Flip mode for the source:
-
-        * ``0`` Flips around x-axis.
-
-        * ``>0`` Flips around y-axis.
-
-        * ``<0`` Flips around both axes.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`flip`
-
-
-
-gpu::LUT
-------------
-Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))``
-
-.. ocv:function:: void gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
-
-    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
-
-    :param dst: Destination matrix with the same depth as  ``lut``  and the same number of channels as  ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`LUT`
diff --git a/modules/gpuarithm/doc/element_operations.rst b/modules/gpuarithm/doc/element_operations.rst
index eae2ad7a2a..eb616c1c39 100644
--- a/modules/gpuarithm/doc/element_operations.rst
+++ b/modules/gpuarithm/doc/element_operations.rst
@@ -6,20 +6,16 @@ Per-element Operations
 
 
 gpu::add
-------------
+--------
 Computes a matrix-matrix or matrix-scalar sum.
 
-.. ocv:function:: void gpu::add( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::add( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.
 
-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .
 
-    :param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
 
     :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
 
@@ -32,20 +28,16 @@ Computes a matrix-matrix or matrix-scalar sum.
 
 
 gpu::subtract
------------------
+-------------
 Computes a matrix-matrix or matrix-scalar difference.
 
-.. ocv:function:: void gpu::subtract( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::subtract( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.
 
-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .
 
-    :param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
 
     :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
 
@@ -58,20 +50,16 @@ Computes a matrix-matrix or matrix-scalar difference.
 
 
 gpu::multiply
------------------
+-------------
 Computes a matrix-matrix or matrix-scalar per-element product.
 
-.. ocv:function:: void gpu::multiply( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::multiply( const GpuMat& a, const Scalar& sc, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.
 
-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar.
 
-    :param b: Second source matrix to be multiplied by ``a`` elements.
-
-    :param sc: A scalar to be multiplied by ``a`` elements.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
 
     :param scale: Optional scale factor.
 
@@ -87,19 +75,15 @@ gpu::divide
 -----------
 Computes a matrix-matrix or matrix-scalar division.
 
-.. ocv:function:: void gpu::divide( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::divide( double scale, const GpuMat& b, GpuMat& c, int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or a scalar.
 
-    :param a: First source matrix or a scalar.
+    :param src2: Second source matrix or scalar.
 
-    :param b: Second source matrix. The ``a`` elements are divided by it.
-
-    :param sc: A scalar to be divided by the elements of ``a`` matrix.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
 
     :param scale: Optional scale factor.
 
@@ -113,11 +97,296 @@ This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mod
 
 
 
+gpu::absdiff
+------------
+Computes per-element absolute difference of two matrices (or of a matrix and scalar).
+
+.. ocv:function:: void gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`absdiff`
+
+
+
+gpu::abs
+--------
+Computes an absolute value of each matrix element.
+
+.. ocv:function:: void gpu::abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`abs`
+
+
+
+gpu::sqr
+--------
+Computes a square value of each matrix element.
+
+.. ocv:function:: void gpu::sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::sqrt
+---------
+Computes a square root of each matrix element.
+
+.. ocv:function:: void gpu::sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`sqrt`
+
+
+
+gpu::exp
+--------
+Computes an exponent of each matrix element.
+
+.. ocv:function:: void gpu::exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`exp`
+
+
+
+gpu::log
+--------
+Computes a natural logarithm of the absolute value of each matrix element.
+
+.. ocv:function:: void gpu::log(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`log`
+
+
+
+gpu::pow
+--------
+Raises every matrix element to a power.
+
+.. ocv:function:: void gpu::pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param power: Exponent of power.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+The function ``pow`` raises every element of the input matrix to ``power`` :
+
+.. math::
+
+    \texttt{dst} (I) =  \fork{\texttt{src}(I)^{\texttt{power}}}{if \texttt{power} is integer}{|\texttt{src}(I)|^{\texttt{power}}}{otherwise}
+
+.. seealso:: :ocv:func:`pow`
+
+
+
+gpu::compare
+------------
+Compares elements of two matrices (or of a matrix and scalar).
+
+.. ocv:function:: void gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param cmpop: Flag specifying the relation between the elements to be checked:
+
+            * **CMP_EQ:** ``src1(.) == src2(.)``
+            * **CMP_GT:** ``src1(.) > src2(.)``
+            * **CMP_GE:** ``src1(.) >= src2(.)``
+            * **CMP_LT:** ``src1(.) < src2(.)``
+            * **CMP_LE:** ``src1(.) <= src2(.)``
+            * **CMP_NE:** ``src1(.) != src2(.)``
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`compare`
+
+
+
+gpu::bitwise_not
+----------------
+Performs a per-element bitwise inversion.
+
+.. ocv:function:: void gpu::bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_or
+---------------
+Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_and
+----------------
+Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_xor
+----------------
+Performs a per-element bitwise ``exclusive or`` operation of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::rshift
+-----------
+Performs pixel by pixel right shift of an image by a constant value.
+
+.. ocv:function:: void gpu::rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1-, 3- and 4-channel images with integer elements.
+
+    :param val: Constant values, one per channel.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::lshift
+-----------
+Performs pixel by pixel left shift of an image by a constant value.
+
+.. ocv:function:: void gpu::lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
+
+    :param val: Constant values, one per channel.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::min
+--------
+Computes the per-element minimum of two matrices (or a matrix and a scalar).
+
+.. ocv:function:: void gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`min`
+
+
+
+gpu::max
+--------
+Computes the per-element maximum of two matrices (or a matrix and a scalar).
+
+.. ocv:function:: void gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`max`
+
+
+
 gpu::addWeighted
 ----------------
 Computes the weighted sum of two arrays.
 
-.. ocv:function:: void gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
 
     :param src1: First source array.
 
@@ -147,311 +416,11 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha
 
 
 
-gpu::abs
-------------
-Computes an absolute value of each matrix element.
-
-.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`abs`
-
-
-
-gpu::sqr
-------------
-Computes a square value of each matrix element.
-
-.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::sqrt
-------------
-Computes a square root of each matrix element.
-
-.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`sqrt`
-
-
-
-gpu::exp
-------------
-Computes an exponent of each matrix element.
-
-.. ocv:function:: void gpu::exp( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`exp`
-
-
-
-gpu::log
-------------
-Computes a natural logarithm of absolute value of each matrix element.
-
-.. ocv:function:: void gpu::log( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`log`
-
-
-
-gpu::pow
-------------
-Raises every matrix element to a power.
-
-.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports all type, except ``CV_64F`` depth.
-
-    :param power: Exponent of power.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-The function ``pow`` raises every element of the input matrix to ``p`` :
-
-.. math::
-
-    \texttt{dst} (I) =  \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise}
-
-.. seealso:: :ocv:func:`pow`
-
-
-
-gpu::absdiff
-----------------
-Computes per-element absolute difference of two matrices (or of a matrix and scalar).
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix to be added to ``a`` .
-
-    :param s: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`absdiff`
-
-
-
-gpu::compare
-----------------
-Compares elements of two matrices.
-
-.. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix with the same size and type as ``a`` .
-
-    :param sc: A scalar to be compared with ``a`` .
-
-    :param c: Destination matrix with the same size as ``a`` and the ``CV_8UC1`` type.
-
-    :param cmpop: Flag specifying the relation between the elements to be checked:
-
-            * **CMP_EQ:** ``a(.) == b(.)``
-            * **CMP_GT:** ``a(.) < b(.)``
-            * **CMP_GE:** ``a(.) <= b(.)``
-            * **CMP_LT:** ``a(.) < b(.)``
-            * **CMP_LE:** ``a(.) <= b(.)``
-            * **CMP_NE:** ``a(.) != b(.)``
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`compare`
-
-
-
-gpu::bitwise_not
---------------------
-Performs a per-element bitwise inversion.
-
-.. ocv:function:: void gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-
-    :param src: Source matrix.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_or
--------------------
-Performs a per-element bitwise disjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_and
---------------------
-Performs a per-element bitwise conjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_xor
---------------------
-Performs a per-element bitwise ``exclusive or`` operation of two matrices of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::rshift
---------------------
-Performs pixel by pixel right shift of an image by a constant value.
-
-.. ocv:function:: void gpu::rshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with integers elements.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::lshift
---------------------
-Performs pixel by pixel right left of an image by a constant value.
-
-.. ocv:function:: void gpu::lshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::min
-------------
-Computes the per-element minimum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`min`
-
-
-
-gpu::max
-------------
-Computes the per-element maximum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`max`
-
-
-
 gpu::threshold
-------------------
+--------------
 Applies a fixed-level threshold to each array element.
 
-.. ocv:function:: double gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
+.. ocv:function:: double gpu::threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
 
     :param src: Source array (single-channel).
 
@@ -470,12 +439,12 @@ Applies a fixed-level threshold to each array element.
 
 
 gpu::magnitude
-------------------
+--------------
 Computes magnitudes of complex matrix elements.
 
-.. ocv:function:: void gpu::magnitude( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
 
     :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
 
@@ -492,12 +461,12 @@ Computes magnitudes of complex matrix elements.
 
 
 gpu::magnitudeSqr
----------------------
+-----------------
 Computes squared magnitudes of complex matrix elements.
 
-.. ocv:function:: void gpu::magnitudeSqr( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())
 
-.. ocv:function:: void gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
 
     :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
 
@@ -512,10 +481,10 @@ Computes squared magnitudes of complex matrix elements.
 
 
 gpu::phase
---------------
+----------
 Computes polar angles of complex matrix elements.
 
-.. ocv:function:: void gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
 
     :param x: Source matrix containing real components ( ``CV_32FC1`` ).
 
@@ -532,10 +501,10 @@ Computes polar angles of complex matrix elements.
 
 
 gpu::cartToPolar
---------------------
+----------------
 Converts Cartesian coordinates into polar.
 
-.. ocv:function:: void gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
 
     :param x: Source matrix containing real components ( ``CV_32FC1`` ).
 
@@ -554,10 +523,10 @@ Converts Cartesian coordinates into polar.
 
 
 gpu::polarToCart
---------------------
+----------------
 Converts polar coordinates into Cartesian.
 
-.. ocv:function:: void gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null())
 
     :param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).
 
diff --git a/modules/gpuarithm/doc/reductions.rst b/modules/gpuarithm/doc/reductions.rst
index 938efc35bb..b34c2d860d 100644
--- a/modules/gpuarithm/doc/reductions.rst
+++ b/modules/gpuarithm/doc/reductions.rst
@@ -6,16 +6,16 @@ Matrix Reductions
 
 
 gpu::norm
--------------
+---------
 Returns the norm of a matrix (or difference of two matrices).
 
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType=NORM_L2)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType)
 
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType, GpuMat& buf)
 
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType, InputArray mask, GpuMat& buf)
 
-.. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2)
+.. ocv:function:: double gpu::norm(InputArray src1, InputArray src2, int normType=NORM_L2)
 
     :param src1: Source matrix. Any matrices except 64F are supported.
 
@@ -32,14 +32,14 @@ Returns the norm of a matrix (or difference of two matrices).
 
 
 gpu::sum
-------------
+--------
 Returns the sum of matrix elements.
 
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::sum(InputArray src)
 
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sum(InputArray src, GpuMat& buf)
 
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sum(InputArray src, InputArray mask, GpuMat& buf)
 
     :param src: Source image of any depth except for ``CV_64F`` .
 
@@ -52,14 +52,14 @@ Returns the sum of matrix elements.
 
 
 gpu::absSum
----------------
+-----------
 Returns the sum of absolute values for matrix elements.
 
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::absSum(InputArray src)
 
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::absSum(InputArray src, GpuMat& buf)
 
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::absSum(InputArray src, InputArray mask, GpuMat& buf)
 
     :param src: Source image of any depth except for ``CV_64F`` .
 
@@ -70,14 +70,14 @@ Returns the sum of absolute values for matrix elements.
 
 
 gpu::sqrSum
----------------
+-----------
 Returns the squared sum of matrix elements.
 
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src)
 
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src, GpuMat& buf)
 
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src, InputArray mask, GpuMat& buf)
 
     :param src: Source image of any depth except for ``CV_64F`` .
 
@@ -88,12 +88,12 @@ Returns the squared sum of matrix elements.
 
 
 gpu::minMax
----------------
+-----------
 Finds global minimum and maximum matrix elements and returns their values.
 
-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat())
+.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
 
-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf)
 
     :param src: Single-channel source image.
 
@@ -112,12 +112,12 @@ The function does not work with ``CV_64F`` images on GPUs with the compute capab
 
 
 gpu::minMaxLoc
-------------------
+--------------
 Finds global minimum and maximum matrix elements and returns their values with locations.
 
-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat())
+.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, InputArray mask=noArray())
 
-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
+.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray mask, GpuMat& valbuf, GpuMat& locbuf)
 
     :param src: Single-channel source image.
 
@@ -142,12 +142,12 @@ Finds global minimum and maximum matrix elements and returns their values with l
 
 
 gpu::countNonZero
----------------------
+-----------------
 Counts non-zero matrix elements.
 
-.. ocv:function:: int gpu::countNonZero(const GpuMat& src)
+.. ocv:function:: int gpu::countNonZero(InputArray src)
 
-.. ocv:function:: int gpu::countNonZero(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: int gpu::countNonZero(InputArray src, GpuMat& buf)
 
     :param src: Single-channel source image.
 
@@ -163,7 +163,7 @@ gpu::reduce
 -----------
 Reduces a matrix to a vector.
 
-.. ocv:function:: void gpu::reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
 
     :param mtx: Source 2D matrix.
 
@@ -183,48 +183,20 @@ Reduces a matrix to a vector.
 
     :param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be  ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .
 
+    :param stream: Stream for the asynchronous version.
+
 The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes.
 
 .. seealso:: :ocv:func:`reduce`
 
 
 
-gpu::normalize
---------------
-Normalizes the norm or value range of an array.
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat())
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-
-    :param src: input array.
-
-    :param dst: output array of the same size as  ``src`` .
-
-    :param alpha: norm value to normalize to or the lower range boundary in case of the range normalization.
-
-    :param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization.
-
-    :param normType: normalization type (see the details below).
-
-    :param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
-
-    :param mask: optional operation mask.
-
-    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`normalize`
-
-
-
 gpu::meanStdDev
--------------------
+---------------
 Computes a mean value and a standard deviation of matrix elements.
 
-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
+.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev)
+.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
 
     :param mtx: Source matrix.  ``CV_8UC1``  matrices are supported for now.
 
@@ -239,10 +211,10 @@ Computes a mean value and a standard deviation of matrix elements.
 
 
 gpu::rectStdDev
--------------------
+---------------
 Computes a standard deviation of integral images.
 
-.. ocv:function:: void gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null())
 
     :param src: Source image. Only the ``CV_32SC1`` type is supported.
 
@@ -253,3 +225,71 @@ Computes a standard deviation of integral images.
     :param rect: Rectangular window.
 
     :param stream: Stream for the asynchronous version.
+
+
+
+gpu::normalize
+--------------
+Normalizes the norm or value range of an array.
+
+.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+
+.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha, double beta, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
+
+    :param src: Input array.
+
+    :param dst: Output array of the same size as  ``src`` .
+
+    :param alpha: Norm value to normalize to or the lower range boundary in case of the range normalization.
+
+    :param beta: Upper range boundary in case of the range normalization; it is not used for the norm normalization.
+
+    :param norm_type: Normalization type ( ``NORM_MINMAX`` , ``NORM_L2`` , ``NORM_L1`` or ``NORM_INF`` ).
+
+    :param dtype: When negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
+
+    :param mask: Optional operation mask.
+
+    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+.. seealso:: :ocv:func:`normalize`
+
+
+
+gpu::integral
+-------------
+Computes an integral image.
+
+.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
+
+    :param buffer: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`integral`
+
+
+
+gpu::sqrIntegral
+----------------
+Computes a squared integral image.
+
+.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
+
+    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param stream: Stream for the asynchronous version.
diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 4edc29ba4d..8fbe296d80 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -49,263 +49,317 @@
 
 #include "opencv2/core/gpu.hpp"
 
+#if defined __GNUC__
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif (defined WIN32 || defined _WIN32)
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated)
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#else
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#endif
+
 namespace cv { namespace gpu {
 
-//! adds one matrix to another (c = a + b)
-CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! adds scalar to a matrix (c = a + s)
-CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! adds one matrix to another (dst = src1 + src2)
+CV_EXPORTS void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
 
-//! subtracts one matrix from another (c = a - b)
-CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! subtracts scalar from a matrix (c = a - s)
-CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! subtracts one matrix from another (dst = src1 - src2)
+CV_EXPORTS void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted product of the two arrays (c = scale * a * b)
-CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! weighted multiplies matrix to a scalar (c = scale * a * s)
-CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! computes element-wise weighted product of the two arrays (dst = scale * src1 * src2)
+CV_EXPORTS void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+
+//! computes element-wise weighted quotient of the two arrays (dst = scale * (src1 / src2))
+CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted quotient of the two arrays (c = a / b)
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 //! computes element-wise weighted reciprocal of an array (dst = scale/src2)
-CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
+static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
+{
+    divide(src1, src2, dst, 1.0, dtype, stream);
+}
+
+//! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+CV_EXPORTS void absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes absolute value of each matrix element
+CV_EXPORTS void abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes square of each pixel in an image
+CV_EXPORTS void sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes square root of each pixel in an image
+CV_EXPORTS void sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes exponent of each matrix element
+CV_EXPORTS void exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes natural logarithm of absolute value of each matrix element
+CV_EXPORTS void log(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes power of each matrix element:
+//!    dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
+//!    dst(i,j) = pow(fabs(src(i,j)), power), otherwise
+CV_EXPORTS void pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null());
+
+//! compares elements of two arrays (dst = src1 <cmpop> src2)
+CV_EXPORTS void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null());
+
+//! performs per-elements bit-wise inversion
+CV_EXPORTS void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise disjunction of two arrays
+CV_EXPORTS void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise conjunction of two arrays
+CV_EXPORTS void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise "exclusive or" operation
+CV_EXPORTS void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! pixel by pixel right shift of an image by a constant value
+//! supports 1, 3 and 4 channels images with integers elements
+CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
+
+//! pixel by pixel left shift of an image by a constant value
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
 
 //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
-CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
+CV_EXPORTS void addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst,
                             int dtype = -1, Stream& stream = Stream::Null());
 
 //! adds scaled array to another one (dst = alpha*src1 + src2)
-static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
+static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
 {
     addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
 }
 
-//! computes element-wise absolute difference of two arrays (c = abs(a - b))
-CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
-//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
+//! applies fixed threshold to the image
+CV_EXPORTS double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
 
-//! computes absolute value of each matrix element
-//! supports CV_16S and CV_32F depth
-CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes square of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes squared magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes square root of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes exponent of each matrix element (b = e**a)
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+//! computes squared magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+//! computes angle of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! computes power of each matrix element:
-//    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
-//    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
-//! supports all, except depth == CV_64F
-CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
+//! converts Cartesian coordinates to polar
+//! supports only floating-point source
+CV_EXPORTS void cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! compares elements of two arrays (c = a <cmpop> b)
-CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+//! converts polar coordinates to Cartesian
+//! supports only floating-point source
+CV_EXPORTS void polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! performs per-elements bit-wise inversion
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+//! makes multi-channel array out of several single-channel arrays
+CV_EXPORTS void merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null());
+CV_EXPORTS void merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! calculates per-element bit-wise disjunction of two arrays
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise disjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise conjunction of two arrays
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise conjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise "exclusive or" operation
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise "exclusive or" of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel right shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with integers elements
-CV_EXPORTS void rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel left shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
+//! copies each plane of a multi-channel array to a dedicated array
+CV_EXPORTS void split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null());
+CV_EXPORTS void split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
 
 //! transposes the matrix
 //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
-CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
+CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null());
 
 //! reverses the order of the rows, columns or both in a matrix
 //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
-CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
+CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
 
 //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 //! destination array will have the depth type as lut and the same channels number as source
 //! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
+class CV_EXPORTS LookUpTable : public Algorithm
+{
+public:
+    virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+};
+CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
 
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream) // deprecated wrapper around LookUpTable::transform
+{
+    createLookUpTable(lut)->transform(src, dst, stream); // a new LookUpTable is created on every call; reuse one from createLookUpTable() to avoid that
+}
 
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
-
-//! computes magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitude(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitudeSqr(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes angle (angle(i)) of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts Cartesian coordinates to polar
-//! supports only floating-point source
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts polar coordinates to Cartesian
-//! supports only floating-point source
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
-                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
-                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
+CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType,
+                               Scalar value = Scalar(), Stream& stream = Stream::Null());
 
 //! computes norm of array
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports all matrices except 64F
-CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
+static inline double norm(InputArray src, int normType=NORM_L2) // default restored: the overload this replaces declared normType=NORM_L2
+{
+    GpuMat buf; // scratch buffer local to this call
+    return norm(src, normType, GpuMat(), buf); // a default-constructed GpuMat is passed as the mask argument
+}
+static inline double norm(InputArray src, int normType, GpuMat& buf)
+{
+    return norm(src, normType, GpuMat(), buf);
+}
 
 //! computes norm of the difference between two arrays
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports only CV_8UC1 type
-CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
+CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
+static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
+{
+    GpuMat buf;
+    return norm(src1, src2, buf, normType);
+}
 
 //! computes sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sum(const GpuMat& src);
-CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sum(InputArray src)
+{
+    GpuMat buf;
+    return sum(src, GpuMat(), buf);
+}
+static inline Scalar sum(InputArray src, GpuMat& buf)
+{
+    return sum(src, GpuMat(), buf);
+}
 
 //! computes sum of array elements absolute values
 //! supports only single channel images
-CV_EXPORTS Scalar absSum(const GpuMat& src);
-CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar absSum(InputArray src)
+{
+    GpuMat buf;
+    return absSum(src, GpuMat(), buf);
+}
+static inline Scalar absSum(InputArray src, GpuMat& buf)
+{
+    return absSum(src, GpuMat(), buf);
+}
 
 //! computes squared sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sqrSum(InputArray src)
+{
+    GpuMat buf;
+    return sqrSum(src, GpuMat(), buf);
+}
+static inline Scalar sqrSum(InputArray src, GpuMat& buf)
+{
+    return sqrSum(src, GpuMat(), buf);
+}
 
 //! finds global minimum and maximum array elements and returns their values
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
+static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
+{
+    GpuMat buf;
+    minMax(src, minVal, maxVal, mask, buf);
+}
 
 //! finds global minimum and maximum array elements and returns their values with locations
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                          const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
+CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                          InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
+static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
+                             InputArray mask=noArray())
+{
+    GpuMat valBuf, locBuf;
+    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
+}
 
 //! counts non-zero array elements
-CV_EXPORTS int countNonZero(const GpuMat& src);
-CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
+static inline int countNonZero(InputArray src) // takes InputArray, matching the buffered overload it forwards to
+{
+    GpuMat buf; // scratch buffer local to this call
+    return countNonZero(src, buf);
+}
 
 //! reduces a matrix to a vector
-CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
 
 //! computes mean value and standard deviation of all or selected array elements
 //! supports only CV_8UC1 type
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
-//! buffered version
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
+{
+    GpuMat buf;
+    meanStdDev(src, mean, stddev, buf);
+}
 
 //! computes the standard deviation of integral images
 //! supports only CV_32SC1 source type and CV_32FC1 sqr type
 //! output will have CV_32FC1 type
-CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());
+CV_EXPORTS void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null());
 
-//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType,
-                               const Scalar& value = Scalar(), Stream& stream = Stream::Null());
-
-//! applies fixed threshold to the image
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
+//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
+                          int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
+                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+{
+    GpuMat norm_buf;
+    GpuMat cvt_buf;
+    normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
+}
 
 //! computes the integral image
 //! sum will have CV_32S type, but will contain unsigned int values
 //! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
-//! buffered version
-CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
+CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
+static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
+{
+    integral(src, sum, buffer, stream);
+}
+static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
+{
+    GpuMat buffer;
+    integral(src, sum, buffer, stream);
+}
 
 //! computes squared integral image
 //! result matrix will have 64F type, but will contain 64U values
 //! supports source images of 8UC1 type only
-CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
+CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
+static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
+{
+    GpuMat buffer;
+    sqrIntegral(src, sqsum, buffer, stream);
+}
+
+CV_EXPORTS void gemm(InputArray src1, InputArray src2, double alpha,
+                     InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null());
 
 //! performs per-element multiplication of two full (not packed) Fourier spectrums
 //! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
+CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null());
 
 //! performs per-element multiplication of two full (not packed) Fourier spectrums
 //! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
+CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
 
 //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
 //! Param dft_size is the size of DFT transform.
@@ -318,9 +372,25 @@ CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c
 //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
 //!
 //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
-CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
+CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
 
-struct CV_EXPORTS ConvolveBuf
+//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
+//! supports source images of 32FC1 type only
+//! result matrix will have 32FC1 type
+class CV_EXPORTS Convolution : public Algorithm
+{
+public:
+    virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
+};
+CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size());
+
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr , Stream& stream) // deprecated wrapper around Convolution::convolve
+{
+    createConvolution()->convolve(image, templ, result, ccorr, stream); // a new Convolution (default block size argument) is created on every call
+}
+
+struct ConvolveBuf
 {
     Size result_size;
     Size block_size;
@@ -331,16 +401,19 @@ struct CV_EXPORTS ConvolveBuf
     GpuMat image_spect, templ_spect, result_spect;
     GpuMat image_block, templ_block, result_data;
 
-    void create(Size image_size, Size templ_size);
-    static Size estimateBlockSize(Size result_size, Size templ_size);
+    void create(Size, Size){} // empty body: both arguments are ignored and nothing is allocated here
+    static Size estimateBlockSize(Size, Size){ return Size(); } // stub: ignores its arguments and always returns a default-constructed Size
 };
 
-//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
-//! supports source images of 32FC1 type only
-//! result matrix will have 32FC1 type
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream) // deprecated wrapper around Convolution::convolve
+{
+    createConvolution(buf.user_block_size)->convolve(image, templ, result, ccorr, stream); // only buf.user_block_size is read; a new Convolution is created on every call
+}
 
 }} // namespace cv { namespace gpu {
 
+#undef __OPENCV_GPUARITHM_DEPR_BEFORE__
+#undef __OPENCV_GPUARITHM_DEPR_AFTER__
+
 #endif /* __OPENCV_GPUARITHM_HPP__ */
diff --git a/modules/gpuarithm/perf/perf_arithm.cpp b/modules/gpuarithm/perf/perf_arithm.cpp
index b553fc2126..dfeafa0fa4 100644
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@@ -228,10 +228,11 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
         cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
         d_templ.upload(templ);
 
-        cv::gpu::GpuMat dst;
-        cv::gpu::ConvolveBuf d_buf;
+        cv::Ptr<cv::gpu::Convolution> convolution = cv::gpu::createConvolution();
 
-        TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -265,7 +266,7 @@ PERF_TEST_P(Sz, Integral,
         cv::gpu::GpuMat dst;
         cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
+        TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -293,9 +294,9 @@ PERF_TEST_P(Sz, IntegralSqr,
     if (PERF_RUN_GPU())
     {
         const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat dst, buf;
 
-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
+        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpuarithm/perf/perf_core.cpp b/modules/gpuarithm/perf/perf_core.cpp
index eab6d87366..0add472ca3 100644
--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpuarithm/perf/perf_reductions.cpp b/modules/gpuarithm/perf/perf_reductions.cpp
index 8d73180dc2..c541ce0e28 100644
--- a/modules/gpuarithm/perf/perf_reductions.cpp
+++ b/modules/gpuarithm/perf/perf_reductions.cpp
@@ -108,9 +108,10 @@ PERF_TEST_P(Sz_Norm, NormDiff,
     {
         const cv::gpu::GpuMat d_src1(src1);
         const cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_buf;
         double gpu_dst;
 
-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, normType);
+        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, d_buf, normType);
 
         SANITY_CHECK(gpu_dst);
 
diff --git a/modules/gpuarithm/src/arithm.cpp b/modules/gpuarithm/src/arithm.cpp
index a6cd1cb62e..eb7d710e6e 100644
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@@ -47,21 +47,14 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); }
+void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
+Ptr<Convolution> cv::gpu::createConvolution(Size) { throw_no_cuda(); return Ptr<Convolution>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -169,23 +162,27 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // gemm
 
-void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
+void cv::gpu::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream)
 {
 #ifndef HAVE_CUBLAS
-    (void)src1;
-    (void)src2;
-    (void)alpha;
-    (void)src3;
-    (void)beta;
-    (void)dst;
-    (void)flags;
-    (void)stream;
-    CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
+    (void) _src1;
+    (void) _src2;
+    (void) alpha;
+    (void) _src3;
+    (void) beta;
+    (void) _dst;
+    (void) flags;
+    (void) stream;
+    CV_Error(Error::StsNotImplemented, "The library was built without CUBLAS");
 #else
     // CUBLAS works with column-major matrices
 
-    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
-    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src3 = _src3.getGpuMat();
+
+    CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
+    CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );
 
     if (src1.depth() == CV_64F)
     {
@@ -208,10 +205,11 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
     Size dstSize(src2Size.width, src1Size.height);
 
-    CV_Assert(src1Size.width == src2Size.height);
-    CV_Assert(src3.empty() || src3Size == dstSize);
+    CV_Assert( src1Size.width == src2Size.height );
+    CV_Assert( src3.empty() || src3Size == dstSize );
 
-    dst.create(dstSize, src1.type());
+    _dst.create(dstSize, src1.type());
+    GpuMat dst = _dst.getGpuMat();
 
     if (beta != 0)
     {
@@ -294,116 +292,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 #endif
 }
 
-////////////////////////////////////////////////////////////////////////
-// integral
-
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
-{
-    GpuMat buffer;
-    gpu::integralBuffered(src, sum, buffer, s);
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    cv::Size whole;
-    cv::Point offset;
-
-    src.locateROI(whole, offset);
-
-    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
-        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
-    {
-        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
-
-        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
-
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        sum.setTo(Scalar::all(0), s);
-
-        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
-        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-
-        res.copyTo(inner, s);
-    }
-    else
-    {
-#ifndef HAVE_OPENCV_GPULEGACY
-    throw_no_cuda();
-#else
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        NcvSize32u roiSize;
-        roiSize.width = src.cols;
-        roiSize.height = src.rows;
-
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-        Ncv32u bufSize;
-        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-
-        NppStStreamHandler h(stream);
-
-        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// sqrIntegral
-
-void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
-{
-#ifndef HAVE_OPENCV_GPULEGACY
-    (void) src;
-    (void) sqsum;
-    (void) s;
-    throw_no_cuda();
-#else
-    CV_Assert(src.type() == CV_8U);
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
-    GpuMat buf(1, bufSize, CV_8U);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStStreamHandler h(stream);
-
-    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
-            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // mulSpectrums
 
@@ -418,12 +306,12 @@ namespace cv { namespace gpu { namespace cudev
 
 #endif
 
-void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
+void cv::gpu::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) a;
-    (void) b;
-    (void) c;
+    (void) _src1;
+    (void) _src2;
+    (void) _dst;
     (void) flags;
     (void) conjB;
     (void) stream;
@@ -432,16 +320,19 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
     (void) flags;
 
     typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
-
     static Caller callers[] = { cudev::mulSpectrums, cudev::mulSpectrums_CONJ };
 
-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
 
-    c.create(a.size(), CV_32FC2);
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();
 
     Caller caller = callers[(int)conjB];
-    caller(a, b, c, StreamAccessor::getStream(stream));
+    caller(src1, src2, dst, StreamAccessor::getStream(stream));
 #endif
 }
 
@@ -459,12 +350,12 @@ namespace cv { namespace gpu { namespace cudev
 
 #endif
 
-void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
+void cv::gpu::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) a;
-    (void) b;
-    (void) c;
+    (void) _src1;
+    (void) _src2;
+    (void) _dst;
     (void) flags;
     (void) scale;
     (void) conjB;
@@ -476,53 +367,57 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
     typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
     static Caller callers[] = { cudev::mulAndScaleSpectrums, cudev::mulAndScaleSpectrums_CONJ };
 
-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
 
-    c.create(a.size(), CV_32FC2);
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();
 
     Caller caller = callers[(int)conjB];
-    caller(a, b, scale, c, StreamAccessor::getStream(stream));
+    caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
 #endif
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // dft
 
-void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
+void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) src;
-    (void) dst;
+    (void) _src;
+    (void) _dst;
     (void) dft_size;
     (void) flags;
     (void) stream;
     throw_no_cuda();
 #else
+    GpuMat src = _src.getGpuMat();
 
-    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
+    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
 
     // We don't support unpacked output (in the case of real input)
-    CV_Assert(!(flags & DFT_COMPLEX_OUTPUT));
+    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
 
-    bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
-    int is_row_dft = flags & DFT_ROWS;
-    int is_scaled_dft = flags & DFT_SCALE;
-    int is_inverse = flags & DFT_INVERSE;
-    bool is_complex_input = src.channels() == 2;
-    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
+    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
+    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
+    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
+    const bool is_complex_input  = src.channels() == 2;
+    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
 
     // We don't support real-to-real transform
-    CV_Assert(is_complex_input || is_complex_output);
+    CV_Assert( is_complex_input || is_complex_output );
 
-    GpuMat src_data;
+    GpuMat src_cont = src;
 
     // Make sure here we work with the continuous input,
     // as CUFFT can't handle gaps
-    src_data = src;
-    createContinuous(src.rows, src.cols, src.type(), src_data);
-    if (src_data.data != src.data)
-        src.copyTo(src_data);
+    createContinuous(src.rows, src.cols, src.type(), src_cont);
+    if (src_cont.data != src.data)
+        src.copyTo(src_cont, stream);
 
     Size dft_size_opt = dft_size;
     if (is_1d_input && !is_row_dft)
@@ -532,17 +427,17 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre
         dft_size_opt.height = std::min(dft_size.width, dft_size.height);
     }
 
+    CV_Assert( dft_size_opt.width > 1 );
+
     cufftType dft_type = CUFFT_R2C;
     if (is_complex_input)
         dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
 
-    CV_Assert(dft_size_opt.width > 1);
-
     cufftHandle plan;
     if (is_1d_input || is_row_dft)
-        cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height);
+        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
     else
-        cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);
+        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
 
     cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
 
@@ -550,171 +445,191 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre
     {
         if (is_complex_output)
         {
-            createContinuous(dft_size, CV_32FC2, dst);
+            createContinuous(dft_size, CV_32FC2, _dst);
+            GpuMat dst = _dst.getGpuMat();
+
             cufftSafeCall(cufftExecC2C(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                     is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
         }
         else
         {
-            createContinuous(dft_size, CV_32F, dst);
+            createContinuous(dft_size, CV_32F, _dst);
+            GpuMat dst = _dst.getGpuMat();
+
             cufftSafeCall(cufftExecC2R(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
         }
     }
     else
     {
         // We could swap dft_size for efficiency. Here we must reflect it
         if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst);
+            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
         else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst);
+            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+
+        GpuMat dst = _dst.getGpuMat();
 
         cufftSafeCall(cufftExecR2C(
-                plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+                plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
     }
 
-    cufftSafeCall(cufftDestroy(plan));
+    cufftSafeCall( cufftDestroy(plan) );
 
     if (is_scaled_dft)
-        multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);
+        gpu::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
 
 #endif
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// convolve
+// Convolution
 
-void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
+#ifdef HAVE_CUFFT
+
+namespace
 {
-    result_size = Size(image_size.width - templ_size.width + 1,
-                       image_size.height - templ_size.height + 1);
-
-    block_size = user_block_size;
-    if (user_block_size.width == 0 || user_block_size.height == 0)
-        block_size = estimateBlockSize(result_size, templ_size);
-
-    dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
-    dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
-
-    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
-    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
-    if (dft_size.width > 8192)
-        dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
-    if (dft_size.height > 8192)
-        dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
-
-    // To avoid wasting time doing small DFTs
-    dft_size.width = std::max(dft_size.width, 512);
-    dft_size.height = std::max(dft_size.height, 512);
-
-    createContinuous(dft_size, CV_32F, image_block);
-    createContinuous(dft_size, CV_32F, templ_block);
-    createContinuous(dft_size, CV_32F, result_data);
-
-    spect_len = dft_size.height * (dft_size.width / 2 + 1);
-    createContinuous(1, spect_len, CV_32FC2, image_spect);
-    createContinuous(1, spect_len, CV_32FC2, templ_spect);
-    createContinuous(1, spect_len, CV_32FC2, result_spect);
-
-    // Use maximum result matrix block size for the estimated DFT block size
-    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
-    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
-}
-
-
-Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
-{
-    int width = (result_size.width + 2) / 3;
-    int height = (result_size.height + 2) / 3;
-    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);
-    return Size(width, height);
-}
-
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
-{
-    ConvolveBuf buf;
-    gpu::convolve(image, templ, result, ccorr, buf);
-}
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
-{
-#ifndef HAVE_CUFFT
-    (void) image;
-    (void) templ;
-    (void) result;
-    (void) ccorr;
-    (void) buf;
-    (void) stream;
-    throw_no_cuda();
-#else
-    using namespace cv::gpu::cudev::imgproc;
-
-    CV_Assert(image.type() == CV_32F);
-    CV_Assert(templ.type() == CV_32F);
-
-    buf.create(image.size(), templ.size());
-    result.create(buf.result_size, CV_32F);
-
-    Size& block_size = buf.block_size;
-    Size& dft_size = buf.dft_size;
-
-    GpuMat& image_block = buf.image_block;
-    GpuMat& templ_block = buf.templ_block;
-    GpuMat& result_data = buf.result_data;
-
-    GpuMat& image_spect = buf.image_spect;
-    GpuMat& templ_spect = buf.templ_spect;
-    GpuMat& result_spect = buf.result_spect;
-
-    cufftHandle planR2C, planC2R;
-    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
-
-    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
-    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-
-    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-    gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                        templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
-
-    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
-                               templ_spect.ptr<cufftComplex>()));
-
-    // Process all blocks of the result matrix
-    for (int y = 0; y < result.rows; y += block_size.height)
+    class ConvolutionImpl : public Convolution // Convolution implementation that keeps its sizes and DFT work buffers as members
     {
-        for (int x = 0; x < result.cols; x += block_size.width)
-        {
-            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
-                                std::min(y + dft_size.height, image.rows) - y);
-            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
-                             image.step);
-            gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
-                                0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
+    public:
+        explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {} // only records the requested block size; all other members are set by create()
 
-            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
-                                       image_spect.ptr<cufftComplex>()));
-            gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
-                                      1.f / dft_size.area(), ccorr, stream);
-            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
-                                       result_data.ptr<cufftReal>()));
+        void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()); // presumably overrides Convolution::convolve (base class not visible here - confirm)
 
-            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
-                                 std::min(y + block_size.height, result.rows) - y);
-            GpuMat result_roi(result_roi_size, result.type(),
-                              (void*)(result.ptr<float>(y) + x), result.step);
-            GpuMat result_block(result_roi_size, result_data.type(),
-                                result_data.ptr(), result_data.step);
+    private:
+        void create(Size image_size, Size templ_size); // recomputes the sizes below and (re)allocates the buffers; called by convolve() on every invocation
+        static Size estimateBlockSize(Size result_size); // default block size: a third of the result per dimension, rounded up
 
-            result_block.copyTo(result_roi, stream);
-        }
+        Size result_size; // image size - template size + 1, per dimension
+        Size block_size; // size of the result tile produced per loop iteration in convolve()
+        Size user_block_size; // block size requested at construction; a zero width or height makes create() estimate one
+        Size dft_size; // size of the planes handed to the FFT; create() enforces at least 512 per dimension
+        int spect_len; // dft_size.height * (dft_size.width / 2 + 1), the length of each spectrum buffer
+
+        GpuMat image_spect, templ_spect, result_spect; // CV_32FC2 buffers of size 1 x spect_len
+        GpuMat image_block, templ_block, result_data; // CV_32F buffers of size dft_size
+    };
+
+    void ConvolutionImpl::create(Size image_size, Size templ_size) // derives result/block/DFT sizes from the input sizes and (re)allocates all work buffers
+    {
+        result_size = Size(image_size.width - templ_size.width + 1,
+                           image_size.height - templ_size.height + 1); // NOTE(review): no check here - zero or negative when the template is larger than the image in a dimension
+
+        block_size = user_block_size;
+        if (user_block_size.width == 0 || user_block_size.height == 0) // a zero dimension is treated as "no block size requested"
+            block_size = estimateBlockSize(result_size);
+
+        dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); // 2^ceil(log2(block + templ - 1)): power of two covering one block plus the template overlap
+        dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); // same computation for the vertical dimension
+
+        // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
+        // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
+        if (dft_size.width > 8192) // beyond that limit the power-of-two choice is replaced by getOptimalDFTSize()
+            dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
+        if (dft_size.height > 8192)
+            dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
+
+        // To avoid wasting time doing small DFTs
+        dft_size.width = std::max(dft_size.width, 512); // lower bound applied regardless of how small the inputs are
+        dft_size.height = std::max(dft_size.height, 512);
+
+        createContinuous(dft_size, CV_32F, image_block); // real-valued plane: zero-padded image tile (FFT input in convolve())
+        createContinuous(dft_size, CV_32F, templ_block); // real-valued plane: zero-padded template (FFT input in convolve())
+        createContinuous(dft_size, CV_32F, result_data); // real-valued plane: output of the inverse FFT in convolve()
+
+        spect_len = dft_size.height * (dft_size.width / 2 + 1); // element count of each complex spectrum buffer (presumably CUFFT's half-spectrum R2C layout - confirm)
+        createContinuous(1, spect_len, CV_32FC2, image_spect);
+        createContinuous(1, spect_len, CV_32FC2, templ_spect);
+        createContinuous(1, spect_len, CV_32FC2, result_spect);
+
+        // Use maximum result matrix block size for the estimated DFT block size
+        block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); // widest tile one DFT plane can yield, capped by the result width
+        block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); // same for the height
     }
 
-    cufftSafeCall(cufftDestroy(planR2C));
-    cufftSafeCall(cufftDestroy(planC2R));
+    Size ConvolutionImpl::estimateBlockSize(Size result_size)
+    {
+        // Default tile: a third of the result along each axis (integer division, "+ 2" rounds up).
+        const int third_w = (result_size.width + 2) / 3;
+        const int third_h = (result_size.height + 2) / 3;
+        // Each dimension is capped by the corresponding result dimension.
+        return Size(std::min(third_w, result_size.width), std::min(third_h, result_size.height));
+    }
+
+    void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream) // FFT-based block-wise convolution of a CV_32FC1 image with a CV_32FC1 template; result is CV_32FC1
+    {
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.type() == CV_32FC1 ); // only single-channel float inputs are accepted
+        CV_Assert( templ.type() == CV_32FC1 );
+
+        create(image.size(), templ.size()); // recomputes result_size/block_size/dft_size and the work buffers for these input sizes
+
+        _result.create(result_size, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream); // raw CUDA stream for the CUFFT plans; the gpu:: calls below take _stream itself
+
+        cufftHandle planR2C, planC2R; // NOTE(review): raw handles - not destroyed if anything before the cufftDestroy calls at the end throws (assuming cufftSafeCall raises on error - confirm)
+        cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) );
+        cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) );
+
+        cufftSafeCall( cufftSetStream(planR2C, stream) ); // both plans are bound to the caller's stream
+        cufftSafeCall( cufftSetStream(planC2R, stream) );
+
+        GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step); // GpuMat header built from templ's own data pointer and step
+        gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+                            templ_block.cols - templ_roi.cols, 0, Scalar(), _stream); // pads the template up to the templ_block size; argument order assumed top, bottom, left, right, and border type 0 presumably constant - confirm
+
+        cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), templ_spect.ptr<cufftComplex>()) ); // template spectrum is computed once and reused for every tile below
+
+        // Process all blocks of the result matrix
+        for (int y = 0; y < result.rows; y += block_size.height)
+        {
+            for (int x = 0; x < result.cols; x += block_size.width)
+            {
+                Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
+                                    std::min(y + dft_size.height, image.rows) - y); // up to dft_size pixels starting at (x, y), clipped at the image border
+                GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
+                                 image.step); // header over the image data starting at row y, column x
+                gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                                    0, image_block.cols - image_roi.cols, 0, Scalar(), _stream); // pads the tile up to the image_block size, same pattern as for the template
+
+                cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
+                                           image_spect.ptr<cufftComplex>()));
+                gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
+                                          1.f / dft_size.area(), ccorr, _stream); // ccorr is passed in the conjB position; scale is 1 / (dft width * height)
+                cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
+                                           result_data.ptr<cufftReal>()));
+
+                Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
+                                     std::min(y + block_size.height, result.rows) - y); // one block, clipped at the result border
+                GpuMat result_roi(result_roi_size, result.type(),
+                                  (void*)(result.ptr<float>(y) + x), result.step); // destination window inside result at (x, y)
+                GpuMat result_block(result_roi_size, result_data.type(),
+                                    result_data.ptr(), result_data.step); // source window: top-left corner of the inverse-FFT output
+
+                result_block.copyTo(result_roi, _stream);
+            }
+        }
+
+        cufftSafeCall( cufftDestroy(planR2C) ); // plans are created and destroyed on every call
+        cufftSafeCall( cufftDestroy(planC2R) );
+    }
+}
+
+#endif
+
+Ptr<Convolution> cv::gpu::createConvolution(Size user_block_size) // factory: returns a new ConvolutionImpl, or reports an error when built without CUFFT
+{
+#ifndef HAVE_CUFFT
+    (void) user_block_size; // parameter is unused in this configuration
+    CV_Error(Error::StsNotImplemented, "The library was built without CUFFT");
+    return Ptr<Convolution>(); // NOTE(review): presumably unreachable if CV_Error throws; gives the function a return value on this path
+#else
+    return new ConvolutionImpl(user_block_size); // block size is stored as given; a zero width or height makes the implementation estimate one
 #endif
 }
 
diff --git a/modules/gpuarithm/src/core.cpp b/modules/gpuarithm/src/core.cpp
index bd0277cde2..22887796ab 100644
--- a/modules/gpuarithm/src/core.cpp
+++ b/modules/gpuarithm/src/core.cpp
@@ -47,19 +47,19 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
+void cv::gpu::merge(const GpuMat*, size_t, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::merge(const std::vector<GpuMat>&, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
+void cv::gpu::split(InputArray, GpuMat*, Stream&) { throw_no_cuda(); }
+void cv::gpu::split(InputArray, std::vector<GpuMat>&, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }
 
-void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
+void cv::gpu::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int, Scalar, Stream&) { throw_no_cuda(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -70,22 +70,27 @@ namespace cv { namespace gpu { namespace cudev
 {
     namespace split_merge
     {
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
+        void merge(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream); // declaration only; invoked from merge_caller in this file (definition presumably lives in a .cu source - confirm)
+        void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream); // declaration only; invoked from split_caller in this file (definition presumably lives in a .cu source - confirm)
     }
 }}}
 
 namespace
 {
-    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
+    void merge_caller(const GpuMat* src, size_t n, OutputArray _dst, Stream& stream)
     {
-        using namespace ::cv::gpu::cudev::split_merge;
+        CV_Assert( src != 0 );
+        CV_Assert( n > 0 && n <= 4 );
 
-        CV_Assert(src);
-        CV_Assert(n > 0);
+        const int depth = src[0].depth();
+        const Size size = src[0].size();
 
-        int depth = src[0].depth();
-        Size size = src[0].size();
+        for (size_t i = 0; i < n; ++i)
+        {
+            CV_Assert( src[i].size() == size );
+            CV_Assert( src[i].depth() == depth );
+            CV_Assert( src[i].channels() == 1 );
+        }
 
         if (depth == CV_64F)
         {
@@ -93,43 +98,32 @@ namespace
                 CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
         }
 
-        bool single_channel_only = true;
-        int total_channels = 0;
-
-        for (size_t i = 0; i < n; ++i)
+        if (n == 1)
         {
-            CV_Assert(src[i].size() == size);
-            CV_Assert(src[i].depth() == depth);
-            single_channel_only = single_channel_only && src[i].channels() == 1;
-            total_channels += src[i].channels();
+            src[0].copyTo(_dst, stream);
         }
-
-        CV_Assert(single_channel_only);
-        CV_Assert(total_channels <= 4);
-
-        if (total_channels == 1)
-            src[0].copyTo(dst);
         else
         {
-            dst.create(size, CV_MAKETYPE(depth, total_channels));
+            _dst.create(size, CV_MAKE_TYPE(depth, (int)n));
+            GpuMat dst = _dst.getGpuMat();
 
             PtrStepSzb src_as_devmem[4];
             for(size_t i = 0; i < n; ++i)
                 src_as_devmem[i] = src[i];
 
             PtrStepSzb dst_as_devmem(dst);
-            merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
+            cv::gpu::cudev::split_merge::merge(src_as_devmem, dst_as_devmem, (int)n, CV_ELEM_SIZE(depth), StreamAccessor::getStream(stream));
         }
     }
 
-    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
+    void split_caller(const GpuMat& src, GpuMat* dst, Stream& stream)
     {
-        using namespace ::cv::gpu::cudev::split_merge;
+        CV_Assert( dst != 0 );
 
-        CV_Assert(dst);
+        const int depth = src.depth();
+        const int num_channels = src.channels();
 
-        int depth = src.depth();
-        int num_channels = src.channels();
+        CV_Assert( num_channels <= 4 );
 
         if (depth == CV_64F)
         {
@@ -139,45 +133,45 @@ namespace
 
         if (num_channels == 1)
         {
-            src.copyTo(dst[0]);
+            src.copyTo(dst[0], stream);
             return;
         }
 
         for (int i = 0; i < num_channels; ++i)
             dst[i].create(src.size(), depth);
 
-        CV_Assert(num_channels <= 4);
-
         PtrStepSzb dst_as_devmem[4];
         for (int i = 0; i < num_channels; ++i)
             dst_as_devmem[i] = dst[i];
 
         PtrStepSzb src_as_devmem(src);
-        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
+        cv::gpu::cudev::split_merge::split(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), StreamAccessor::getStream(stream));
     }
 }
 
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
+void cv::gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream) // combines the n single-channel matrices at src into one n-channel dst
 {
-    ::merge(src, n, dst, StreamAccessor::getStream(stream));
+    merge_caller(src, n, dst, stream); // forwards unchanged; merge_caller asserts src != 0, 1 <= n <= 4 and matching size/depth
 }
 
 
-void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
+void cv::gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream) // vector overload: merges all elements of src into one multi-channel dst
 {
-    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
+    merge_caller(src.empty() ? 0 : &src[0], src.size(), dst, stream); // never index an empty vector (undefined behaviour); a null pointer makes merge_caller's CV_Assert report the bad input instead
 }
 
-void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
+void cv::gpu::split(InputArray _src, GpuMat* dst, Stream& stream) // splits _src into one single-channel matrix per channel, written to dst[0 .. channels-1]
 {
-    ::split(src, dst, StreamAccessor::getStream(stream));
+    GpuMat src = _src.getGpuMat(); // fetch the GpuMat behind the InputArray wrapper
+    split_caller(src, dst, stream); // split_caller asserts dst != 0 and at most 4 channels; the caller must provide that many dst slots
 }
 
-void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
+void cv::gpu::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream) // vector overload: dst is resized to the channel count of _src before splitting
 {
+    GpuMat src = _src.getGpuMat(); // fetch the GpuMat behind the InputArray wrapper
     dst.resize(src.channels());
     if(src.channels() > 0)
-        ::split(src, &dst[0], StreamAccessor::getStream(stream));
+        split_caller(src, &dst[0], stream); // only reached when dst is non-empty, so taking &dst[0] is valid
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -188,13 +182,16 @@ namespace arithm
     template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
 }
 
-void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
+void cv::gpu::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
 {
+    GpuMat src = _src.getGpuMat();
+
     CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
 
-    dst.create( src.cols, src.rows, src.type() );
+    _dst.create( src.cols, src.rows, src.type() );
+    GpuMat dst = _dst.getGpuMat();
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     if (src.elemSize() == 1)
     {
@@ -266,7 +263,7 @@ namespace
     };
 }
 
-void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
+void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
     static const func_t funcs[6][4] =
@@ -279,10 +276,13 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
         {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
     };
 
+    GpuMat src = _src.getGpuMat();
+
     CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
     CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
     funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
 }
@@ -290,93 +290,214 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // LUT
 
-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
+#if (CUDA_VERSION >= 5000)
+
+namespace
 {
-    const int cn = src.channels();
-
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
-
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
-
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
-
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
-
-    int nValues3[] = {256, 256, 256};
-
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
-
-    const Npp32s* pLevels3[3];
-
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
-
-    if (src.type() == CV_8UC1)
-    {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
-    }
-    else
+    class LookUpTableImpl : public LookUpTable // LookUpTable implementation used when CUDA_VERSION >= 5000; keeps its tables in GpuMat members
     {
+    public:
+        LookUpTableImpl(InputArray lut); // prepares the 32-bit level and value tables from lut
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); // applies the prepared table to an 8UC1 or 8UC3 src
+
+    private:
+        int lut_cn; // channel count of the table given to the constructor
+
+        int nValues3[3]; // each entry is set to 256 by the constructor; passed to nppiLUT_Linear_8u_C3R
         const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3]; // all three entries point at the data of d_pLevels
 
-        Mat nppLut3[3];
-        if (nppLut.channels() == 1)
+        GpuMat d_pLevels; // the levels 0..255 as 32-bit integers
+        GpuMat d_nppLut; // the table converted to CV_32S
+        GpuMat d_nppLut3[3]; // per-channel planes of d_nppLut; filled only for a multi-channel table
+    };
+
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut) // builds the level and value tables once so that transform() can reuse them
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        Npp32s pLevels[256]; // identity levels 0..255
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+
+        d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); // wrap the local array in a Mat header and upload it into the d_pLevels member
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); // every channel shares the same levels
+
+        GpuMat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT) // table is already a GpuMat: use it as is
+        {
+            lut = _lut.getGpuMat();
+        }
+        else // host table: view it as one row of 256 elements and upload
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut.upload(Mat(1, 256, hLut.type(), hLut.data));
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U && (lut_cn == 1 || lut_cn == 3) ); // reject other channel counts: gpu::split below writes lut_cn planes into the 3-element d_nppLut3, and all three pValues3 entries must be valid
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(d_nppLut, CV_32S); // the NPP calls in transform() read the values as Npp32s
+
+        if (lut_cn == 1)
         {
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
             pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
+        }
+        else // three-channel table: one plane per channel
+        {
+            gpu::split(d_nppLut, d_nppLut3);
+
+            pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
+            pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
+            pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
+        }
+    }
+
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream) // applies the stored table to _src; _dst is created with the same size and type
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); // only 8-bit, 1- or 3-channel input is handled
+        CV_Assert( lut_cn == 1 || lut_cn == cn ); // table is single-channel or has the same channel count as src
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream); // NOTE(review): presumably switches NPP to this stream while h is in scope - confirm
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1) // single-channel path uses the device tables directly
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
+        }
+        else // three-channel path uses the pointer arrays set up by the constructor
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        if (stream == 0) // null stream handle: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#else //  (CUDA_VERSION >= 5000)
+
+namespace
+{
+    class LookUpTableImpl : public LookUpTable // LookUpTable implementation used when CUDA_VERSION < 5000; keeps its tables in host memory (Mat / plain arrays)
+    {
+    public:
+        LookUpTableImpl(InputArray lut); // prepares the 32-bit level and value tables from lut
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); // applies the prepared table to an 8UC1 or 8UC3 src
+
+    private:
+        int lut_cn; // channel count of the table given to the constructor
+
+        Npp32s pLevels[256]; // filled with 0..255 by the constructor
+        int nValues3[3]; // each entry is set to 256 by the constructor
+        const Npp32s* pValues3[3]; // per-channel value tables; point into nppLut or nppLut3
+        const Npp32s* pLevels3[3]; // all three entries point at pLevels
+
+        Mat nppLut; // the table converted to CV_32S
+        Mat nppLut3[3]; // per-channel planes of nppLut; filled only for a multi-channel table
+    };
+
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut) // builds the level and value tables once so that transform() can reuse them
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        for (int i = 0; i < 256; ++i) // identity levels 0..255
+            pLevels[i] = i;
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; // every channel shares the same levels
+
+        Mat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT) // device table: converted to a host Mat
+        {
+            lut = Mat(_lut.getGpuMat());
+        }
+        else // host table: view it as one row of 256 elements
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut = hLut.reshape(0, 1); // same channels, single row: accept any continuous 256-element layout, as the CUDA >= 5000 implementation does
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U && (lut_cn == 1 || lut_cn == 3) ); // reject other channel counts: cv::split below writes lut_cn planes into the 3-element nppLut3, and all three pValues3 entries must be valid
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+
+        lut.convertTo(nppLut, CV_32S); // the NPP calls in transform() read the values as Npp32s
+
+        if (lut_cn == 1)
+        {
+            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
         }
         else
         {
             cv::split(nppLut, nppLut3);
 
-#if (CUDA_VERSION <= 4020)
             pValues3[0] = nppLut3[0].ptr<Npp32s>();
             pValues3[1] = nppLut3[1].ptr<Npp32s>();
             pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
-
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
         }
-
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
     }
 
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream) // applies the stored table to _src; _dst is created with the same size and type
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); // only 8-bit, 1- or 3-channel input is handled
+        CV_Assert( lut_cn == 1 || lut_cn == cn ); // table is single-channel or has the same channel count as src
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        NppStreamHandler h(stream); // NOTE(review): presumably switches NPP to this stream while h is in scope - confirm
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1) // single-channel path passes the host-side tables nppLut and pLevels
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
+        }
+        else // three-channel path uses the pointer arrays set up by the constructor
+        {
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        if (stream == 0) // null stream handle: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif //  (CUDA_VERSION >= 5000)
+
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray lut) // factory for the LookUpTableImpl variant selected by the CUDA_VERSION check
+{
+    return new LookUpTableImpl(lut); // the raw pointer is converted to the Ptr<LookUpTable> return value
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -408,14 +529,17 @@ typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
 typedef Npp32s Npp32s_a;
 #endif
 
-void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
+void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
 {
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
+    GpuMat src = _src.getGpuMat();
 
-    dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
     {
diff --git a/modules/gpuarithm/src/cuda/div_inv.cu b/modules/gpuarithm/src/cuda/div_inv.cu
deleted file mode 100644
index 9cfda933c7..0000000000
--- a/modules/gpuarithm/src/cuda/div_inv.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-
-#include "arithm_func_traits.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace arithm
-{
-    template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
-    {
-        S val;
-
-        __host__ explicit DivInv(S val_) : val(val_) {}
-
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return a != 0 ? saturate_cast<D>(val / a) : 0;
-        }
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        DivInv<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
-    }
-
-    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-#endif // CUDA_DISABLER
diff --git a/modules/gpuarithm/src/cuda/div_scalar.cu b/modules/gpuarithm/src/cuda/div_scalar.cu
index 42ba90cb0c..464c4adf87 100644
--- a/modules/gpuarithm/src/cuda/div_scalar.cu
+++ b/modules/gpuarithm/src/cuda/div_scalar.cu
@@ -66,6 +66,18 @@ namespace arithm
             return saturate_cast<D>(a / val);
         }
     };
+
+    template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D> // unary functor: divides the stored scalar by each element (val / a)
+    {
+        S val; // the scalar numerator
+
+        explicit DivScalarInv(S val_) : val(val_) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
+        {
+            return a != 0 ? saturate_cast<D>(val / a) : 0; // a zero element yields 0 instead of dividing by zero
+        }
+    };
 }
 
 namespace cv { namespace gpu { namespace cudev
@@ -73,72 +85,84 @@ namespace cv { namespace gpu { namespace cudev
     template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
     {
     };
+
+    // Same ArithmFuncTraits selection as the DivScalar specialization, for the reversed (val / a) functor.
+    template <typename T, typename S, typename D>
+    struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)> {};
 }}}
 
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+    void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
     {
-        DivScalar<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        // inv == true computes val / src1 per element; otherwise src1 / val.
+        const S scalar = static_cast<S>(val);
+        if (!inv)
+        {
+            DivScalar<T, S, D> fwdOp(scalar);
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, fwdOp, WithOutMask(), stream);
+            return;
+        }
+        DivScalarInv<T, S, D> invOp(scalar);
+        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, invOp, WithOutMask(), stream);
     }
 
-    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 }
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpuarithm/src/cuda/split_merge.cu b/modules/gpuarithm/src/cuda/split_merge.cu
index 93aea3791a..388441c634 100644
--- a/modules/gpuarithm/src/cuda/split_merge.cu
+++ b/modules/gpuarithm/src/cuda/split_merge.cu
@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace cudev
         }
 
 
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
+        void merge(const PtrStepSzb* src, PtrStepSzb& dst,
                                      int total_channels, size_t elem_size,
                                      const cudaStream_t& stream)
         {
@@ -487,7 +487,7 @@ namespace cv { namespace gpu { namespace cudev
         }
 
 
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+        void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
         {
             static SplitFunction split_func_tbl[] =
             {
diff --git a/modules/gpuarithm/src/cuda/sub_scalar.cu b/modules/gpuarithm/src/cuda/sub_scalar.cu
index 05c0cc703b..619ab4310f 100644
--- a/modules/gpuarithm/src/cuda/sub_scalar.cu
+++ b/modules/gpuarithm/src/cuda/sub_scalar.cu
@@ -58,12 +58,13 @@ namespace arithm
     template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
     {
         S val;
+        int scale; // sign multiplier applied to (a - val); subScalar passes -1 when inv is set, 1 otherwise
 
-        __host__ explicit SubScalar(S val_) : val(val_) {}
+        __host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}
 
         __device__ __forceinline__ D operator ()(T a) const
         {
-            return saturate_cast<D>(a - val);
+            return saturate_cast<D>(scale * (a - val)); // scale == -1 yields val - a
         }
     };
 }
@@ -78,9 +79,9 @@ namespace cv { namespace gpu { namespace cudev
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
-        SubScalar<T, S, D> op(static_cast<S>(val));
+        SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1); // inv flips the sign: val - src1 instead of src1 - val
 
         if (mask.data)
             cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
@@ -88,61 +89,61 @@ namespace arithm
             cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
     }
 
-    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpuarithm/src/element_operations.cpp b/modules/gpuarithm/src/element_operations.cpp
index e818331061..3ec4f84f66 100644
--- a/modules/gpuarithm/src/element_operations.cpp
+++ b/modules/gpuarithm/src/element_operations.cpp
@@ -47,76 +47,119 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::add(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::subtract(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::multiply(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::divide(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::absdiff(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::abs(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqrt(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::exp(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::log(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::pow(InputArray, double, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::compare(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::divide(double, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::bitwise_not(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::bitwise_or(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::bitwise_and(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::bitwise_xor(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::rshift(InputArray, Scalar_<int>, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::lshift(InputArray, Scalar_<int>, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::abs(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::min(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::max(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::sqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::addWeighted(InputArray, double, InputArray, double, double, OutputArray, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+double cv::gpu::threshold(InputArray, OutputArray, double, double, int, Stream&) {throw_no_cuda(); return 0.0;}
 
-void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::compare(const GpuMat&, Scalar, GpuMat&, int, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::bitwise_and(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::bitwise_xor(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::rshift(const GpuMat&, Scalar_<int>, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::lshift(const GpuMat&, Scalar_<int>, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
-
-double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_no_cuda(); return 0.0;}
-
-void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
+void cv::gpu::magnitude(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::magnitude(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::magnitudeSqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::magnitudeSqr(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::phase(InputArray, InputArray, OutputArray, bool, Stream&) { throw_no_cuda(); }
+void cv::gpu::cartToPolar(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); }
+void cv::gpu::polarToCart(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); }
 
 #else
 
+////////////////////////////////////////////////////////////////////////
+// arithm_op
+
+namespace
+{
+    typedef void (*mat_mat_func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op);
+    typedef void (*mat_scalar_func_t)(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op);
+    // Shared front end: validates the operands, allocates _dst, then calls mat_scalar_func (one operand is a MATX scalar) or mat_mat_func.
+    void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, double scale, int dtype, Stream& stream,
+                   mat_mat_func_t mat_mat_func, mat_scalar_func_t mat_scalar_func, int op = 0)
+    {
+        const int kind1 = _src1.kind();
+        const int kind2 = _src2.kind();
+        // An operand of kind MATX is treated as the scalar; at most one operand may be a scalar.
+        const bool isScalar1 = (kind1 == _InputArray::MATX);
+        const bool isScalar2 = (kind2 == _InputArray::MATX);
+        CV_Assert( !isScalar1 || !isScalar2 );
+
+        GpuMat src1;
+        if (!isScalar1)
+            src1 = _src1.getGpuMat();
+
+        GpuMat src2;
+        if (!isScalar2)
+            src2 = _src2.getGpuMat();
+
+        Mat scalar;
+        if (isScalar1)
+            scalar = _src1.getMat();
+        else if (isScalar2)
+            scalar = _src2.getMat();
+        // Copy the scalar's elements into val as doubles; the Mat_ header below wraps val's own storage.
+        Scalar val;
+        if (!scalar.empty())
+        {
+            CV_Assert( scalar.total() <= 4 );
+            scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
+        }
+
+        GpuMat mask = _mask.getGpuMat();
+        // Depth, channel count and size come from whichever operand is a matrix (src1 stays empty when it is the scalar).
+        const int sdepth = src1.empty() ? src2.depth() : src1.depth();
+        const int cn = src1.empty() ? src2.channels() : src1.channels();
+        const Size size = src1.empty() ? src2.size() : src1.size();
+
+        if (dtype < 0)
+            dtype = sdepth;
+
+        const int ddepth = CV_MAT_DEPTH(dtype);
+
+        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+        CV_Assert( !scalar.empty() || (src2.type() == src1.type() && src2.size() == src1.size()) );
+        CV_Assert( mask.empty() || (cn == 1 && mask.size() == size && mask.type() == CV_8UC1) );
+        CV_Assert( scalar.empty() || cn <= 4 ); // val holds four values; e.g. addScalar indexes npp_funcs[sdepth][cn - 1]
+        if (sdepth == CV_64F || ddepth == CV_64F)
+        {
+            if (!deviceSupports(NATIVE_DOUBLE))
+                CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double");
+        }
+
+        _dst.create(size, CV_MAKE_TYPE(ddepth, cn));
+        GpuMat dst = _dst.getGpuMat();
+        // inv is true when the scalar was the first operand, i.e. the call was (scalar op matrix).
+        if (isScalar1)
+            mat_scalar_func(src2, val, true, dst, mask, scale, stream, op);
+        else if (isScalar2)
+            mat_scalar_func(src1, val, false, dst, mask, scale, stream, op);
+        else
+            mat_mat_func(src1, src2, dst, mask, scale, stream, op);
+    }
+}
+
+
 ////////////////////////////////////////////////////////////////////////
 // Basic arithmetical operations (add subtract multiply divide)
 
@@ -302,98 +345,81 @@ namespace arithm
     void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
+static void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            addMat<unsigned char, unsigned char>,
-            addMat<unsigned char, signed char>,
-            addMat<unsigned char, unsigned short>,
-            addMat<unsigned char, short>,
-            addMat<unsigned char, int>,
-            addMat<unsigned char, float>,
-            addMat<unsigned char, double>
+            arithm::addMat<unsigned char, unsigned char>,
+            arithm::addMat<unsigned char, signed char>,
+            arithm::addMat<unsigned char, unsigned short>,
+            arithm::addMat<unsigned char, short>,
+            arithm::addMat<unsigned char, int>,
+            arithm::addMat<unsigned char, float>,
+            arithm::addMat<unsigned char, double>
         },
         {
-            addMat<signed char, unsigned char>,
-            addMat<signed char, signed char>,
-            addMat<signed char, unsigned short>,
-            addMat<signed char, short>,
-            addMat<signed char, int>,
-            addMat<signed char, float>,
-            addMat<signed char, double>
+            arithm::addMat<signed char, unsigned char>,
+            arithm::addMat<signed char, signed char>,
+            arithm::addMat<signed char, unsigned short>,
+            arithm::addMat<signed char, short>,
+            arithm::addMat<signed char, int>,
+            arithm::addMat<signed char, float>,
+            arithm::addMat<signed char, double>
         },
         {
-            0 /*addMat<unsigned short, unsigned char>*/,
-            0 /*addMat<unsigned short, signed char>*/,
-            addMat<unsigned short, unsigned short>,
-            addMat<unsigned short, short>,
-            addMat<unsigned short, int>,
-            addMat<unsigned short, float>,
-            addMat<unsigned short, double>
+            0 /*arithm::addMat<unsigned short, unsigned char>*/,
+            0 /*arithm::addMat<unsigned short, signed char>*/,
+            arithm::addMat<unsigned short, unsigned short>,
+            arithm::addMat<unsigned short, short>,
+            arithm::addMat<unsigned short, int>,
+            arithm::addMat<unsigned short, float>,
+            arithm::addMat<unsigned short, double>
         },
         {
-            0 /*addMat<short, unsigned char>*/,
-            0 /*addMat<short, signed char>*/,
-            addMat<short, unsigned short>,
-            addMat<short, short>,
-            addMat<short, int>,
-            addMat<short, float>,
-            addMat<short, double>
+            0 /*arithm::addMat<short, unsigned char>*/,
+            0 /*arithm::addMat<short, signed char>*/,
+            arithm::addMat<short, unsigned short>,
+            arithm::addMat<short, short>,
+            arithm::addMat<short, int>,
+            arithm::addMat<short, float>,
+            arithm::addMat<short, double>
         },
         {
-            0 /*addMat<int, unsigned char>*/,
-            0 /*addMat<int, signed char>*/,
-            0 /*addMat<int, unsigned short>*/,
-            0 /*addMat<int, short>*/,
-            addMat<int, int>,
-            addMat<int, float>,
-            addMat<int, double>
+            0 /*arithm::addMat<int, unsigned char>*/,
+            0 /*arithm::addMat<int, signed char>*/,
+            0 /*arithm::addMat<int, unsigned short>*/,
+            0 /*arithm::addMat<int, short>*/,
+            arithm::addMat<int, int>,
+            arithm::addMat<int, float>,
+            arithm::addMat<int, double>
         },
         {
-            0 /*addMat<float, unsigned char>*/,
-            0 /*addMat<float, signed char>*/,
-            0 /*addMat<float, unsigned short>*/,
-            0 /*addMat<float, short>*/,
-            0 /*addMat<float, int>*/,
-            addMat<float, float>,
-            addMat<float, double>
+            0 /*arithm::addMat<float, unsigned char>*/,
+            0 /*arithm::addMat<float, signed char>*/,
+            0 /*arithm::addMat<float, unsigned short>*/,
+            0 /*arithm::addMat<float, short>*/,
+            0 /*arithm::addMat<float, int>*/,
+            arithm::addMat<float, float>,
+            arithm::addMat<float, double>
         },
         {
-            0 /*addMat<double, unsigned char>*/,
-            0 /*addMat<double, signed char>*/,
-            0 /*addMat<double, unsigned short>*/,
-            0 /*addMat<double, short>*/,
-            0 /*addMat<double, int>*/,
-            0 /*addMat<double, float>*/,
-            addMat<double, double>
+            0 /*arithm::addMat<double, unsigned char>*/,
+            0 /*arithm::addMat<double, signed char>*/,
+            0 /*arithm::addMat<double, unsigned short>*/,
+            0 /*arithm::addMat<double, short>*/,
+            0 /*arithm::addMat<double, int>*/,
+            0 /*arithm::addMat<double, float>*/,
+            arithm::addMat<double, double>
         }
     };
 
-    if (dtype < 0)
-        dtype = src1.depth();
-
     const int sdepth = src1.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src1.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -413,10 +439,10 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             {
                 const int vcols = src1_.cols >> 2;
 
-                addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                arithm::addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                  stream);
 
                 return;
             }
@@ -424,10 +450,10 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             {
                 const int vcols = src1_.cols >> 1;
 
-                addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                arithm::addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                  stream);
 
                 return;
             }
@@ -448,75 +474,73 @@ namespace arithm
     void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
+static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            addScalar<unsigned char, float, unsigned char>,
-            addScalar<unsigned char, float, signed char>,
-            addScalar<unsigned char, float, unsigned short>,
-            addScalar<unsigned char, float, short>,
-            addScalar<unsigned char, float, int>,
-            addScalar<unsigned char, float, float>,
-            addScalar<unsigned char, double, double>
+            arithm::addScalar<unsigned char, float, unsigned char>,
+            arithm::addScalar<unsigned char, float, signed char>,
+            arithm::addScalar<unsigned char, float, unsigned short>,
+            arithm::addScalar<unsigned char, float, short>,
+            arithm::addScalar<unsigned char, float, int>,
+            arithm::addScalar<unsigned char, float, float>,
+            arithm::addScalar<unsigned char, double, double>
         },
         {
-            addScalar<signed char, float, unsigned char>,
-            addScalar<signed char, float, signed char>,
-            addScalar<signed char, float, unsigned short>,
-            addScalar<signed char, float, short>,
-            addScalar<signed char, float, int>,
-            addScalar<signed char, float, float>,
-            addScalar<signed char, double, double>
+            arithm::addScalar<signed char, float, unsigned char>,
+            arithm::addScalar<signed char, float, signed char>,
+            arithm::addScalar<signed char, float, unsigned short>,
+            arithm::addScalar<signed char, float, short>,
+            arithm::addScalar<signed char, float, int>,
+            arithm::addScalar<signed char, float, float>,
+            arithm::addScalar<signed char, double, double>
         },
         {
-            0 /*addScalar<unsigned short, float, unsigned char>*/,
-            0 /*addScalar<unsigned short, float, signed char>*/,
-            addScalar<unsigned short, float, unsigned short>,
-            addScalar<unsigned short, float, short>,
-            addScalar<unsigned short, float, int>,
-            addScalar<unsigned short, float, float>,
-            addScalar<unsigned short, double, double>
+            0 /*arithm::addScalar<unsigned short, float, unsigned char>*/,
+            0 /*arithm::addScalar<unsigned short, float, signed char>*/,
+            arithm::addScalar<unsigned short, float, unsigned short>,
+            arithm::addScalar<unsigned short, float, short>,
+            arithm::addScalar<unsigned short, float, int>,
+            arithm::addScalar<unsigned short, float, float>,
+            arithm::addScalar<unsigned short, double, double>
         },
         {
-            0 /*addScalar<short, float, unsigned char>*/,
-            0 /*addScalar<short, float, signed char>*/,
-            addScalar<short, float, unsigned short>,
-            addScalar<short, float, short>,
-            addScalar<short, float, int>,
-            addScalar<short, float, float>,
-            addScalar<short, double, double>
+            0 /*arithm::addScalar<short, float, unsigned char>*/,
+            0 /*arithm::addScalar<short, float, signed char>*/,
+            arithm::addScalar<short, float, unsigned short>,
+            arithm::addScalar<short, float, short>,
+            arithm::addScalar<short, float, int>,
+            arithm::addScalar<short, float, float>,
+            arithm::addScalar<short, double, double>
         },
         {
-            0 /*addScalar<int, float, unsigned char>*/,
-            0 /*addScalar<int, float, signed char>*/,
-            0 /*addScalar<int, float, unsigned short>*/,
-            0 /*addScalar<int, float, short>*/,
-            addScalar<int, float, int>,
-            addScalar<int, float, float>,
-            addScalar<int, double, double>
+            0 /*arithm::addScalar<int, float, unsigned char>*/,
+            0 /*arithm::addScalar<int, float, signed char>*/,
+            0 /*arithm::addScalar<int, float, unsigned short>*/,
+            0 /*arithm::addScalar<int, float, short>*/,
+            arithm::addScalar<int, float, int>,
+            arithm::addScalar<int, float, float>,
+            arithm::addScalar<int, double, double>
         },
         {
-            0 /*addScalar<float, float, unsigned char>*/,
-            0 /*addScalar<float, float, signed char>*/,
-            0 /*addScalar<float, float, unsigned short>*/,
-            0 /*addScalar<float, float, short>*/,
-            0 /*addScalar<float, float, int>*/,
-            addScalar<float, float, float>,
-            addScalar<float, double, double>
+            0 /*arithm::addScalar<float, float, unsigned char>*/,
+            0 /*arithm::addScalar<float, float, signed char>*/,
+            0 /*arithm::addScalar<float, float, unsigned short>*/,
+            0 /*arithm::addScalar<float, float, short>*/,
+            0 /*arithm::addScalar<float, float, int>*/,
+            arithm::addScalar<float, float, float>,
+            arithm::addScalar<float, double, double>
         },
         {
-            0 /*addScalar<double, double, unsigned char>*/,
-            0 /*addScalar<double, double, signed char>*/,
-            0 /*addScalar<double, double, unsigned short>*/,
-            0 /*addScalar<double, double, short>*/,
-            0 /*addScalar<double, double, int>*/,
-            0 /*addScalar<double, double, float>*/,
-            addScalar<double, double, double>
+            0 /*arithm::addScalar<double, double, unsigned char>*/,
+            0 /*arithm::addScalar<double, double, signed char>*/,
+            0 /*arithm::addScalar<double, double, unsigned short>*/,
+            0 /*arithm::addScalar<double, double, short>*/,
+            0 /*arithm::addScalar<double, double, int>*/,
+            0 /*arithm::addScalar<double, double, float>*/,
+            arithm::addScalar<double, double, double>
         }
     };
 
@@ -532,31 +556,16 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
         {0                                                    , 0                                                     , 0                                                    , 0                                                    }
     };
 
-    if (dtype < 0)
-        dtype = src.depth();
-
     const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn <= 4 );
-    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
     if (ddepth == sdepth && cn > 1 && npp_func != 0)
     {
-        npp_func(src, sc, dst, stream);
+        npp_func(src, val, dst, stream);
         return;
     }
 
@@ -567,7 +576,12 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, sc.val[0], dst, mask, stream);
+    func(src, val[0], dst, mask, stream);
+}
+
+void cv::gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, addMat, addScalar); // 1.0 fills the scale slot, which addMat/addScalar leave unnamed and unused
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -582,98 +596,81 @@ namespace arithm
     void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
+static void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            subMat<unsigned char, unsigned char>,
-            subMat<unsigned char, signed char>,
-            subMat<unsigned char, unsigned short>,
-            subMat<unsigned char, short>,
-            subMat<unsigned char, int>,
-            subMat<unsigned char, float>,
-            subMat<unsigned char, double>
+            arithm::subMat<unsigned char, unsigned char>,
+            arithm::subMat<unsigned char, signed char>,
+            arithm::subMat<unsigned char, unsigned short>,
+            arithm::subMat<unsigned char, short>,
+            arithm::subMat<unsigned char, int>,
+            arithm::subMat<unsigned char, float>,
+            arithm::subMat<unsigned char, double>
         },
         {
-            subMat<signed char, unsigned char>,
-            subMat<signed char, signed char>,
-            subMat<signed char, unsigned short>,
-            subMat<signed char, short>,
-            subMat<signed char, int>,
-            subMat<signed char, float>,
-            subMat<signed char, double>
+            arithm::subMat<signed char, unsigned char>,
+            arithm::subMat<signed char, signed char>,
+            arithm::subMat<signed char, unsigned short>,
+            arithm::subMat<signed char, short>,
+            arithm::subMat<signed char, int>,
+            arithm::subMat<signed char, float>,
+            arithm::subMat<signed char, double>
         },
         {
-            0 /*subMat<unsigned short, unsigned char>*/,
-            0 /*subMat<unsigned short, signed char>*/,
-            subMat<unsigned short, unsigned short>,
-            subMat<unsigned short, short>,
-            subMat<unsigned short, int>,
-            subMat<unsigned short, float>,
-            subMat<unsigned short, double>
+            0 /*arithm::subMat<unsigned short, unsigned char>*/,
+            0 /*arithm::subMat<unsigned short, signed char>*/,
+            arithm::subMat<unsigned short, unsigned short>,
+            arithm::subMat<unsigned short, short>,
+            arithm::subMat<unsigned short, int>,
+            arithm::subMat<unsigned short, float>,
+            arithm::subMat<unsigned short, double>
         },
         {
-            0 /*subMat<short, unsigned char>*/,
-            0 /*subMat<short, signed char>*/,
-            subMat<short, unsigned short>,
-            subMat<short, short>,
-            subMat<short, int>,
-            subMat<short, float>,
-            subMat<short, double>
+            0 /*arithm::subMat<short, unsigned char>*/,
+            0 /*arithm::subMat<short, signed char>*/,
+            arithm::subMat<short, unsigned short>,
+            arithm::subMat<short, short>,
+            arithm::subMat<short, int>,
+            arithm::subMat<short, float>,
+            arithm::subMat<short, double>
         },
         {
-            0 /*subMat<int, unsigned char>*/,
-            0 /*subMat<int, signed char>*/,
-            0 /*subMat<int, unsigned short>*/,
-            0 /*subMat<int, short>*/,
-            subMat<int, int>,
-            subMat<int, float>,
-            subMat<int, double>
+            0 /*arithm::subMat<int, unsigned char>*/,
+            0 /*arithm::subMat<int, signed char>*/,
+            0 /*arithm::subMat<int, unsigned short>*/,
+            0 /*arithm::subMat<int, short>*/,
+            arithm::subMat<int, int>,
+            arithm::subMat<int, float>,
+            arithm::subMat<int, double>
         },
         {
-            0 /*subMat<float, unsigned char>*/,
-            0 /*subMat<float, signed char>*/,
-            0 /*subMat<float, unsigned short>*/,
-            0 /*subMat<float, short>*/,
-            0 /*subMat<float, int>*/,
-            subMat<float, float>,
-            subMat<float, double>
+            0 /*arithm::subMat<float, unsigned char>*/,
+            0 /*arithm::subMat<float, signed char>*/,
+            0 /*arithm::subMat<float, unsigned short>*/,
+            0 /*arithm::subMat<float, short>*/,
+            0 /*arithm::subMat<float, int>*/,
+            arithm::subMat<float, float>,
+            arithm::subMat<float, double>
         },
         {
-            0 /*subMat<double, unsigned char>*/,
-            0 /*subMat<double, signed char>*/,
-            0 /*subMat<double, unsigned short>*/,
-            0 /*subMat<double, short>*/,
-            0 /*subMat<double, int>*/,
-            0 /*subMat<double, float>*/,
-            subMat<double, double>
+            0 /*arithm::subMat<double, unsigned char>*/,
+            0 /*arithm::subMat<double, signed char>*/,
+            0 /*arithm::subMat<double, unsigned short>*/,
+            0 /*arithm::subMat<double, short>*/,
+            0 /*arithm::subMat<double, int>*/,
+            0 /*arithm::subMat<double, float>*/,
+            arithm::subMat<double, double>
         }
     };
 
-    if (dtype < 0)
-        dtype = src1.depth();
-
     const int sdepth = src1.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src1.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -693,10 +690,10 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             {
                 const int vcols = src1_.cols >> 2;
 
-                subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                arithm::subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                  stream);
 
                 return;
             }
@@ -704,10 +701,10 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             {
                 const int vcols = src1_.cols >> 1;
 
-                subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                arithm::subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                  stream);
 
                 return;
             }
@@ -725,78 +722,76 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // forward declaration only; this change inserts the bool inv parameter between val and dst (presumably it selects the reversed operand order - confirm against the kernel definition)
 }
 
-void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
+static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            subScalar<unsigned char, float, unsigned char>,
-            subScalar<unsigned char, float, signed char>,
-            subScalar<unsigned char, float, unsigned short>,
-            subScalar<unsigned char, float, short>,
-            subScalar<unsigned char, float, int>,
-            subScalar<unsigned char, float, float>,
-            subScalar<unsigned char, double, double>
+            arithm::subScalar<unsigned char, float, unsigned char>,
+            arithm::subScalar<unsigned char, float, signed char>,
+            arithm::subScalar<unsigned char, float, unsigned short>,
+            arithm::subScalar<unsigned char, float, short>,
+            arithm::subScalar<unsigned char, float, int>,
+            arithm::subScalar<unsigned char, float, float>,
+            arithm::subScalar<unsigned char, double, double>
         },
         {
-            subScalar<signed char, float, unsigned char>,
-            subScalar<signed char, float, signed char>,
-            subScalar<signed char, float, unsigned short>,
-            subScalar<signed char, float, short>,
-            subScalar<signed char, float, int>,
-            subScalar<signed char, float, float>,
-            subScalar<signed char, double, double>
+            arithm::subScalar<signed char, float, unsigned char>,
+            arithm::subScalar<signed char, float, signed char>,
+            arithm::subScalar<signed char, float, unsigned short>,
+            arithm::subScalar<signed char, float, short>,
+            arithm::subScalar<signed char, float, int>,
+            arithm::subScalar<signed char, float, float>,
+            arithm::subScalar<signed char, double, double>
         },
         {
-            0 /*subScalar<unsigned short, float, unsigned char>*/,
-            0 /*subScalar<unsigned short, float, signed char>*/,
-            subScalar<unsigned short, float, unsigned short>,
-            subScalar<unsigned short, float, short>,
-            subScalar<unsigned short, float, int>,
-            subScalar<unsigned short, float, float>,
-            subScalar<unsigned short, double, double>
+            0 /*arithm::subScalar<unsigned short, float, unsigned char>*/,
+            0 /*arithm::subScalar<unsigned short, float, signed char>*/,
+            arithm::subScalar<unsigned short, float, unsigned short>,
+            arithm::subScalar<unsigned short, float, short>,
+            arithm::subScalar<unsigned short, float, int>,
+            arithm::subScalar<unsigned short, float, float>,
+            arithm::subScalar<unsigned short, double, double>
         },
         {
-            0 /*subScalar<short, float, unsigned char>*/,
-            0 /*subScalar<short, float, signed char>*/,
-            subScalar<short, float, unsigned short>,
-            subScalar<short, float, short>,
-            subScalar<short, float, int>,
-            subScalar<short, float, float>,
-            subScalar<short, double, double>
+            0 /*arithm::subScalar<short, float, unsigned char>*/,
+            0 /*arithm::subScalar<short, float, signed char>*/,
+            arithm::subScalar<short, float, unsigned short>,
+            arithm::subScalar<short, float, short>,
+            arithm::subScalar<short, float, int>,
+            arithm::subScalar<short, float, float>,
+            arithm::subScalar<short, double, double>
         },
         {
-            0 /*subScalar<int, float, unsigned char>*/,
-            0 /*subScalar<int, float, signed char>*/,
-            0 /*subScalar<int, float, unsigned short>*/,
-            0 /*subScalar<int, float, short>*/,
-            subScalar<int, float, int>,
-            subScalar<int, float, float>,
-            subScalar<int, double, double>
+            0 /*arithm::subScalar<int, float, unsigned char>*/,
+            0 /*arithm::subScalar<int, float, signed char>*/,
+            0 /*arithm::subScalar<int, float, unsigned short>*/,
+            0 /*arithm::subScalar<int, float, short>*/,
+            arithm::subScalar<int, float, int>,
+            arithm::subScalar<int, float, float>,
+            arithm::subScalar<int, double, double>
         },
         {
-            0 /*subScalar<float, float, unsigned char>*/,
-            0 /*subScalar<float, float, signed char>*/,
-            0 /*subScalar<float, float, unsigned short>*/,
-            0 /*subScalar<float, float, short>*/,
-            0 /*subScalar<float, float, int>*/,
-            subScalar<float, float, float>,
-            subScalar<float, double, double>
+            0 /*arithm::subScalar<float, float, unsigned char>*/,
+            0 /*arithm::subScalar<float, float, signed char>*/,
+            0 /*arithm::subScalar<float, float, unsigned short>*/,
+            0 /*arithm::subScalar<float, float, short>*/,
+            0 /*arithm::subScalar<float, float, int>*/,
+            arithm::subScalar<float, float, float>,
+            arithm::subScalar<float, double, double>
         },
         {
-            0 /*subScalar<double, double, unsigned char>*/,
-            0 /*subScalar<double, double, signed char>*/,
-            0 /*subScalar<double, double, unsigned short>*/,
-            0 /*subScalar<double, double, short>*/,
-            0 /*subScalar<double, double, int>*/,
-            0 /*subScalar<double, double, float>*/,
-            subScalar<double, double, double>
+            0 /*arithm::subScalar<double, double, unsigned char>*/,
+            0 /*arithm::subScalar<double, double, signed char>*/,
+            0 /*arithm::subScalar<double, double, unsigned short>*/,
+            0 /*arithm::subScalar<double, double, short>*/,
+            0 /*arithm::subScalar<double, double, int>*/,
+            0 /*arithm::subScalar<double, double, float>*/,
+            arithm::subScalar<double, double, double>
         }
     };
 
@@ -812,31 +807,16 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
         {0                                                    , 0                                                     , 0                                                    , 0                                                    }
     };
 
-    if (dtype < 0)
-        dtype = src.depth();
-
     const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn <= 4 );
-    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
+    if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv)
     {
-        npp_func(src, sc, dst, stream);
+        npp_func(src, val, dst, stream);
         return;
     }
 
@@ -847,7 +827,12 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, sc.val[0], dst, mask, stream);
+    func(src, val[0], inv, dst, mask, stream);
+}
+
+void cv::gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream) // public subtract entry point; contains no arithmetic of its own
+{
+    arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, subMat, subScalar); // forwards everything to arithm_op with the subMat and subScalar helpers; 1.0 sits in the position where multiply passes its scale (presumably arithm_op picks the helper from the operand kinds - confirm)
+}
 
 ////////////////////////////////////////////////////////////////////////
@@ -863,127 +848,92 @@ namespace arithm
     void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 }
 
-void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
+static void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) // matrix-by-matrix multiply helper; the unnamed GpuMat and int parameters are accepted but never read (presumably to match the callback signature arithm_op expects - confirm)
 {
-    using namespace arithm;
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); // signature shared by every entry of the dispatch table below
+    static const func_t funcs[7][7] = // looked up as funcs[sdepth][ddepth]; a 0 entry marks a depth pair with no kernel
     {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        mulMat_8uc4_32f(src1, src2, dst, stream);
-    }
-    else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
-    {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        mulMat_16sc4_32f(src1, src2, dst, stream);
-    }
-    else
-    {
-        typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
-        static const func_t funcs[7][7] =
         {
-            {
-                mulMat<unsigned char, float, unsigned char>,
-                mulMat<unsigned char, float, signed char>,
-                mulMat<unsigned char, float, unsigned short>,
-                mulMat<unsigned char, float, short>,
-                mulMat<unsigned char, float, int>,
-                mulMat<unsigned char, float, float>,
-                mulMat<unsigned char, double, double>
-            },
-            {
-                mulMat<signed char, float, unsigned char>,
-                mulMat<signed char, float, signed char>,
-                mulMat<signed char, float, unsigned short>,
-                mulMat<signed char, float, short>,
-                mulMat<signed char, float, int>,
-                mulMat<signed char, float, float>,
-                mulMat<signed char, double, double>
-            },
-            {
-                0 /*mulMat<unsigned short, float, unsigned char>*/,
-                0 /*mulMat<unsigned short, float, signed char>*/,
-                mulMat<unsigned short, float, unsigned short>,
-                mulMat<unsigned short, float, short>,
-                mulMat<unsigned short, float, int>,
-                mulMat<unsigned short, float, float>,
-                mulMat<unsigned short, double, double>
-            },
-            {
-                0 /*mulMat<short, float, unsigned char>*/,
-                0 /*mulMat<short, float, signed char>*/,
-                mulMat<short, float, unsigned short>,
-                mulMat<short, float, short>,
-                mulMat<short, float, int>,
-                mulMat<short, float, float>,
-                mulMat<short, double, double>
-            },
-            {
-                0 /*mulMat<int, float, unsigned char>*/,
-                0 /*mulMat<int, float, signed char>*/,
-                0 /*mulMat<int, float, unsigned short>*/,
-                0 /*mulMat<int, float, short>*/,
-                mulMat<int, float, int>,
-                mulMat<int, float, float>,
-                mulMat<int, double, double>
-            },
-            {
-                0 /*mulMat<float, float, unsigned char>*/,
-                0 /*mulMat<float, float, signed char>*/,
-                0 /*mulMat<float, float, unsigned short>*/,
-                0 /*mulMat<float, float, short>*/,
-                0 /*mulMat<float, float, int>*/,
-                mulMat<float, float, float>,
-                mulMat<float, double, double>
-            },
-            {
-                0 /*mulMat<double, double, unsigned char>*/,
-                0 /*mulMat<double, double, signed char>*/,
-                0 /*mulMat<double, double, unsigned short>*/,
-                0 /*mulMat<double, double, short>*/,
-                0 /*mulMat<double, double, int>*/,
-                0 /*mulMat<double, double, float>*/,
-                mulMat<double, double, double>
-            }
-        };
-
-        if (dtype < 0)
-            dtype = src1.depth();
-
-        const int sdepth = src1.depth();
-        const int ddepth = CV_MAT_DEPTH(dtype);
-        const int cn = src1.channels();
-
-        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-        CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-        if (sdepth == CV_64F || ddepth == CV_64F)
+            arithm::mulMat<unsigned char, float, unsigned char>,
+            arithm::mulMat<unsigned char, float, signed char>,
+            arithm::mulMat<unsigned char, float, unsigned short>,
+            arithm::mulMat<unsigned char, float, short>,
+            arithm::mulMat<unsigned char, float, int>,
+            arithm::mulMat<unsigned char, float, float>,
+            arithm::mulMat<unsigned char, double, double>
+        },
         {
-            if (!deviceSupports(NATIVE_DOUBLE))
-                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+            arithm::mulMat<signed char, float, unsigned char>,
+            arithm::mulMat<signed char, float, signed char>,
+            arithm::mulMat<signed char, float, unsigned short>,
+            arithm::mulMat<signed char, float, short>,
+            arithm::mulMat<signed char, float, int>,
+            arithm::mulMat<signed char, float, float>,
+            arithm::mulMat<signed char, double, double>
+        },
+        {
+            0 /*arithm::mulMat<unsigned short, float, unsigned char>*/,
+            0 /*arithm::mulMat<unsigned short, float, signed char>*/,
+            arithm::mulMat<unsigned short, float, unsigned short>,
+            arithm::mulMat<unsigned short, float, short>,
+            arithm::mulMat<unsigned short, float, int>,
+            arithm::mulMat<unsigned short, float, float>,
+            arithm::mulMat<unsigned short, double, double>
+        },
+        {
+            0 /*arithm::mulMat<short, float, unsigned char>*/,
+            0 /*arithm::mulMat<short, float, signed char>*/,
+            arithm::mulMat<short, float, unsigned short>,
+            arithm::mulMat<short, float, short>,
+            arithm::mulMat<short, float, int>,
+            arithm::mulMat<short, float, float>,
+            arithm::mulMat<short, double, double>
+        },
+        {
+            0 /*arithm::mulMat<int, float, unsigned char>*/,
+            0 /*arithm::mulMat<int, float, signed char>*/,
+            0 /*arithm::mulMat<int, float, unsigned short>*/,
+            0 /*arithm::mulMat<int, float, short>*/,
+            arithm::mulMat<int, float, int>,
+            arithm::mulMat<int, float, float>,
+            arithm::mulMat<int, double, double>
+        },
+        {
+            0 /*arithm::mulMat<float, float, unsigned char>*/,
+            0 /*arithm::mulMat<float, float, signed char>*/,
+            0 /*arithm::mulMat<float, float, unsigned short>*/,
+            0 /*arithm::mulMat<float, float, short>*/,
+            0 /*arithm::mulMat<float, float, int>*/,
+            arithm::mulMat<float, float, float>,
+            arithm::mulMat<float, double, double>
+        },
+        {
+            0 /*arithm::mulMat<double, double, unsigned char>*/,
+            0 /*arithm::mulMat<double, double, signed char>*/,
+            0 /*arithm::mulMat<double, double, unsigned short>*/,
+            0 /*arithm::mulMat<double, double, short>*/,
+            0 /*arithm::mulMat<double, double, int>*/,
+            0 /*arithm::mulMat<double, double, float>*/,
+            arithm::mulMat<double, double, double>
         }
+    };
 
-        dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
+    const int sdepth = src1.depth();
+    const int ddepth = dst.depth(); // destination depth is read from dst rather than a dtype argument; this helper never calls dst.create, so dst must arrive already allocated
+    const int cn = src1.channels();
 
-        PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-        PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-        PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-        const func_t func = funcs[sdepth][ddepth];
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels are folded into the column count (cols * cn), so the kernel sees one flat row of elements
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src2 reuses src1's row/column counts; only its data pointer and step are its own
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); // same for dst: src1's dimensions with dst's data pointer and step
 
-        if (!func)
-            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+    const func_t func = funcs[sdepth][ddepth]; // select the kernel launcher for this source/destination depth pair
 
-        func(src1_, src2_, dst_, scale, stream);
-    }
+    if (!func) // a 0 table entry means the depth pair is not implemented
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, scale, stream);
 }
 
 namespace arithm
@@ -992,75 +942,73 @@ namespace arithm
     void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
+static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            mulScalar<unsigned char, float, unsigned char>,
-            mulScalar<unsigned char, float, signed char>,
-            mulScalar<unsigned char, float, unsigned short>,
-            mulScalar<unsigned char, float, short>,
-            mulScalar<unsigned char, float, int>,
-            mulScalar<unsigned char, float, float>,
-            mulScalar<unsigned char, double, double>
+            arithm::mulScalar<unsigned char, float, unsigned char>,
+            arithm::mulScalar<unsigned char, float, signed char>,
+            arithm::mulScalar<unsigned char, float, unsigned short>,
+            arithm::mulScalar<unsigned char, float, short>,
+            arithm::mulScalar<unsigned char, float, int>,
+            arithm::mulScalar<unsigned char, float, float>,
+            arithm::mulScalar<unsigned char, double, double>
         },
         {
-            mulScalar<signed char, float, unsigned char>,
-            mulScalar<signed char, float, signed char>,
-            mulScalar<signed char, float, unsigned short>,
-            mulScalar<signed char, float, short>,
-            mulScalar<signed char, float, int>,
-            mulScalar<signed char, float, float>,
-            mulScalar<signed char, double, double>
+            arithm::mulScalar<signed char, float, unsigned char>,
+            arithm::mulScalar<signed char, float, signed char>,
+            arithm::mulScalar<signed char, float, unsigned short>,
+            arithm::mulScalar<signed char, float, short>,
+            arithm::mulScalar<signed char, float, int>,
+            arithm::mulScalar<signed char, float, float>,
+            arithm::mulScalar<signed char, double, double>
         },
         {
-            0 /*mulScalar<unsigned short, float, unsigned char>*/,
-            0 /*mulScalar<unsigned short, float, signed char>*/,
-            mulScalar<unsigned short, float, unsigned short>,
-            mulScalar<unsigned short, float, short>,
-            mulScalar<unsigned short, float, int>,
-            mulScalar<unsigned short, float, float>,
-            mulScalar<unsigned short, double, double>
+            0 /*arithm::mulScalar<unsigned short, float, unsigned char>*/,
+            0 /*arithm::mulScalar<unsigned short, float, signed char>*/,
+            arithm::mulScalar<unsigned short, float, unsigned short>,
+            arithm::mulScalar<unsigned short, float, short>,
+            arithm::mulScalar<unsigned short, float, int>,
+            arithm::mulScalar<unsigned short, float, float>,
+            arithm::mulScalar<unsigned short, double, double>
         },
         {
-            0 /*mulScalar<short, float, unsigned char>*/,
-            0 /*mulScalar<short, float, signed char>*/,
-            mulScalar<short, float, unsigned short>,
-            mulScalar<short, float, short>,
-            mulScalar<short, float, int>,
-            mulScalar<short, float, float>,
-            mulScalar<short, double, double>
+            0 /*arithm::mulScalar<short, float, unsigned char>*/,
+            0 /*arithm::mulScalar<short, float, signed char>*/,
+            arithm::mulScalar<short, float, unsigned short>,
+            arithm::mulScalar<short, float, short>,
+            arithm::mulScalar<short, float, int>,
+            arithm::mulScalar<short, float, float>,
+            arithm::mulScalar<short, double, double>
         },
         {
-            0 /*mulScalar<int, float, unsigned char>*/,
-            0 /*mulScalar<int, float, signed char>*/,
-            0 /*mulScalar<int, float, unsigned short>*/,
-            0 /*mulScalar<int, float, short>*/,
-            mulScalar<int, float, int>,
-            mulScalar<int, float, float>,
-            mulScalar<int, double, double>
+            0 /*arithm::mulScalar<int, float, unsigned char>*/,
+            0 /*arithm::mulScalar<int, float, signed char>*/,
+            0 /*arithm::mulScalar<int, float, unsigned short>*/,
+            0 /*arithm::mulScalar<int, float, short>*/,
+            arithm::mulScalar<int, float, int>,
+            arithm::mulScalar<int, float, float>,
+            arithm::mulScalar<int, double, double>
         },
         {
-            0 /*mulScalar<float, float, unsigned char>*/,
-            0 /*mulScalar<float, float, signed char>*/,
-            0 /*mulScalar<float, float, unsigned short>*/,
-            0 /*mulScalar<float, float, short>*/,
-            0 /*mulScalar<float, float, int>*/,
-            mulScalar<float, float, float>,
-            mulScalar<float, double, double>
+            0 /*arithm::mulScalar<float, float, unsigned char>*/,
+            0 /*arithm::mulScalar<float, float, signed char>*/,
+            0 /*arithm::mulScalar<float, float, unsigned short>*/,
+            0 /*arithm::mulScalar<float, float, short>*/,
+            0 /*arithm::mulScalar<float, float, int>*/,
+            arithm::mulScalar<float, float, float>,
+            arithm::mulScalar<float, double, double>
         },
         {
-            0 /*mulScalar<double, double, unsigned char>*/,
-            0 /*mulScalar<double, double, signed char>*/,
-            0 /*mulScalar<double, double, unsigned short>*/,
-            0 /*mulScalar<double, double, short>*/,
-            0 /*mulScalar<double, double, int>*/,
-            0 /*mulScalar<double, double, float>*/,
-            mulScalar<double, double, double>
+            0 /*arithm::mulScalar<double, double, unsigned char>*/,
+            0 /*arithm::mulScalar<double, double, signed char>*/,
+            0 /*arithm::mulScalar<double, double, unsigned short>*/,
+            0 /*arithm::mulScalar<double, double, short>*/,
+            0 /*arithm::mulScalar<double, double, int>*/,
+            0 /*arithm::mulScalar<double, double, float>*/,
+            arithm::mulScalar<double, double, double>
         }
     };
 
@@ -1076,32 +1024,21 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
         {0                                                    , 0, 0                                                    , 0                                                    }
     };
 
-    if (dtype < 0)
-        dtype = src.depth();
-
     const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn <= 4 );
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    const Scalar nsc(sc.val[0] * scale, sc.val[1] * scale, sc.val[2] * scale, sc.val[3] * scale);
+    val[0] *= scale;
+    val[1] *= scale;
+    val[2] *= scale;
+    val[3] *= scale;
 
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
     if (ddepth == sdepth && cn > 1 && npp_func != 0)
     {
-        npp_func(src, nsc, dst, stream);
+        npp_func(src, val, dst, stream);
         return;
     }
 
@@ -1112,7 +1049,39 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, nsc.val[0], dst, stream);
+    func(src, val[0], dst, stream);
+}
+
+void cv::gpu::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) // public multiply entry point: two special-cased type pairs, everything else goes through arithm_op
+{
+    if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) // special case: CV_8UC4 first operand with a CV_32FC1 second operand
+    {
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
+
+        CV_Assert( src1.size() == src2.size() );
+
+        _dst.create(src1.size(), src1.type()); // output takes src1's size and type; scale and dtype are not used anywhere in this branch
+        GpuMat dst = _dst.getGpuMat();
+
+        arithm::mulMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream));
+    }
+    else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) // same special case for a CV_16SC4 first operand
+    {
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
+
+        CV_Assert( src1.size() == src2.size() );
+
+        _dst.create(src1.size(), src1.type()); // as above: src1's size and type, with scale and dtype unused
+        GpuMat dst = _dst.getGpuMat();
+
+        arithm::mulMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, mulMat, mulScalar); // general path: a default-constructed GpuMat fills the argument position where subtract passes its mask
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -1128,204 +1097,167 @@ namespace arithm
     void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 }
 
-void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
+static void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int)
 {
-    using namespace arithm;
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    static const func_t funcs[7][7] =
     {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        divMat_8uc4_32f(src1, src2, dst, stream);
-    }
-    else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
-    {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        divMat_16sc4_32f(src1, src2, dst, stream);
-    }
-    else
-    {
-        typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
-        static const func_t funcs[7][7] =
         {
-            {
-                divMat<unsigned char, float, unsigned char>,
-                divMat<unsigned char, float, signed char>,
-                divMat<unsigned char, float, unsigned short>,
-                divMat<unsigned char, float, short>,
-                divMat<unsigned char, float, int>,
-                divMat<unsigned char, float, float>,
-                divMat<unsigned char, double, double>
-            },
-            {
-                divMat<signed char, float, unsigned char>,
-                divMat<signed char, float, signed char>,
-                divMat<signed char, float, unsigned short>,
-                divMat<signed char, float, short>,
-                divMat<signed char, float, int>,
-                divMat<signed char, float, float>,
-                divMat<signed char, double, double>
-            },
-            {
-                0 /*divMat<unsigned short, float, unsigned char>*/,
-                0 /*divMat<unsigned short, float, signed char>*/,
-                divMat<unsigned short, float, unsigned short>,
-                divMat<unsigned short, float, short>,
-                divMat<unsigned short, float, int>,
-                divMat<unsigned short, float, float>,
-                divMat<unsigned short, double, double>
-            },
-            {
-                0 /*divMat<short, float, unsigned char>*/,
-                0 /*divMat<short, float, signed char>*/,
-                divMat<short, float, unsigned short>,
-                divMat<short, float, short>,
-                divMat<short, float, int>,
-                divMat<short, float, float>,
-                divMat<short, double, double>
-            },
-            {
-                0 /*divMat<int, float, unsigned char>*/,
-                0 /*divMat<int, float, signed char>*/,
-                0 /*divMat<int, float, unsigned short>*/,
-                0 /*divMat<int, float, short>*/,
-                divMat<int, float, int>,
-                divMat<int, float, float>,
-                divMat<int, double, double>
-            },
-            {
-                0 /*divMat<float, float, unsigned char>*/,
-                0 /*divMat<float, float, signed char>*/,
-                0 /*divMat<float, float, unsigned short>*/,
-                0 /*divMat<float, float, short>*/,
-                0 /*divMat<float, float, int>*/,
-                divMat<float, float, float>,
-                divMat<float, double, double>
-            },
-            {
-                0 /*divMat<double, double, unsigned char>*/,
-                0 /*divMat<double, double, signed char>*/,
-                0 /*divMat<double, double, unsigned short>*/,
-                0 /*divMat<double, double, short>*/,
-                0 /*divMat<double, double, int>*/,
-                0 /*divMat<double, double, float>*/,
-                divMat<double, double, double>
-            }
-        };
-
-        if (dtype < 0)
-            dtype = src1.depth();
-
-        const int sdepth = src1.depth();
-        const int ddepth = CV_MAT_DEPTH(dtype);
-        const int cn = src1.channels();
-
-        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-        CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-        if (sdepth == CV_64F || ddepth == CV_64F)
+            arithm::divMat<unsigned char, float, unsigned char>,
+            arithm::divMat<unsigned char, float, signed char>,
+            arithm::divMat<unsigned char, float, unsigned short>,
+            arithm::divMat<unsigned char, float, short>,
+            arithm::divMat<unsigned char, float, int>,
+            arithm::divMat<unsigned char, float, float>,
+            arithm::divMat<unsigned char, double, double>
+        },
         {
-            if (!deviceSupports(NATIVE_DOUBLE))
-                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+            arithm::divMat<signed char, float, unsigned char>,
+            arithm::divMat<signed char, float, signed char>,
+            arithm::divMat<signed char, float, unsigned short>,
+            arithm::divMat<signed char, float, short>,
+            arithm::divMat<signed char, float, int>,
+            arithm::divMat<signed char, float, float>,
+            arithm::divMat<signed char, double, double>
+        },
+        {
+            0 /*arithm::divMat<unsigned short, float, unsigned char>*/,
+            0 /*arithm::divMat<unsigned short, float, signed char>*/,
+            arithm::divMat<unsigned short, float, unsigned short>,
+            arithm::divMat<unsigned short, float, short>,
+            arithm::divMat<unsigned short, float, int>,
+            arithm::divMat<unsigned short, float, float>,
+            arithm::divMat<unsigned short, double, double>
+        },
+        {
+            0 /*arithm::divMat<short, float, unsigned char>*/,
+            0 /*arithm::divMat<short, float, signed char>*/,
+            arithm::divMat<short, float, unsigned short>,
+            arithm::divMat<short, float, short>,
+            arithm::divMat<short, float, int>,
+            arithm::divMat<short, float, float>,
+            arithm::divMat<short, double, double>
+        },
+        {
+            0 /*arithm::divMat<int, float, unsigned char>*/,
+            0 /*arithm::divMat<int, float, signed char>*/,
+            0 /*arithm::divMat<int, float, unsigned short>*/,
+            0 /*arithm::divMat<int, float, short>*/,
+            arithm::divMat<int, float, int>,
+            arithm::divMat<int, float, float>,
+            arithm::divMat<int, double, double>
+        },
+        {
+            0 /*arithm::divMat<float, float, unsigned char>*/,
+            0 /*arithm::divMat<float, float, signed char>*/,
+            0 /*arithm::divMat<float, float, unsigned short>*/,
+            0 /*arithm::divMat<float, float, short>*/,
+            0 /*arithm::divMat<float, float, int>*/,
+            arithm::divMat<float, float, float>,
+            arithm::divMat<float, double, double>
+        },
+        {
+            0 /*arithm::divMat<double, double, unsigned char>*/,
+            0 /*arithm::divMat<double, double, signed char>*/,
+            0 /*arithm::divMat<double, double, unsigned short>*/,
+            0 /*arithm::divMat<double, double, short>*/,
+            0 /*arithm::divMat<double, double, int>*/,
+            0 /*arithm::divMat<double, double, float>*/,
+            arithm::divMat<double, double, double>
         }
+    };
 
-        dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
+    const int sdepth = src1.depth();
+    const int ddepth = dst.depth();
+    const int cn = src1.channels();
 
-        PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-        PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-        PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-        const func_t func = funcs[sdepth][ddepth];
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
 
-        if (!func)
-            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+    const func_t func = funcs[sdepth][ddepth];
 
-        func(src1_, src2_, dst_, scale, stream);
-    }
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, scale, stream);
 }
 
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
+static void divScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            divScalar<unsigned char, float, unsigned char>,
-            divScalar<unsigned char, float, signed char>,
-            divScalar<unsigned char, float, unsigned short>,
-            divScalar<unsigned char, float, short>,
-            divScalar<unsigned char, float, int>,
-            divScalar<unsigned char, float, float>,
-            divScalar<unsigned char, double, double>
+            arithm::divScalar<unsigned char, float, unsigned char>,
+            arithm::divScalar<unsigned char, float, signed char>,
+            arithm::divScalar<unsigned char, float, unsigned short>,
+            arithm::divScalar<unsigned char, float, short>,
+            arithm::divScalar<unsigned char, float, int>,
+            arithm::divScalar<unsigned char, float, float>,
+            arithm::divScalar<unsigned char, double, double>
         },
         {
-            divScalar<signed char, float, unsigned char>,
-            divScalar<signed char, float, signed char>,
-            divScalar<signed char, float, unsigned short>,
-            divScalar<signed char, float, short>,
-            divScalar<signed char, float, int>,
-            divScalar<signed char, float, float>,
-            divScalar<signed char, double, double>
+            arithm::divScalar<signed char, float, unsigned char>,
+            arithm::divScalar<signed char, float, signed char>,
+            arithm::divScalar<signed char, float, unsigned short>,
+            arithm::divScalar<signed char, float, short>,
+            arithm::divScalar<signed char, float, int>,
+            arithm::divScalar<signed char, float, float>,
+            arithm::divScalar<signed char, double, double>
         },
         {
-            0 /*divScalar<unsigned short, float, unsigned char>*/,
-            0 /*divScalar<unsigned short, float, signed char>*/,
-            divScalar<unsigned short, float, unsigned short>,
-            divScalar<unsigned short, float, short>,
-            divScalar<unsigned short, float, int>,
-            divScalar<unsigned short, float, float>,
-            divScalar<unsigned short, double, double>
+            0 /*arithm::divScalar<unsigned short, float, unsigned char>*/,
+            0 /*arithm::divScalar<unsigned short, float, signed char>*/,
+            arithm::divScalar<unsigned short, float, unsigned short>,
+            arithm::divScalar<unsigned short, float, short>,
+            arithm::divScalar<unsigned short, float, int>,
+            arithm::divScalar<unsigned short, float, float>,
+            arithm::divScalar<unsigned short, double, double>
         },
         {
-            0 /*divScalar<short, float, unsigned char>*/,
-            0 /*divScalar<short, float, signed char>*/,
-            divScalar<short, float, unsigned short>,
-            divScalar<short, float, short>,
-            divScalar<short, float, int>,
-            divScalar<short, float, float>,
-            divScalar<short, double, double>
+            0 /*arithm::divScalar<short, float, unsigned char>*/,
+            0 /*arithm::divScalar<short, float, signed char>*/,
+            arithm::divScalar<short, float, unsigned short>,
+            arithm::divScalar<short, float, short>,
+            arithm::divScalar<short, float, int>,
+            arithm::divScalar<short, float, float>,
+            arithm::divScalar<short, double, double>
         },
         {
-            0 /*divScalar<int, float, unsigned char>*/,
-            0 /*divScalar<int, float, signed char>*/,
-            0 /*divScalar<int, float, unsigned short>*/,
-            0 /*divScalar<int, float, short>*/,
-            divScalar<int, float, int>,
-            divScalar<int, float, float>,
-            divScalar<int, double, double>
+            0 /*arithm::divScalar<int, float, unsigned char>*/,
+            0 /*arithm::divScalar<int, float, signed char>*/,
+            0 /*arithm::divScalar<int, float, unsigned short>*/,
+            0 /*arithm::divScalar<int, float, short>*/,
+            arithm::divScalar<int, float, int>,
+            arithm::divScalar<int, float, float>,
+            arithm::divScalar<int, double, double>
         },
         {
-            0 /*divScalar<float, float, unsigned char>*/,
-            0 /*divScalar<float, float, signed char>*/,
-            0 /*divScalar<float, float, unsigned short>*/,
-            0 /*divScalar<float, float, short>*/,
-            0 /*divScalar<float, float, int>*/,
-            divScalar<float, float, float>,
-            divScalar<float, double, double>
+            0 /*arithm::divScalar<float, float, unsigned char>*/,
+            0 /*arithm::divScalar<float, float, signed char>*/,
+            0 /*arithm::divScalar<float, float, unsigned short>*/,
+            0 /*arithm::divScalar<float, float, short>*/,
+            0 /*arithm::divScalar<float, float, int>*/,
+            arithm::divScalar<float, float, float>,
+            arithm::divScalar<float, double, double>
         },
         {
-            0 /*divScalar<double, double, unsigned char>*/,
-            0 /*divScalar<double, double, signed char>*/,
-            0 /*divScalar<double, double, unsigned short>*/,
-            0 /*divScalar<double, double, short>*/,
-            0 /*divScalar<double, double, int>*/,
-            0 /*divScalar<double, double, float>*/,
-            divScalar<double, double, double>
+            0 /*arithm::divScalar<double, double, unsigned char>*/,
+            0 /*arithm::divScalar<double, double, signed char>*/,
+            0 /*arithm::divScalar<double, double, unsigned short>*/,
+            0 /*arithm::divScalar<double, double, short>*/,
+            0 /*arithm::divScalar<double, double, int>*/,
+            0 /*arithm::divScalar<double, double, float>*/,
+            arithm::divScalar<double, double, double>
         }
     };
 
@@ -1341,32 +1273,31 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
         {0                                                    , 0, 0                                                    , 0                                                    }
     };
 
-    if (dtype < 0)
-        dtype = src.depth();
-
     const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth();
     const int cn = src.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn <= 4 );
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-    if (sdepth == CV_64F || ddepth == CV_64F)
+    if (inv)
     {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+        val[0] *= scale;
+        val[1] *= scale;
+        val[2] *= scale;
+        val[3] *= scale;
+    }
+    else
+    {
+        val[0] /= scale;
+        val[1] /= scale;
+        val[2] /= scale;
+        val[3] /= scale;
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    const Scalar nsc(sc.val[0] / scale, sc.val[1] / scale, sc.val[2] / scale, sc.val[3] / scale);
-
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
+    if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv)
     {
-        npp_func(src, nsc, dst, stream);
+        npp_func(src, val, dst, stream);
         return;
     }
 
@@ -1377,113 +1308,39 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, nsc.val[0], dst, stream);
+    func(src, val[0], inv, dst, stream);
 }
 
-namespace arithm
+void cv::gpu::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream)
 {
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[7][7] =
+    if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1)
     {
-        {
-            divInv<unsigned char, float, unsigned char>,
-            divInv<unsigned char, float, signed char>,
-            divInv<unsigned char, float, unsigned short>,
-            divInv<unsigned char, float, short>,
-            divInv<unsigned char, float, int>,
-            divInv<unsigned char, float, float>,
-            divInv<unsigned char, double, double>
-        },
-        {
-            divInv<signed char, float, unsigned char>,
-            divInv<signed char, float, signed char>,
-            divInv<signed char, float, unsigned short>,
-            divInv<signed char, float, short>,
-            divInv<signed char, float, int>,
-            divInv<signed char, float, float>,
-            divInv<signed char, double, double>
-        },
-        {
-            0 /*divInv<unsigned short, float, unsigned char>*/,
-            0 /*divInv<unsigned short, float, signed char>*/,
-            divInv<unsigned short, float, unsigned short>,
-            divInv<unsigned short, float, short>,
-            divInv<unsigned short, float, int>,
-            divInv<unsigned short, float, float>,
-            divInv<unsigned short, double, double>
-        },
-        {
-            0 /*divInv<short, float, unsigned char>*/,
-            0 /*divInv<short, float, signed char>*/,
-            divInv<short, float, unsigned short>,
-            divInv<short, float, short>,
-            divInv<short, float, int>,
-            divInv<short, float, float>,
-            divInv<short, double, double>
-        },
-        {
-            0 /*divInv<int, float, unsigned char>*/,
-            0 /*divInv<int, float, signed char>*/,
-            0 /*divInv<int, float, unsigned short>*/,
-            0 /*divInv<int, float, short>*/,
-            divInv<int, float, int>,
-            divInv<int, float, float>,
-            divInv<int, double, double>
-        },
-        {
-            0 /*divInv<float, float, unsigned char>*/,
-            0 /*divInv<float, float, signed char>*/,
-            0 /*divInv<float, float, unsigned short>*/,
-            0 /*divInv<float, float, short>*/,
-            0 /*divInv<float, float, int>*/,
-            divInv<float, float, float>,
-            divInv<float, double, double>
-        },
-        {
-            0 /*divInv<double, double, unsigned char>*/,
-            0 /*divInv<double, double, signed char>*/,
-            0 /*divInv<double, double, unsigned short>*/,
-            0 /*divInv<double, double, short>*/,
-            0 /*divInv<double, double, int>*/,
-            0 /*divInv<double, double, float>*/,
-            divInv<double, double, double>
-        }
-    };
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
 
-    if (dtype < 0)
-        dtype = src.depth();
+        CV_Assert( src1.size() == src2.size() );
 
-    const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
-    const int cn = src.channels();
+        _dst.create(src1.size(), src1.type());
+        GpuMat dst = _dst.getGpuMat();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn == 1 );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+        arithm::divMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream));
     }
+    else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
+    {
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
 
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
+        CV_Assert( src1.size() == src2.size() );
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+        _dst.create(src1.size(), src1.type());
+        GpuMat dst = _dst.getGpuMat();
 
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, scale, dst, stream);
+        arithm::divMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, divMat, divScalar);
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1498,37 +1355,24 @@ namespace arithm
     void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
+static void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        absDiffMat<unsigned char>,
-        absDiffMat<signed char>,
-        absDiffMat<unsigned short>,
-        absDiffMat<short>,
-        absDiffMat<int>,
-        absDiffMat<float>,
-        absDiffMat<double>
+        arithm::absDiffMat<unsigned char>,
+        arithm::absDiffMat<signed char>,
+        arithm::absDiffMat<unsigned short>,
+        arithm::absDiffMat<short>,
+        arithm::absDiffMat<int>,
+        arithm::absDiffMat<float>,
+        arithm::absDiffMat<double>
     };
 
     const int depth = src1.depth();
     const int cn = src1.channels();
 
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -1548,10 +1392,10 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
             {
                 const int vcols = src1_.cols >> 2;
 
-                absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                              stream);
+                arithm::absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                      stream);
 
                 return;
             }
@@ -1559,10 +1403,10 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
             {
                 const int vcols = src1_.cols >> 1;
 
-                absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                              stream);
+                arithm::absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                                      PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                                      stream);
 
                 return;
             }
@@ -1583,36 +1427,28 @@ namespace arithm
     void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream)
+static void absDiffScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int)
 {
-    using namespace arithm;
-
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        absDiffScalar<unsigned char, float>,
-        absDiffScalar<signed char, float>,
-        absDiffScalar<unsigned short, float>,
-        absDiffScalar<short, float>,
-        absDiffScalar<int, float>,
-        absDiffScalar<float, float>,
-        absDiffScalar<double, double>
+        arithm::absDiffScalar<unsigned char, float>,
+        arithm::absDiffScalar<signed char, float>,
+        arithm::absDiffScalar<unsigned short, float>,
+        arithm::absDiffScalar<short, float>,
+        arithm::absDiffScalar<int, float>,
+        arithm::absDiffScalar<float, float>,
+        arithm::absDiffScalar<double, double>
     };
 
-    const int depth = src1.depth();
+    const int depth = src.depth();
 
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src1.channels() == 1 );
+    funcs[depth](src, val[0], dst, StreamAccessor::getStream(stream));
+}
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    funcs[depth](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
+void cv::gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, absDiffMat, absDiffScalar);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1624,7 +1460,7 @@ namespace arithm
     void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::abs(InputArray _src, OutputArray _dst, Stream& stream)
 {
     using namespace arithm;
 
@@ -1640,6 +1476,8 @@ void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
         absMat<double>
     };
 
+    GpuMat src = _src.getGpuMat();
+
     const int depth = src.depth();
 
     CV_Assert( depth <= CV_64F );
@@ -1651,7 +1489,8 @@ void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
     funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
@@ -1665,7 +1504,7 @@ namespace arithm
     void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::sqr(InputArray _src, OutputArray _dst, Stream& stream)
 {
     using namespace arithm;
 
@@ -1681,6 +1520,8 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
         sqrMat<double>
     };
 
+    GpuMat src = _src.getGpuMat();
+
     const int depth = src.depth();
 
     CV_Assert( depth <= CV_64F );
@@ -1692,7 +1533,8 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
     funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
@@ -1706,7 +1548,7 @@ namespace arithm
     void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::sqrt(InputArray _src, OutputArray _dst, Stream& stream)
 {
     using namespace arithm;
 
@@ -1722,46 +1564,7 @@ void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
         sqrtMat<double>
     };
 
-    const int depth = src.depth();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src.channels() == 1 );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// log
-
-namespace arithm
-{
-    template <typename T>
-    void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-}
-
-void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        logMat<unsigned char>,
-        logMat<signed char>,
-        logMat<unsigned short>,
-        logMat<short>,
-        logMat<int>,
-        logMat<float>,
-        logMat<double>
-    };
+    GpuMat src = _src.getGpuMat();
 
     const int depth = src.depth();
 
@@ -1774,7 +1577,8 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
     funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
@@ -1788,7 +1592,7 @@ namespace arithm
     void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::exp(InputArray _src, OutputArray _dst, Stream& stream)
 {
     using namespace arithm;
 
@@ -1804,6 +1608,8 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
         expMat<double>
     };
 
+    GpuMat src = _src.getGpuMat();
+
     const int depth = src.depth();
 
     CV_Assert( depth <= CV_64F );
@@ -1815,11 +1621,100 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
     funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
+////////////////////////////////////////////////////////////////////////
+// log
+
+namespace arithm
+{
+    template <typename T>
+    void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // declaration only; the definition is not part of this file section
+}
+// Runs arithm::logMat<T> on a single-channel source, picking T from the source depth; _dst is created with the source size and type.
+void cv::gpu::log(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    using namespace arithm;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[] = // indexed by src.depth(): one instantiation per depth, unsigned char first and double last
+    {
+        logMat<unsigned char>,
+        logMat<signed char>,
+        logMat<unsigned short>,
+        logMat<short>,
+        logMat<int>,
+        logMat<float>,
+        logMat<double>
+    };
+
+    GpuMat src = _src.getGpuMat();
+
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F ); // keeps funcs[depth] inside the seven-entry table
+    CV_Assert( src.channels() == 1 ); // multi-channel input is rejected
+
+    if (depth == CV_64F) // double input is only accepted when the device reports NATIVE_DOUBLE support
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    _dst.create(src.size(), src.type()); // create the output first, then take the GpuMat that refers to it
+    GpuMat dst = _dst.getGpuMat();
+
+    funcs[depth](src, dst, StreamAccessor::getStream(stream)); // the selected instantiation receives the cudaStream_t obtained from `stream`
+}
+
+////////////////////////////////////////////////////////////////////////
+// pow
+
+namespace arithm
+{
+    template<typename T> void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); // declaration only; the definition is not part of this file section
+}
+// Runs arithm::pow<T> with exponent `power`, picking T from the source depth; _dst is created with the source size and type.
+void cv::gpu::pow(InputArray _src, double power, OutputArray _dst, Stream& stream)
+{
+    typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[] = // indexed by src.depth(): one instantiation per depth, unsigned char first and double last
+    {
+        arithm::pow<unsigned char>, // spelled with the arithm:: qualifier; this function has no `using namespace arithm` (presumably because an unqualified `pow` here would name cv::gpu::pow)
+        arithm::pow<signed char>,
+        arithm::pow<unsigned short>,
+        arithm::pow<short>,
+        arithm::pow<int>,
+        arithm::pow<float>,
+        arithm::pow<double>
+    };
+
+    GpuMat src = _src.getGpuMat();
+
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert(depth <= CV_64F); // keeps funcs[depth] inside the seven-entry table
+
+    if (depth == CV_64F) // double input is only accepted when the device reports NATIVE_DOUBLE support
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    _dst.create(src.size(), src.type()); // create the output first, then take the GpuMat that refers to it
+    GpuMat dst = _dst.getGpuMat();
+
+    PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step); // each row is viewed as cols * cn elements, so no channel-count assertion is needed
+    PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step); // same geometry as src_, over the output buffer and its step
+
+    funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream)); // the selected instantiation receives the cudaStream_t obtained from `stream`
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // compare
 
@@ -1836,7 +1731,7 @@ namespace arithm
     template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s)
+static void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int cmpop)
 {
     using namespace arithm;
 
@@ -1861,19 +1756,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     const int depth = src1.depth();
     const int cn = src1.channels();
 
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
-    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     static const int codes[] =
     {
@@ -1940,7 +1823,7 @@ namespace
     }
 }
 
-void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream)
+static void cmpScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop)
 {
     using namespace arithm;
 
@@ -1962,46 +1845,57 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
         castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
     };
 
+    if (inv)
+    {
+        // src1 is a scalar; swap it with src2
+        cmpop = cmpop == CMP_LT ? CMP_GT : cmpop == CMP_LE ? CMP_GE :
+            cmpop == CMP_GE ? CMP_LE : cmpop == CMP_GT ? CMP_LT : cmpop;
+    }
+
     const int depth = src.depth();
     const int cn = src.channels();
 
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( cn <= 4 );
-    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+    cast_func[depth](val);
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
+    funcs[depth][cmpop](src, cn, val.val, dst, StreamAccessor::getStream(stream));
+}
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
-
-    cast_func[depth](sc);
-
-    funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream));
+// Comparison entry point: hands cmpMat and cmpScalar to arithm_op as the matrix and scalar
+// handlers, with CV_8U in the depth argument (presumably the depth of the result - confirm in arithm_op).
+// cmpop must be a CMP_* code in [CMP_EQ, CMP_NE]; any other value fails the CV_Assert below.
+void cv::gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream)
+{
+    // Validate the operation code before dispatching: cmpScalar indexes funcs[depth][cmpop]
+    // directly and neither handler range-checks it, so a bad code would read past the tables.
+    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+
+    arithm_op(src1, src2, dst, noArray(), 1.0, CV_8U, stream, cmpMat, cmpScalar, cmpop);
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// Unary bitwise logical operations
+// bitwise_not
 
 namespace arithm
 {
     template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& s)
+void cv::gpu::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& _stream)
 {
     using namespace arithm;
 
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
     const int depth = src.depth();
 
     CV_Assert( depth <= CV_64F );
     CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     const int bcols = (int) (src.cols * src.elemSize());
 
@@ -2035,6 +1922,16 @@ void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, St
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations
 
+namespace
+{
+    enum // operation selector handed to arithm_op by bitwise_and/or/xor; bitMat and bitScalar use their `op` argument as a table index
+    {
+        BIT_OP_AND, // 0: the And entries are listed first in bitMat's funcs32/funcs16/funcs8 and in bitScalar's funcs
+        BIT_OP_OR,  // 1: the Or entries are listed second
+        BIT_OP_XOR  // 2: the Xor entries are listed last; keep this order in sync with those tables
+    };
+}
+
 namespace arithm
 {
     template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -2042,19 +1939,31 @@ namespace arithm
     template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
+static void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int op)
 {
     using namespace arithm;
 
-    const int depth = src1.depth();
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    static const func_t funcs32[] =
+    {
+        bitMatAnd<uint>,
+        bitMatOr<uint>,
+        bitMatXor<uint>
+    };
+    static const func_t funcs16[] =
+    {
+        bitMatAnd<ushort>,
+        bitMatOr<ushort>,
+        bitMatXor<ushort>
+    };
+    static const func_t funcs8[] =
+    {
+        bitMatAnd<uchar>,
+        bitMatOr<uchar>,
+        bitMatXor<uchar>
+    };
 
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     const int bcols = (int) (src1.cols * src1.elemSize());
 
@@ -2062,8 +1971,7 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
     {
         const int vcols = bcols >> 2;
 
-        bitMatAnd<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+        funcs32[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
                     PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
                     PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
                     mask, stream);
@@ -2072,8 +1980,7 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
     {
         const int vcols = bcols >> 1;
 
-        bitMatAnd<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+        funcs16[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
                     PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
                     PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
                     mask, stream);
@@ -2081,111 +1988,13 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
     else
     {
 
-        bitMatAnd<unsigned int>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
+        funcs8[op](PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                   PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                   PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                   mask, stream);
     }
 }
 
-void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
-{
-    using namespace arithm;
-
-    const int depth = src1.depth();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    const int bcols = (int) (src1.cols * src1.elemSize());
-
-    if ((bcols & 3) == 0)
-    {
-        const int vcols = bcols >> 2;
-
-        bitMatOr<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
-
-        bitMatOr<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else
-    {
-
-        bitMatOr<unsigned int>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
-    }
-}
-
-void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
-{
-    using namespace arithm;
-
-    const int depth = src1.depth();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    const int bcols = (int) (src1.cols * src1.elemSize());
-
-    if ((bcols & 3) == 0)
-    {
-        const int vcols = bcols >> 2;
-
-        bitMatXor<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else if ((bcols & 1) == 0)
-    {
-        const int vcols = bcols >> 1;
-
-        bitMatXor<unsigned int>(
-                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
-                    mask, stream);
-    }
-    else
-    {
-
-        bitMatXor<unsigned int>(
-                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
-                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
-                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
-                    mask, stream);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Binary bitwise logical operations with scalars
-
 namespace arithm
 {
     template <typename T> void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2273,18 +2082,34 @@ namespace
     };
 }
 
-void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+static void bitScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
 {
     using namespace arithm;
 
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[5][4] =
+    static const func_t funcs[3][5][4] =
     {
-        {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
-        {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
-        {0,0,0,0},
-        {BitScalar<int, bitScalarAnd<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        {
+            {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
+            {0,0,0,0},
+            {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+            {0,0,0,0},
+            {BitScalar<int, bitScalarAnd<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
+            {0,0,0,0},
+            {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+            {0,0,0,0},
+            {BitScalar<int, bitScalarOr<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
+            {0,0,0,0},
+            {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+            {0,0,0,0},
+            {BitScalar<int, bitScalarXor<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        }
     };
 
     const int depth = src.depth();
@@ -2292,60 +2117,24 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
 
     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
     CV_Assert( cn == 1 || cn == 3 || cn == 4 );
+    CV_Assert( mask.empty() );
 
-    dst.create(src.size(), src.type());
-
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    funcs[op][depth][cn - 1](src, val, dst, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+void cv::gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[5][4] =
-    {
-        {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
-        {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
-        {0,0,0,0},
-        {BitScalar<int, bitScalarOr<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
-    };
-
-    const int depth = src.depth();
-    const int cn = src.channels();
-
-    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
-    CV_Assert( cn == 1 || cn == 3 || cn == 4 );
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_OR); // bitMat / bitScalar are passed as the matrix and scalar handlers; BIT_OP_OR is the table index they use as `op` (presumably forwarded unchanged by arithm_op)
 }
 
-void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+void cv::gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream)
 {
-    using namespace arithm;
+    arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_AND); // bitMat / bitScalar are passed as the matrix and scalar handlers; BIT_OP_AND is the table index they use as `op` (presumably forwarded unchanged by arithm_op)
+}
 
-    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[5][4] =
-    {
-        {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
-        {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
-        {0,0,0,0},
-        {BitScalar<int, bitScalarXor<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
-    };
-
-    const int depth = src.depth();
-    const int cn = src.channels();
-
-    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
-    CV_Assert( cn == 1 || cn == 3 || cn == 4 );
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+void cv::gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_XOR); // bitMat / bitScalar are passed as the matrix and scalar handlers; BIT_OP_XOR is the table index they use as `op` (presumably forwarded unchanged by arithm_op)
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -2404,7 +2193,7 @@ namespace
     };
 }
 
-void cv::gpu::rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream)
+void cv::gpu::rshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
@@ -2416,15 +2205,18 @@ void cv::gpu::rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
         {NppShift<CV_32S, 1, nppiRShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiRShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiRShiftC_32s_C4R>::call},
     };
 
-    CV_Assert(src.depth() < CV_32F);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+    GpuMat src = _src.getGpuMat();
 
-    dst.create(src.size(), src.type());
+    CV_Assert( src.depth() < CV_32F );
+    CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
 
-    funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream)
+void cv::gpu::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
@@ -2436,17 +2228,29 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
         {NppShift<CV_32S, 1, nppiLShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R>::call},
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+    GpuMat src = _src.getGpuMat();
 
-    dst.create(src.size(), src.type());
+    CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S );
+    CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
 
-    funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Minimum and maximum operations
 
+namespace
+{
+    enum // operation selector handed to arithm_op by min (MIN_OP); minMaxMat and minMaxScalar use their `op` argument as a table index
+    {
+        MIN_OP, // 0: the min* entries are listed first in funcs, funcs_v4 and funcs_v2
+        MAX_OP  // 1: the max* entries are listed second; keep this order in sync with those tables
+    };
+}
+
 namespace arithm
 {
     void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
@@ -2460,37 +2264,49 @@ namespace arithm
     template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
+void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int op)
 {
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
+    static const func_t funcs[2][7] =
     {
-        minMat<unsigned char>,
-        minMat<signed char>,
-        minMat<unsigned short>,
-        minMat<short>,
-        minMat<int>,
-        minMat<float>,
-        minMat<double>
+        {
+            minMat<unsigned char>,
+            minMat<signed char>,
+            minMat<unsigned short>,
+            minMat<short>,
+            minMat<int>,
+            minMat<float>,
+            minMat<double>
+        },
+        {
+            maxMat<unsigned char>,
+            maxMat<signed char>,
+            maxMat<unsigned short>,
+            maxMat<short>,
+            maxMat<int>,
+            maxMat<float>,
+            maxMat<double>
+        }
+    };
+
+    typedef void (*opt_func_t)(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    static const opt_func_t funcs_v4[2] =
+    {
+        minMat_v4, maxMat_v4
+    };
+    static const opt_func_t funcs_v2[2] =
+    {
+        minMat_v2, maxMat_v2
     };
 
     const int depth = src1.depth();
     const int cn = src1.channels();
 
     CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -2510,10 +2326,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
             {
                 const int vcols = src1_.cols >> 2;
 
-                minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                funcs_v4[op](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                             stream);
 
                 return;
             }
@@ -2521,96 +2337,17 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
             {
                 const int vcols = src1_.cols >> 1;
 
-                minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
+                funcs_v2[op](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                             PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                             stream);
 
                 return;
             }
         }
     }
 
-    const func_t func = funcs[depth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, stream);
-}
-
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        maxMat<unsigned char>,
-        maxMat<signed char>,
-        maxMat<unsigned short>,
-        maxMat<short>,
-        maxMat<int>,
-        maxMat<float>,
-        maxMat<double>
-    };
-
-    const int depth = src1.depth();
-    const int cn = src1.channels();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (depth == CV_8U || depth == CV_16U)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            if (depth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
-
-                return;
-            }
-            else if (depth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                          stream);
-
-                return;
-            }
-        }
-    }
-
-    const func_t func = funcs[depth];
+    const func_t func = funcs[op][depth];
 
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
@@ -2626,20 +2363,31 @@ namespace
     }
 }
 
-void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+void minMaxScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op)
 {
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
+    static const func_t funcs[2][7] =
     {
-        minScalar<unsigned char>,
-        minScalar<signed char>,
-        minScalar<unsigned short>,
-        minScalar<short>,
-        minScalar<int>,
-        minScalar<float>,
-        minScalar<double>
+        {
+            minScalar<unsigned char>,
+            minScalar<signed char>,
+            minScalar<unsigned short>,
+            minScalar<short>,
+            minScalar<int>,
+            minScalar<float>,
+            minScalar<double>
+        },
+        {
+            maxScalar<unsigned char>,
+            maxScalar<signed char>,
+            maxScalar<unsigned short>,
+            maxScalar<short>,
+            maxScalar<int>,
+            maxScalar<float>,
+            maxScalar<double>
+        }
     };
 
     typedef double (*cast_func_t)(double sc);
@@ -2653,94 +2401,17 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
     CV_Assert( depth <= CV_64F );
     CV_Assert( src.channels() == 1 );
 
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+    funcs[op][depth](src, cast_func[depth](val[0]), dst, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
+void cv::gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        maxScalar<unsigned char>,
-        maxScalar<signed char>,
-        maxScalar<unsigned short>,
-        maxScalar<short>,
-        maxScalar<int>,
-        maxScalar<float>,
-        maxScalar<double>
-    };
-
-    typedef double (*cast_func_t)(double sc);
-    static const cast_func_t cast_func[] =
-    {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
-    };
-
-    const int depth = src.depth();
-
-    CV_Assert( depth <= CV_64F );
-    CV_Assert( src.channels() == 1 );
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
-
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+    arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MIN_OP);
 }
 
-////////////////////////////////////////////////////////////////////////
-// pow
-
-namespace arithm
+void cv::gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream)
 {
-    template<typename T> void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-}
-
-void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
-{
-    typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        arithm::pow<unsigned char>,
-        arithm::pow<signed char>,
-        arithm::pow<unsigned short>,
-        arithm::pow<short>,
-        arithm::pow<int>,
-        arithm::pow<float>,
-        arithm::pow<double>
-    };
-
-    const int depth = src.depth();
-    const int cn = src.channels();
-
-    CV_Assert(depth <= CV_64F);
-
-    if (depth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src.size(), src.type());
-
-    PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step);
-    PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step);
-
-    funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream));
+    arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MAX_OP);
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -2752,7 +2423,7 @@ namespace arithm
     void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int ddepth, Stream& stream)
+void cv::gpu::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream)
 {
     typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7][7] =
@@ -3214,6 +2885,9 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
         }
     };
 
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
     int sdepth1 = src1.depth();
     int sdepth2 = src2.depth();
     ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
@@ -3228,7 +2902,8 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
+    _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
+    GpuMat dst = _dst.getGpuMat();
 
     PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
@@ -3259,8 +2934,10 @@ namespace arithm
     void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
 }
 
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
+double cv::gpu::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& _stream)
 {
+    GpuMat src = _src.getGpuMat();
+
     const int depth = src.depth();
 
     CV_Assert( src.channels() == 1 && depth <= CV_64F );
@@ -3272,9 +2949,10 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     if (src.type() == CV_32FC1 && type == 2/*THRESH_TRUNC*/)
     {
@@ -3323,12 +3001,10 @@ namespace
 {
     typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
 
-    inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
+    void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
     {
         CV_Assert(src.type() == CV_32FC2);
 
-        dst.create(src.size(), CV_32FC1);
-
         NppiSize sz;
         sz.width = src.cols;
         sz.height = src.rows;
@@ -3342,13 +3018,23 @@ namespace
     }
 }
 
-void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::magnitude(InputArray _src, OutputArray _dst, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
+
+    _dst.create(src.size(), CV_32FC1); // allocated here: npp_magnitude no longer creates dst itself
+    GpuMat dst = _dst.getGpuMat();
+
     npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
+
+    _dst.create(src.size(), CV_32FC1); // allocated here: npp_magnitude no longer creates dst itself
+    GpuMat dst = _dst.getGpuMat();
+
     npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
 }
 
@@ -3366,18 +3052,13 @@ namespace cv { namespace gpu { namespace cudev
 
 namespace
 {
-    inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
+    void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
     {
         using namespace ::cv::gpu::cudev::mathfunc;
 
         CV_Assert(x.size() == y.size() && x.type() == y.type());
         CV_Assert(x.depth() == CV_32F);
 
-        if (mag)
-            mag->create(x.size(), x.type());
-        if (angle)
-            angle->create(x.size(), x.type());
-
         GpuMat x1cn = x.reshape(1);
         GpuMat y1cn = y.reshape(1);
         GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
@@ -3386,16 +3067,13 @@ namespace
         cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
     }
 
-    inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
+    void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
     {
         using namespace ::cv::gpu::cudev::mathfunc;
 
         CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
         CV_Assert(mag.depth() == CV_32F);
 
-        x.create(mag.size(), mag.type());
-        y.create(mag.size(), mag.type());
-
         GpuMat mag1cn = mag.reshape(1);
         GpuMat angle1cn = angle.reshape(1);
         GpuMat x1cn = x.reshape(1);
@@ -3405,29 +3083,65 @@ namespace
     }
 }
 
-void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
+void cv::gpu::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
 {
+    GpuMat x = _x.getGpuMat();
+    GpuMat y = _y.getGpuMat();
+
+    _dst.create(x.size(), x.type()); // same type as x: cartToPolar_caller reshapes x and dst to 1 channel, so their widths must agree
+    GpuMat dst = _dst.getGpuMat();
+
     cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
+void cv::gpu::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
 {
+    GpuMat x = _x.getGpuMat();
+    GpuMat y = _y.getGpuMat();
+
+    _dst.create(x.size(), x.type()); // same type as x: cartToPolar_caller reshapes x and dst to 1 channel, so their widths must agree
+    GpuMat dst = _dst.getGpuMat();
+
     cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
+void cv::gpu::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream)
 {
-    cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
+    GpuMat x = _x.getGpuMat();
+    GpuMat y = _y.getGpuMat();
+
+    _dst.create(x.size(), x.type()); // same type as x: cartToPolar_caller reshapes x and the angle output to 1 channel, so their widths must agree
+    GpuMat dst = _dst.getGpuMat();
+
+    cartToPolar_caller(x, y, 0, false, &dst, angleInDegrees, StreamAccessor::getStream(stream)); // null mag pointer: only the angle is computed
 }
 
-void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
+void cv::gpu::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream)
 {
+    GpuMat x = _x.getGpuMat();
+    GpuMat y = _y.getGpuMat();
+
+    _mag.create(x.size(), x.type()); // same type as x: cartToPolar_caller reshapes inputs and outputs to 1 channel, so their widths must agree
+    GpuMat mag = _mag.getGpuMat();
+
+    _angle.create(x.size(), x.type()); // same reasoning as for mag
+    GpuMat angle = _angle.getGpuMat();
+
     cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
 }
 
-void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
+void cv::gpu::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& stream)
 {
-    polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
+    GpuMat mag = _mag.getGpuMat();
+    GpuMat angle = _angle.getGpuMat();
+
+    _x.create(mag.size(), mag.type()); // same type as mag: polarToCart_caller reshapes inputs and outputs to 1 channel, so their widths must agree
+    GpuMat x = _x.getGpuMat();
+
+    _y.create(mag.size(), mag.type()); // same reasoning as for x
+    GpuMat y = _y.getGpuMat();
+
+    polarToCart_caller(mag, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
 }
 
 #endif
diff --git a/modules/gpuarithm/src/reductions.cpp b/modules/gpuarithm/src/reductions.cpp
index b8b24188d4..248fa9a4e7 100644
--- a/modules/gpuarithm/src/reductions.cpp
+++ b/modules/gpuarithm/src/reductions.cpp
@@ -47,41 +47,28 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }
 
-Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
 
-Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+void cv::gpu::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
 
-Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+int cv::gpu::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }
 
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
 
-int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
+void cv::gpu::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
 
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
-
-void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -124,21 +111,13 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // norm
 
-double cv::gpu::norm(const GpuMat& src, int normType)
+double cv::gpu::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
-{
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
-
-double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
-{
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );
 
     GpuMat src_single_channel = src.reshape(1);
 
@@ -154,13 +133,11 @@ double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat
     return std::max(std::abs(min_val), std::abs(max_val));
 }
 
-double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
+double cv::gpu::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
 {
-    CV_Assert(src1.type() == CV_8UC1);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-
 #if CUDA_VERSION < 5050
+    (void) buf;
+
     typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
 
     static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
@@ -175,13 +152,18 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
 #endif
 
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
+    CV_Assert( src1.type() == CV_8UC1 );
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+
     NppiSize sz;
     sz.width  = src1.cols;
     sz.height = src1.rows;
 
-    int funcIdx = normType >> 1;
-
-    double retVal;
+    const int funcIdx = normType >> 1;
 
     DeviceBuffer dbuf;
 
@@ -191,13 +173,14 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     int bufSize;
     buf_size_funcs[funcIdx](sz, &bufSize);
 
-    GpuMat buf(1, bufSize, CV_8UC1);
+    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
 
     nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
 #endif
 
     cudaSafeCall( cudaDeviceSynchronize() );
 
+    double retVal;
     dbuf.download(&retVal);
 
     return retVal;
@@ -220,19 +203,11 @@ namespace sum
     void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 }
 
-Scalar cv::gpu::sum(const GpuMat& src)
+Scalar cv::gpu::sum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::sum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::sum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
@@ -266,19 +241,11 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
     return Scalar(result[0], result[1], result[2], result[3]);
 }
 
-Scalar cv::gpu::absSum(const GpuMat& src)
+Scalar cv::gpu::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::absSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::absSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
@@ -312,19 +279,11 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
     return Scalar(result[0], result[1], result[2], result[3]);
 }
 
-Scalar cv::gpu::sqrSum(const GpuMat& src)
+Scalar cv::gpu::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
     static const func_t funcs[7][5] =
     {
@@ -369,14 +328,11 @@ namespace minMax
     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 }
 
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
+void cv::gpu::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    gpu::minMax(src, minVal, maxVal, mask, buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
-{
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
     static const func_t funcs[] =
     {
@@ -419,15 +375,12 @@ namespace minMaxLoc
     void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 }
 
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
+void cv::gpu::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                        InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
 {
-    GpuMat valBuf, locBuf;
-    gpu::minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
 
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
-{
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
     static const func_t funcs[] =
     {
@@ -472,14 +425,10 @@ namespace countNonZero
     int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
 }
 
-int cv::gpu::countNonZero(const GpuMat& src)
+int cv::gpu::countNonZero(InputArray _src, GpuMat& buf)
 {
-    GpuMat buf;
-    return countNonZero(src, buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
-{
     typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
     static const func_t funcs[] =
     {
@@ -521,8 +470,10 @@ namespace reduce
     void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 }
 
-void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
+void cv::gpu::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
+
     CV_Assert( src.channels() <= 4 );
     CV_Assert( dim == 0 || dim == 1 );
     CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
@@ -530,7 +481,8 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
     if (dtype < 0)
         dtype = src.depth();
 
-    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    GpuMat dst = _dst.getGpuMat();
 
     if (dim == 0)
     {
@@ -691,15 +643,11 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev
 
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
+void cv::gpu::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
 {
-    GpuMat buf;
-    meanStdDev(src, mean, stddev, buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
-{
-    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert( src.type() == CV_8UC1 );
 
     if (!deviceSupports(FEATURE_SET_COMPUTE_13))
         CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
@@ -730,11 +678,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
 //////////////////////////////////////////////////////////////////////////////
 // rectStdDev
 
-void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
+void cv::gpu::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
 {
-    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
+    GpuMat src = _src.getGpuMat();
+    GpuMat sqr = _sqr.getGpuMat();
 
-    dst.create(src.size(), CV_32FC1);
+    CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
+
+    _dst.create(src.size(), CV_32FC1);
+    GpuMat dst = _dst.getGpuMat();
 
     NppiSize sz;
     sz.width = src.cols;
@@ -746,7 +698,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
     nppRect.x = rect.x;
     nppRect.y = rect.y;
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
     NppStreamHandler h(stream);
 
@@ -760,16 +712,12 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 ////////////////////////////////////////////////////////////////////////
 // normalize
 
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
+void cv::gpu::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
 {
-    GpuMat norm_buf;
-    GpuMat cvt_buf;
-    normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
+    GpuMat src = _src.getGpuMat();
 
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
     double scale = 1, shift = 0;
+
     if (norm_type == NORM_MINMAX)
     {
         double smin = 0, smax = 0;
@@ -800,4 +748,116 @@ void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int
     }
 }
 
+////////////////////////////////////////////////////////////////////////
+// integral
+
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
+void cv::gpu::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8UC1 ); // only 8-bit single-channel input is handled
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    cv::Size whole;
+    cv::Point offset;
+    src.locateROI(whole, offset); // offset.x feeds the alignment / row-padding check below
+
+    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
+    {
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer); // rows rounded up to a multiple of 8, cols to a multiple of 64
+
+        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
+
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        dst.setTo(Scalar::all(0), _stream); // zero everything so the extra first row and column stay 0
+
+        GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
+
+        res.copyTo(inner, _stream); // src-sized part of the buffer goes to offset (1, 1) of dst
+    }
+    else
+    {
+    #ifndef HAVE_OPENCV_GPULEGACY
+        throw_no_cuda(); // no fallback path when the gpulegacy module is not available
+    #else
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); // here the buffer is a flat byte scratch area of bufSize bytes
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0) // default stream: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    #endif
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// sqrIntegral
+
+void cv::gpu::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
+{
+#ifndef HAVE_OPENCV_GPULEGACY
+    (void) _src;
+    (void) _dst;
+    (void) buf; // also unused in this configuration; silences the unused-parameter warning
+    (void) _stream;
+    throw_no_cuda(); // no implementation when the gpulegacy module is not available
+#else
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8U );
+
+    NcvSize32u roiSize;
+    roiSize.width = src.cols;
+    roiSize.height = src.rows;
+
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    Ncv32u bufSize;
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
+    ensureSizeIsEnough(1, bufSize, CV_8U, buf); // flat byte scratch area of bufSize bytes
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    NppStStreamHandler h(stream);
+
+    _dst.create(src.rows + 1, src.cols + 1, CV_64F); // one extra row and column compared to src
+    GpuMat dst = _dst.getGpuMat();
+
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
+            dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
+
+    if (stream == 0) // default stream: wait for the device before returning
+        cudaSafeCall( cudaDeviceSynchronize() );
+#endif
+}
+
 #endif
diff --git a/modules/gpuarithm/test/test_arithm.cpp b/modules/gpuarithm/test/test_arithm.cpp
index 93fb0ae845..0534e219d8 100644
--- a/modules/gpuarithm/test/test_arithm.cpp
+++ b/modules/gpuarithm/test/test_arithm.cpp
@@ -419,8 +419,10 @@ GPU_TEST_P(Convolve, Accuracy)
     cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
     cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
 
+    cv::Ptr<cv::gpu::Convolution> conv = cv::gpu::createConvolution();
+
     cv::gpu::GpuMat dst;
-    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
+    conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr);
 
     cv::Mat dst_gold;
     convolveDFT(src, kernel, dst_gold, ccorr);
diff --git a/modules/gpuarithm/test/test_core.cpp b/modules/gpuarithm/test/test_core.cpp
index 45f796dc59..d465aa4634 100644
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);
@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
     cv::Mat src = randomMat(size, type);
     cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
 
+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
     cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::LUT(src, lut, dst_gold);
diff --git a/modules/gpuarithm/test/test_element_operations.cpp b/modules/gpuarithm/test/test_element_operations.cpp
index 89f578fdd1..61ea454ead 100644
--- a/modules/gpuarithm/test/test_element_operations.cpp
+++ b/modules/gpuarithm/test/test_element_operations.cpp
@@ -261,6 +261,94 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Add_Scalar, testing::Combine(
     DEPTH_PAIRS,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// Add_Scalar_First: cv::gpu::add called with the scalar as the first operand, checked against cv::add
+
+PARAM_TEST_CASE(Add_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth; // first = depth of the input matrix, second = depth requested for the output
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device the active one
+    }
+};
+
+GPU_TEST_P(Add_Scalar_First, WithOutMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::add(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0)); // start from zeros, like dst_gold below
+        cv::gpu::add(val, loadMat(mat, useRoi), dst, cv::gpu::GpuMat(), depth.second); // empty GpuMat = no mask
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::add(val, mat, dst_gold, cv::noArray(), depth.second); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // zero tolerance unless a depth >= CV_32F is involved
+    }
+}
+
+GPU_TEST_P(Add_Scalar_First, WithMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); // single-channel 8-bit mask generated with bounds 0.0 and 2.0
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::add(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second); // NOTE(review): the mask is not passed in this branch
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0)); // start from zeros, like dst_gold below (presumably so masked-out elements compare equal)
+        cv::gpu::add(val, loadMat(mat, useRoi), dst, loadMat(mask, useRoi), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::add(val, mat, dst_gold, mask, depth.second); // CPU reference result with the same mask
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // zero tolerance unless a depth >= CV_32F is involved
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Add_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Subtract_Array
 
@@ -476,6 +564,94 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Subtract_Scalar, testing::Combine(
     DEPTH_PAIRS,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// Subtract_Scalar_First: cv::gpu::subtract called with the scalar as the first operand, checked against cv::subtract
+
+PARAM_TEST_CASE(Subtract_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth; // first = depth of the input matrix, second = depth requested for the output
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device the active one
+    }
+};
+
+GPU_TEST_P(Subtract_Scalar_First, WithOutMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::subtract(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0)); // start from zeros, like dst_gold below
+        cv::gpu::subtract(val, loadMat(mat, useRoi), dst, cv::gpu::GpuMat(), depth.second); // empty GpuMat = no mask
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::subtract(val, mat, dst_gold, cv::noArray(), depth.second); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // zero tolerance unless a depth >= CV_32F is involved
+    }
+}
+
+GPU_TEST_P(Subtract_Scalar_First, WithMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0); // single-channel 8-bit mask generated with bounds 0.0 and 2.0
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::subtract(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second); // NOTE(review): the mask is not passed in this branch
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0)); // start from zeros, like dst_gold below (presumably so masked-out elements compare equal)
+        cv::gpu::subtract(val, loadMat(mat, useRoi), dst, loadMat(mask, useRoi), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::subtract(val, mat, dst_gold, mask, depth.second); // CPU reference result with the same mask
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // zero tolerance unless a depth >= CV_32F is involved
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Subtract_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Multiply_Array
 
@@ -756,6 +932,93 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Multiply_Scalar, testing::Combine(
     DEPTH_PAIRS,
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// Multiply_Scalar_First: cv::gpu::multiply called with the scalar as the first operand, checked against cv::multiply
+
+PARAM_TEST_CASE(Multiply_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth; // first = depth of the input matrix, second = depth requested for the output
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device the active one
+    }
+};
+
+GPU_TEST_P(Multiply_Scalar_First, WithOutScale)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::multiply(val, loadMat(mat), dst, 1, depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        cv::gpu::multiply(val, loadMat(mat, useRoi), dst, 1, depth.second); // scale argument fixed at 1
+
+        cv::Mat dst_gold;
+        cv::multiply(val, mat, dst_gold, 1, depth.second); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0); // results may differ from the reference by up to 1.0
+    }
+}
+
+
+GPU_TEST_P(Multiply_Scalar_First, WithScale)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    double scale = randomDouble(0.0, 255.0); // random scale factor passed to both the GPU and the CPU call
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a CV_64F depth on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::multiply(val, loadMat(mat), dst, scale, depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        cv::gpu::multiply(val, loadMat(mat, useRoi), dst, scale, depth.second);
+
+        cv::Mat dst_gold;
+        cv::multiply(val, mat, dst_gold, scale, depth.second); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0); // results may differ from the reference by up to 1.0
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Multiply_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Divide_Array
 
@@ -1036,9 +1299,9 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar, testing::Combine(
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////
-// Divide_Scalar_Inv
+// Divide_Scalar_First
 
-PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+PARAM_TEST_CASE(Divide_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
@@ -1056,7 +1319,7 @@ PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatD
     }
 };
 
-GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
+GPU_TEST_P(Divide_Scalar_First, Accuracy)
 {
     double scale = randomDouble(0.0, 255.0);
     cv::Mat mat = randomMat(size, depth.first, 1.0, 255.0);
@@ -1085,7 +1348,7 @@ GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_Inv, testing::Combine(
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_First, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     DEPTH_PAIRS,
@@ -1170,6 +1433,35 @@ GPU_TEST_P(AbsDiff, Scalar)
     }
 }
 
+GPU_TEST_P(AbsDiff, Scalar_First) // cv::gpu::absdiff with the scalar as the first operand, checked against cv::absdiff
+{
+    cv::Mat src = randomMat(size, depth);
+    cv::Scalar val = randomScalar(0.0, 255.0);
+
+    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // CV_64F input on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::absdiff(val, loadMat(src), dst);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
+        cv::gpu::absdiff(val, loadMat(src, useRoi), dst);
+
+        cv::Mat dst_gold;
+        cv::absdiff(val, src, dst_gold); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth <= CV_32F ? 1.0 : 1e-5); // tolerance 1.0 for depths up to CV_32F, 1e-5 above that
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(GPU_Arithm, AbsDiff, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -1478,6 +1770,65 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Exp, testing::Combine(
                     MatDepth(CV_32F)),
     WHOLE_SUBMAT));
 
+////////////////////////////////////////////////////////////////////////////////
+// Pow: cv::gpu::pow checked against cv::pow
+
+PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device the active one
+    }
+};
+
+GPU_TEST_P(Pow, Accuracy)
+{
+    cv::Mat src = randomMat(size, depth, 0.0, 10.0);
+    double power = randomDouble(2.0, 4.0);
+
+    if (src.depth() < CV_32F) // for depths below CV_32F the exponent is truncated to a whole number
+        power = static_cast<int>(power);
+
+    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // CV_64F input on a device without NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::pow(loadMat(src), power, dst);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code); // NOTE(review): only checked when the call throws; a call that does not throw passes silently
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
+        cv::gpu::pow(loadMat(src, useRoi), power, dst);
+
+        cv::Mat dst_gold;
+        cv::pow(src, power, dst_gold); // CPU reference result
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1); // zero tolerance below CV_32F, 1e-1 otherwise
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Pow, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    ALL_DEPTH,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Compare_Array
 
@@ -2110,65 +2461,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Max, testing::Combine(
     ALL_DEPTH,
     WHOLE_SUBMAT));
 
-////////////////////////////////////////////////////////////////////////////////
-// Pow
-
-PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int depth;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        depth = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Pow, Accuracy)
-{
-    cv::Mat src = randomMat(size, depth, 0.0, 10.0);
-    double power = randomDouble(2.0, 4.0);
-
-    if (src.depth() < CV_32F)
-        power = static_cast<int>(power);
-
-    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::pow(loadMat(src), power, dst);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
-        cv::gpu::pow(loadMat(src, useRoi), power, dst);
-
-        cv::Mat dst_gold;
-        cv::pow(src, power, dst_gold);
-
-        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Pow, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    ALL_DEPTH,
-    WHOLE_SUBMAT));
-
 //////////////////////////////////////////////////////////////////////////////
 // AddWeighted
 
@@ -2234,6 +2526,54 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, AddWeighted, testing::Combine(
     ALL_DEPTH,
     WHOLE_SUBMAT));
 
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Threshold: cv::gpu::threshold checked against cv::threshold for each ThreshOp listed below
+
+CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
+#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
+
+PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int threshOp; // thresholding type handed to both the GPU and the CPU call
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        threshOp = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // make the parameterized device the active one
+    }
+};
+
+GPU_TEST_P(Threshold, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    double maxVal = randomDouble(20.0, 127.0);
+    double thresh = randomDouble(0.0, maxVal); // threshold is generated with maxVal as its upper bound
+
+    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
+
+    cv::Mat dst_gold;
+    cv::threshold(src, dst_gold, thresh, maxVal, threshOp); // CPU reference result
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0); // GPU output must match the reference exactly
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Threshold, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
+    ALL_THRESH_OPS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Magnitude
 
@@ -2452,52 +2792,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, PolarToCart, testing::Combine(
     testing::Values(AngleInDegrees(false), AngleInDegrees(true)),
     WHOLE_SUBMAT));
 
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Threshold
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
-
-PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int threshOp;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        threshOp = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Threshold, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    double maxVal = randomDouble(20.0, 127.0);
-    double thresh = randomDouble(0.0, maxVal);
-
-    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
-    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
-
-    cv::Mat dst_gold;
-    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Threshold, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
-    ALL_THRESH_OPS,
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
diff --git a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
index e7a29b5763..3fe62ec94b 100644
--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@@ -321,7 +321,7 @@ private:
     GpuMat colors_;
     GpuMat weights_;
 
-    Ptr<FilterEngine_GPU> boxFilter_;
+    Ptr<gpu::Filter> boxFilter_;
     GpuMat buf_;
 };
 
diff --git a/modules/gpubgsegm/src/fgd.cpp b/modules/gpubgsegm/src/fgd.cpp
index 1b4038304a..fb14ff172a 100644
--- a/modules/gpubgsegm/src/fgd.cpp
+++ b/modules/gpubgsegm/src/fgd.cpp
@@ -228,11 +228,10 @@ private:
     cv::gpu::GpuMat countBuf_;
 
     cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
     cv::gpu::GpuMat filterBrd_;
 
-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+    cv::Ptr<cv::gpu::Filter> dilateFilter_;
+    cv::Ptr<cv::gpu::Filter> erodeFilter_;
 
     CvMemStorage* storage_;
 };
@@ -305,8 +304,8 @@ void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, cons
         cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
         cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
 
-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+        dilateFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, CV_8UC1, kernel, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, kernel, anchor);
     }
 }
 
@@ -326,7 +325,6 @@ void cv::gpu::FGDStatModel::Impl::release()
     countBuf_.release();
 
     buf_.release();
-    filterBuf_.release();
     filterBrd_.release();
 }
 
@@ -488,14 +486,14 @@ namespace
 
 namespace
 {
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::Filter>& filter, cv::Scalar brdVal)
     {
         cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
+        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst);
     }
 
     void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          cv::Ptr<cv::gpu::Filter>& erodeFilter, cv::Ptr<cv::gpu::Filter>& dilateFilter,
                           const cv::gpu::FGDStatModel::Params& params)
     {
         const int brd = params.perform_morphing;
diff --git a/modules/gpubgsegm/src/gmg.cpp b/modules/gpubgsegm/src/gmg.cpp
index a38cbffaca..b97f0836f4 100644
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@@ -100,7 +100,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
     nfeatures_.setTo(cv::Scalar::all(0));
 
     if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
+        boxFilter_ = cv::gpu::createBoxFilter(CV_8UC1, -1, cv::Size(smoothingRadius, smoothingRadius));
 
     loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
 }
@@ -141,7 +141,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
     // medianBlur
     if (smoothingRadius > 0)
     {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
+        boxFilter_->apply(fgmask, buf_, stream);
         int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
         double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
         cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
diff --git a/modules/gpucodec/doc/videodec.rst b/modules/gpucodec/doc/videodec.rst
index 342203223b..e2da305591 100644
--- a/modules/gpucodec/doc/videodec.rst
+++ b/modules/gpucodec/doc/videodec.rst
@@ -5,20 +5,37 @@ Video Decoding
 
 
 
-gpu::VideoReader_GPU
---------------------
-Video reader class.
+gpucodec::VideoReader
+---------------------
+Video reader interface.
 
-.. ocv:class:: gpu::VideoReader_GPU
+.. ocv:class:: gpucodec::VideoReader
 
 
 
-gpu::VideoReader_GPU::Codec
----------------------------
+gpucodec::VideoReader::nextFrame
+--------------------------------
+Grabs, decodes and returns the next video frame.
 
-Video codecs supported by :ocv:class:`gpu::VideoReader_GPU` .
+.. ocv:function:: bool gpucodec::VideoReader::nextFrame(OutputArray frame)
 
-.. ocv:enum:: gpu::VideoReader_GPU::Codec
+If no frame has been grabbed (there are no more frames in the video file), the method returns ``false`` . The method throws :ocv:class:`Exception` if an error occurs.
+
+
+
+gpucodec::VideoReader::format
+-----------------------------
+Returns information about video file format.
+
+.. ocv:function:: FormatInfo gpucodec::VideoReader::format() const
+
+
+
+gpucodec::Codec
+---------------
+Video codecs supported by :ocv:class:`gpucodec::VideoReader` .
+
+.. ocv:enum:: gpucodec::Codec
 
   .. ocv:emember:: MPEG1 = 0
   .. ocv:emember:: MPEG2
@@ -50,12 +67,12 @@ Video codecs supported by :ocv:class:`gpu::VideoReader_GPU` .
         UYVY (4:2:2)
 
 
-gpu::VideoReader_GPU::ChromaFormat
-----------------------------------
 
-Chroma formats supported by :ocv:class:`gpu::VideoReader_GPU` .
+gpucodec::ChromaFormat
+----------------------
+Chroma formats supported by :ocv:class:`gpucodec::VideoReader` .
 
-.. ocv:enum:: gpu::VideoReader_GPU::ChromaFormat
+.. ocv:enum:: gpucodec::ChromaFormat
 
   .. ocv:emember:: Monochrome = 0
   .. ocv:emember:: YUV420
@@ -63,9 +80,10 @@ Chroma formats supported by :ocv:class:`gpu::VideoReader_GPU` .
   .. ocv:emember:: YUV444
 
 
-gpu::VideoReader_GPU::FormatInfo
---------------------------------
-.. ocv:struct:: gpu::VideoReader_GPU::FormatInfo
+
+gpucodec::FormatInfo
+--------------------
+.. ocv:struct:: gpucodec::FormatInfo
 
 Struct providing information about video file format. ::
 
@@ -78,157 +96,58 @@ Struct providing information about video file format. ::
     };
 
 
-gpu::VideoReader_GPU::VideoReader_GPU
--------------------------------------
-Constructors.
 
-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU()
-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU(const String& filename)
-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>& source)
+gpucodec::createVideoReader
+---------------------------
+Creates video reader.
+
+.. ocv:function:: Ptr<VideoReader> gpucodec::createVideoReader(const String& filename)
+.. ocv:function:: Ptr<VideoReader> gpucodec::createVideoReader(const Ptr<RawVideoSource>& source)
 
     :param filename: Name of the input video file.
 
-    :param source: Video file parser implemented by user.
+    :param source: RAW video source implemented by user.
 
-The constructors initialize video reader. FFMPEG is used to read videos. User can implement own demultiplexing with :ocv:class:`gpu::VideoReader_GPU::VideoSource` .
+FFMPEG is used to read videos. User can implement own demultiplexing with :ocv:class:`gpucodec::RawVideoSource` .
 
 
 
-gpu::VideoReader_GPU::open
---------------------------
-Initializes or reinitializes video reader.
-
-.. ocv:function:: void gpu::VideoReader_GPU::open(const String& filename)
-.. ocv:function:: void gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>& source)
-
-The method opens video reader. Parameters are the same as in the constructor :ocv:func:`gpu::VideoReader_GPU::VideoReader_GPU` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoReader_GPU::isOpened
-------------------------------
-Returns true if video reader has been successfully initialized.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::isOpened() const
-
-
-
-gpu::VideoReader_GPU::close
----------------------------
-Releases the video reader.
-
-.. ocv:function:: void gpu::VideoReader_GPU::close()
-
-
-
-gpu::VideoReader_GPU::read
---------------------------
-Grabs, decodes and returns the next video frame.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::read(GpuMat& image)
-
-If no frames has been grabbed (there are no more frames in video file), the methods return ``false`` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoReader_GPU::format
-----------------------------
-Returns information about video file format.
-
-.. ocv:function:: FormatInfo gpu::VideoReader_GPU::format() const
-
-The method throws :ocv:class:`Exception` if video reader wasn't initialized.
-
-
-
-gpu::VideoReader_GPU::dumpFormat
---------------------------------
-Dump information about video file format to specified stream.
-
-.. ocv:function:: void gpu::VideoReader_GPU::dumpFormat(std::ostream& st)
-
-    :param st: Output stream.
-
-The method throws :ocv:class:`Exception` if video reader wasn't initialized.
-
-
-
-gpu::VideoReader_GPU::VideoSource
------------------------------------
-.. ocv:class:: gpu::VideoReader_GPU::VideoSource
+gpucodec::RawVideoSource
+------------------------
+.. ocv:class:: gpucodec::RawVideoSource
 
 Interface for video demultiplexing. ::
 
-    class VideoSource
+    class RawVideoSource
     {
     public:
-        VideoSource();
-        virtual ~VideoSource() {}
+        virtual ~RawVideoSource() {}
+
+        virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;
 
         virtual FormatInfo format() const = 0;
-        virtual void start() = 0;
-        virtual void stop() = 0;
-        virtual bool isStarted() const = 0;
-        virtual bool hasError() const = 0;
-
-    protected:
-        bool parseVideoData(const unsigned char* data, size_t size, bool endOfStream = false);
     };
 
 User can implement own demultiplexing by implementing this interface.
 
 
 
-gpu::VideoReader_GPU::VideoSource::format
------------------------------------------
-Returns information about video file format.
-
-.. ocv:function:: virtual FormatInfo gpu::VideoReader_GPU::VideoSource::format() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::start
-----------------------------------------
-Starts processing.
-
-.. ocv:function:: virtual void gpu::VideoReader_GPU::VideoSource::start() = 0
-
-Implementation must create own thread with video processing and call periodic :ocv:func:`gpu::VideoReader_GPU::VideoSource::parseVideoData` .
-
-
-
-gpu::VideoReader_GPU::VideoSource::stop
+gpucodec::RawVideoSource::getNextPacket
 ---------------------------------------
-Stops processing.
+Returns next packet with RAW video frame.
 
-.. ocv:function:: virtual void gpu::VideoReader_GPU::VideoSource::stop() = 0
+.. ocv:function:: virtual bool gpucodec::RawVideoSource::getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0
 
-
-
-gpu::VideoReader_GPU::VideoSource::isStarted
---------------------------------------------
-Returns ``true`` if processing was successfully started.
-
-.. ocv:function:: virtual bool gpu::VideoReader_GPU::VideoSource::isStarted() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::hasError
--------------------------------------------
-Returns ``true`` if error occured during processing.
-
-.. ocv:function:: virtual bool gpu::VideoReader_GPU::VideoSource::hasError() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::parseVideoData
--------------------------------------------------
-Parse next video frame. Implementation must call this method after new frame was grabbed.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::VideoSource::parseVideoData(const uchar* data, size_t size, bool endOfStream = false)
-
-    :param data: Pointer to frame data. Can be ``NULL`` if ``endOfStream`` if ``true`` .
+    :param data: Pointer to frame data.
 
     :param size: Size in bytes of current frame.
 
-    :param endOfStream: Indicates that it is end of stream.
+    :param endOfFile: Indicates that it is end of stream.
+
+
+
+gpucodec::RawVideoSource::format
+--------------------------------
+Returns information about video file format.
+
+.. ocv:function:: virtual FormatInfo gpucodec::RawVideoSource::format() const = 0
diff --git a/modules/gpucodec/doc/videoenc.rst b/modules/gpucodec/doc/videoenc.rst
index ec26e27ef7..739ec0d704 100644
--- a/modules/gpucodec/doc/videoenc.rst
+++ b/modules/gpucodec/doc/videoenc.rst
@@ -5,80 +5,25 @@ Video Encoding
 
 
 
-gpu::VideoWriter_GPU
+gpucodec::VideoWriter
 ---------------------
-Video writer class.
+Video writer interface.
 
-.. ocv:class:: gpu::VideoWriter_GPU
+.. ocv:class:: gpucodec::VideoWriter
 
-The class uses H264 video codec.
+The implementation uses H264 video codec.
 
 .. note:: Currently only Windows platform is supported.
 
 
 
-gpu::VideoWriter_GPU::VideoWriter_GPU
--------------------------------------
-Constructors.
-
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU()
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-
-    :param fileName: Name of the output video file. Only AVI file format is supported.
-
-    :param frameSize: Size of the input video frames.
-
-    :param fps: Framerate of the created video stream.
-
-    :param params: Encoder parameters. See :ocv:struct:`gpu::VideoWriter_GPU::EncoderParams` .
-
-    :param format: Surface format of input frames ( ``SF_UYVY`` , ``SF_YUY2`` , ``SF_YV12`` , ``SF_NV12`` , ``SF_IYUV`` , ``SF_BGR`` or ``SF_GRAY``). BGR or gray frames will be converted to YV12 format before encoding, frames with other formats will be used as is.
-
-    :param encoderCallback: Callbacks for video encoder. See :ocv:class:`gpu::VideoWriter_GPU::EncoderCallBack` . Use it if you want to work with raw video stream.
-
-The constructors initialize video writer. FFMPEG is used to write videos. User can implement own multiplexing with :ocv:class:`gpu::VideoWriter_GPU::EncoderCallBack` .
-
-
-
-gpu::VideoWriter_GPU::open
---------------------------
-Initializes or reinitializes video writer.
-
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-
-The method opens video writer. Parameters are the same as in the constructor :ocv:func:`gpu::VideoWriter_GPU::VideoWriter_GPU` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoWriter_GPU::isOpened
-------------------------------
-Returns true if video writer has been successfully initialized.
-
-.. ocv:function:: bool gpu::VideoWriter_GPU::isOpened() const
-
-
-
-gpu::VideoWriter_GPU::close
----------------------------
-Releases the video writer.
-
-.. ocv:function:: void gpu::VideoWriter_GPU::close()
-
-
-
-gpu::VideoWriter_GPU::write
----------------------------
+gpucodec::VideoWriter::write
+----------------------------
 Writes the next video frame.
 
-.. ocv:function:: void gpu::VideoWriter_GPU::write(const cv::gpu::GpuMat& image, bool lastFrame = false)
+.. ocv:function:: void gpucodec::VideoWriter::write(InputArray frame, bool lastFrame = false) = 0
 
-    :param image: The written frame.
+    :param frame: The written frame.
 
     :param lastFrame: Indicates that it is end of stream. The parameter can be ignored.
 
@@ -86,9 +31,34 @@ The method write the specified image to video file. The image must have the same
 
 
 
-gpu::VideoWriter_GPU::EncoderParams
------------------------------------
-.. ocv:struct:: gpu::VideoWriter_GPU::EncoderParams
+gpucodec::createVideoWriter
+---------------------------
+Creates video writer.
+
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
+
+    :param fileName: Name of the output video file. Only AVI file format is supported.
+
+    :param frameSize: Size of the input video frames.
+
+    :param fps: Framerate of the created video stream.
+
+    :param params: Encoder parameters. See :ocv:struct:`gpucodec::EncoderParams` .
+
+    :param format: Surface format of input frames ( ``SF_UYVY`` , ``SF_YUY2`` , ``SF_YV12`` , ``SF_NV12`` , ``SF_IYUV`` , ``SF_BGR`` or ``SF_GRAY``). BGR or gray frames will be converted to YV12 format before encoding, frames with other formats will be used as is.
+
+    :param encoderCallback: Callbacks for video encoder. See :ocv:class:`gpucodec::EncoderCallBack` . Use it if you want to work with raw video stream.
+
+These functions create and initialize a video writer. FFMPEG is used to write videos. Users can implement their own multiplexing with :ocv:class:`gpucodec::EncoderCallBack` .
+
+
+
+gpucodec::EncoderParams
+-----------------------
+.. ocv:struct:: gpucodec::EncoderParams
 
 Different parameters for CUDA video encoder. ::
 
@@ -123,12 +93,12 @@ Different parameters for CUDA video encoder. ::
 
 
 
-gpu::VideoWriter_GPU::EncoderParams::EncoderParams
---------------------------------------------------
+gpucodec::EncoderParams::EncoderParams
+--------------------------------------
 Constructors.
 
-.. ocv:function:: gpu::VideoWriter_GPU::EncoderParams::EncoderParams()
-.. ocv:function:: gpu::VideoWriter_GPU::EncoderParams::EncoderParams(const String& configFile)
+.. ocv:function:: gpucodec::EncoderParams::EncoderParams()
+.. ocv:function:: gpucodec::EncoderParams::EncoderParams(const String& configFile)
 
     :param configFile: Config file name.
 
@@ -136,29 +106,29 @@ Creates default parameters or reads parameters from config file.
 
 
 
-gpu::VideoWriter_GPU::EncoderParams::load
------------------------------------------
+gpucodec::EncoderParams::load
+-----------------------------
 Reads parameters from config file.
 
-.. ocv:function:: void gpu::VideoWriter_GPU::EncoderParams::load(const String& configFile)
+.. ocv:function:: void gpucodec::EncoderParams::load(const String& configFile)
 
     :param configFile: Config file name.
 
 
 
-gpu::VideoWriter_GPU::EncoderParams::save
------------------------------------------
+gpucodec::EncoderParams::save
+-----------------------------
 Saves parameters to config file.
 
-.. ocv:function:: void gpu::VideoWriter_GPU::EncoderParams::save(const String& configFile) const
+.. ocv:function:: void gpucodec::EncoderParams::save(const String& configFile) const
 
     :param configFile: Config file name.
 
 
 
-gpu::VideoWriter_GPU::EncoderCallBack
--------------------------------------
-.. ocv:class:: gpu::VideoWriter_GPU::EncoderCallBack
+gpucodec::EncoderCallBack
+-------------------------
+.. ocv:class:: gpucodec::EncoderCallBack
 
 Callbacks for CUDA video encoder. ::
 
@@ -182,38 +152,38 @@ Callbacks for CUDA video encoder. ::
 
 
 
-gpu::VideoWriter_GPU::EncoderCallBack::acquireBitStream
--------------------------------------------------------
+gpucodec::EncoderCallBack::acquireBitStream
+-------------------------------------------
 Callback function to signal the start of bitstream that is to be encoded.
 
-.. ocv:function:: virtual uchar* gpu::VideoWriter_GPU::EncoderCallBack::acquireBitStream(int* bufferSize) = 0
+.. ocv:function:: virtual uchar* gpucodec::EncoderCallBack::acquireBitStream(int* bufferSize) = 0
 
 Callback must allocate buffer for CUDA encoder and return pointer to it and it's size.
 
 
 
-gpu::VideoWriter_GPU::EncoderCallBack::releaseBitStream
--------------------------------------------------------
+gpucodec::EncoderCallBack::releaseBitStream
+-------------------------------------------
 Callback function to signal that the encoded bitstream is ready to be written to file.
 
-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::releaseBitStream(unsigned char* data, int size) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::releaseBitStream(unsigned char* data, int size) = 0
 
 
 
-gpu::VideoWriter_GPU::EncoderCallBack::onBeginFrame
----------------------------------------------------
+gpucodec::EncoderCallBack::onBeginFrame
+---------------------------------------
 Callback function to signal that the encoding operation on the frame has started.
 
-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::onBeginFrame(int frameNumber, PicType picType) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::onBeginFrame(int frameNumber, PicType picType) = 0
 
     :param picType: Specify frame type (I-Frame, P-Frame or B-Frame).
 
 
 
-gpu::VideoWriter_GPU::EncoderCallBack::onEndFrame
--------------------------------------------------
+gpucodec::EncoderCallBack::onEndFrame
+-------------------------------------
 Callback function signals that the encoding operation on the frame has finished.
 
-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::onEndFrame(int frameNumber, PicType picType) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::onEndFrame(int frameNumber, PicType picType) = 0
 
     :param picType: Specify frame type (I-Frame, P-Frame or B-Frame).
diff --git a/modules/gpucodec/include/opencv2/gpucodec.hpp b/modules/gpucodec/include/opencv2/gpucodec.hpp
index af68c38410..f2e298fd70 100644
--- a/modules/gpucodec/include/opencv2/gpucodec.hpp
+++ b/modules/gpucodec/include/opencv2/gpucodec.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -47,219 +48,159 @@
 #  error gpucodec.hpp header must be compiled as C++
 #endif
 
-#include <iosfwd>
-
 #include "opencv2/core/gpu.hpp"
 
-namespace cv { namespace gpu {
+namespace cv { namespace gpucodec {
 
 ////////////////////////////////// Video Encoding //////////////////////////////////
 
-// Works only under Windows
-// Supports olny H264 video codec and AVI files
-class CV_EXPORTS VideoWriter_GPU
+// Works only under Windows.
+// Supports only H264 video codec and AVI files.
+
+enum SurfaceFormat
+{
+    SF_UYVY = 0,
+    SF_YUY2,
+    SF_YV12,
+    SF_NV12,
+    SF_IYUV,
+    SF_BGR,
+    SF_GRAY = SF_BGR
+};
+
+struct CV_EXPORTS EncoderParams
+{
+    int P_Interval;      // NVVE_P_INTERVAL,
+    int IDR_Period;      // NVVE_IDR_PERIOD,
+    int DynamicGOP;      // NVVE_DYNAMIC_GOP,
+    int RCType;          // NVVE_RC_TYPE,
+    int AvgBitrate;      // NVVE_AVG_BITRATE,
+    int PeakBitrate;     // NVVE_PEAK_BITRATE,
+    int QP_Level_Intra;  // NVVE_QP_LEVEL_INTRA,
+    int QP_Level_InterP; // NVVE_QP_LEVEL_INTER_P,
+    int QP_Level_InterB; // NVVE_QP_LEVEL_INTER_B,
+    int DeblockMode;     // NVVE_DEBLOCK_MODE,
+    int ProfileLevel;    // NVVE_PROFILE_LEVEL,
+    int ForceIntra;      // NVVE_FORCE_INTRA,
+    int ForceIDR;        // NVVE_FORCE_IDR,
+    int ClearStat;       // NVVE_CLEAR_STAT,
+    int DIMode;          // NVVE_SET_DEINTERLACE,
+    int Presets;         // NVVE_PRESETS,
+    int DisableCabac;    // NVVE_DISABLE_CABAC,
+    int NaluFramingType; // NVVE_CONFIGURE_NALU_FRAMING_TYPE
+    int DisableSPSPPS;   // NVVE_DISABLE_SPS_PPS
+
+    EncoderParams();
+    explicit EncoderParams(const String& configFile);
+
+    void load(const String& configFile);
+    void save(const String& configFile) const;
+};
+
+class CV_EXPORTS EncoderCallBack
 {
 public:
-    struct EncoderParams;
-
-    // Callbacks for video encoder, use it if you want to work with raw video stream
-    class EncoderCallBack;
-
-    enum SurfaceFormat
+    enum PicType
     {
-        SF_UYVY = 0,
-        SF_YUY2,
-        SF_YV12,
-        SF_NV12,
-        SF_IYUV,
-        SF_BGR,
-        SF_GRAY = SF_BGR
+        IFRAME = 1,
+        PFRAME = 2,
+        BFRAME = 3
     };
 
-    VideoWriter_GPU();
-    VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    ~VideoWriter_GPU();
+    virtual ~EncoderCallBack() {}
 
-    // all methods throws cv::Exception if error occurs
-    void open(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    void open(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    void open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    void open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+    //! callback function to signal the start of bitstream that is to be encoded
+    //! callback must allocate host buffer for CUDA encoder and return pointer to it and its size
+    virtual uchar* acquireBitStream(int* bufferSize) = 0;
 
-    bool isOpened() const;
-    void close();
+    //! callback function to signal that the encoded bitstream is ready to be written to file
+    virtual void releaseBitStream(unsigned char* data, int size) = 0;
 
-    void write(const cv::gpu::GpuMat& image, bool lastFrame = false);
+    //! callback function to signal that the encoding operation on the frame has started
+    virtual void onBeginFrame(int frameNumber, PicType picType) = 0;
 
-    struct CV_EXPORTS EncoderParams
-    {
-        int       P_Interval;      //    NVVE_P_INTERVAL,
-        int       IDR_Period;      //    NVVE_IDR_PERIOD,
-        int       DynamicGOP;      //    NVVE_DYNAMIC_GOP,
-        int       RCType;          //    NVVE_RC_TYPE,
-        int       AvgBitrate;      //    NVVE_AVG_BITRATE,
-        int       PeakBitrate;     //    NVVE_PEAK_BITRATE,
-        int       QP_Level_Intra;  //    NVVE_QP_LEVEL_INTRA,
-        int       QP_Level_InterP; //    NVVE_QP_LEVEL_INTER_P,
-        int       QP_Level_InterB; //    NVVE_QP_LEVEL_INTER_B,
-        int       DeblockMode;     //    NVVE_DEBLOCK_MODE,
-        int       ProfileLevel;    //    NVVE_PROFILE_LEVEL,
-        int       ForceIntra;      //    NVVE_FORCE_INTRA,
-        int       ForceIDR;        //    NVVE_FORCE_IDR,
-        int       ClearStat;       //    NVVE_CLEAR_STAT,
-        int       DIMode;          //    NVVE_SET_DEINTERLACE,
-        int       Presets;         //    NVVE_PRESETS,
-        int       DisableCabac;    //    NVVE_DISABLE_CABAC,
-        int       NaluFramingType; //    NVVE_CONFIGURE_NALU_FRAMING_TYPE
-        int       DisableSPSPPS;   //    NVVE_DISABLE_SPS_PPS
-
-        EncoderParams();
-        explicit EncoderParams(const String& configFile);
-
-        void load(const String& configFile);
-        void save(const String& configFile) const;
-    };
-
-    EncoderParams getParams() const;
-
-    class CV_EXPORTS EncoderCallBack
-    {
-    public:
-        enum PicType
-        {
-            IFRAME = 1,
-            PFRAME = 2,
-            BFRAME = 3
-        };
-
-        virtual ~EncoderCallBack() {}
-
-        // callback function to signal the start of bitstream that is to be encoded
-        // must return pointer to buffer
-        virtual uchar* acquireBitStream(int* bufferSize) = 0;
-
-        // callback function to signal that the encoded bitstream is ready to be written to file
-        virtual void releaseBitStream(unsigned char* data, int size) = 0;
-
-        // callback function to signal that the encoding operation on the frame has started
-        virtual void onBeginFrame(int frameNumber, PicType picType) = 0;
-
-        // callback function signals that the encoding operation on the frame has finished
-        virtual void onEndFrame(int frameNumber, PicType picType) = 0;
-    };
-
-    class Impl;
-
-private:
-    cv::Ptr<Impl> impl_;
+    //! callback function signals that the encoding operation on the frame has finished
+    virtual void onEndFrame(int frameNumber, PicType picType) = 0;
 };
 
+class CV_EXPORTS VideoWriter
+{
+public:
+    virtual ~VideoWriter() {}
+
+    //! writes the next frame from GPU memory
+    virtual void write(InputArray frame, bool lastFrame = false) = 0;
+
+    virtual EncoderParams getEncoderParams() const = 0;
+};
+
+//! create VideoWriter for specified output file (only AVI file format is supported)
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+
+//! create VideoWriter for user-defined callbacks
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+
 ////////////////////////////////// Video Decoding //////////////////////////////////////////
 
-namespace detail
+enum Codec
 {
-    class FrameQueue;
-    class VideoParser;
-}
+    MPEG1 = 0,
+    MPEG2,
+    MPEG4,
+    VC1,
+    H264,
+    JPEG,
+    H264_SVC,
+    H264_MVC,
 
-class CV_EXPORTS VideoReader_GPU
-{
-public:
-    enum Codec
-    {
-        MPEG1 = 0,
-        MPEG2,
-        MPEG4,
-        VC1,
-        H264,
-        JPEG,
-        H264_SVC,
-        H264_MVC,
-
-        Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   // Y,U,V (4:2:0)
-        Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
-        Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
-        Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
-        Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')),   // UYVY (4:2:2)
-    };
-
-    enum ChromaFormat
-    {
-        Monochrome=0,
-        YUV420,
-        YUV422,
-        YUV444,
-    };
-
-    struct FormatInfo
-    {
-        Codec codec;
-        ChromaFormat chromaFormat;
-        int width;
-        int height;
-    };
-
-    class VideoSource;
-
-    VideoReader_GPU();
-    explicit VideoReader_GPU(const String& filename);
-    explicit VideoReader_GPU(const cv::Ptr<VideoSource>& source);
-
-    ~VideoReader_GPU();
-
-    void open(const String& filename);
-    void open(const cv::Ptr<VideoSource>& source);
-    bool isOpened() const;
-
-    void close();
-
-    bool read(GpuMat& image);
-
-    FormatInfo format() const;
-    void dumpFormat(std::ostream& st);
-
-    class CV_EXPORTS VideoSource
-    {
-    public:
-        VideoSource() : frameQueue_(0), videoParser_(0) {}
-        virtual ~VideoSource() {}
-
-        virtual FormatInfo format() const = 0;
-        virtual void start() = 0;
-        virtual void stop() = 0;
-        virtual bool isStarted() const = 0;
-        virtual bool hasError() const = 0;
-
-        void setFrameQueue(detail::FrameQueue* frameQueue) { frameQueue_ = frameQueue; }
-        void setVideoParser(detail::VideoParser* videoParser) { videoParser_ = videoParser; }
-
-    protected:
-        bool parseVideoData(const uchar* data, size_t size, bool endOfStream = false);
-
-    private:
-        VideoSource(const VideoSource&);
-        VideoSource& operator =(const VideoSource&);
-
-        detail::FrameQueue* frameQueue_;
-        detail::VideoParser* videoParser_;
-    };
-
-    class Impl;
-
-private:
-    cv::Ptr<Impl> impl_;
+    Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   // Y,U,V (4:2:0)
+    Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
+    Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
+    Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
+    Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    // UYVY (4:2:2)
 };
 
-}} // namespace cv { namespace gpu {
+enum ChromaFormat
+{
+    Monochrome = 0,
+    YUV420,
+    YUV422,
+    YUV444
+};
 
-namespace cv {
+struct FormatInfo
+{
+    Codec codec;
+    ChromaFormat chromaFormat;
+    int width;
+    int height;
+};
 
-template <> CV_EXPORTS void Ptr<cv::gpu::VideoWriter_GPU::Impl>::delete_obj();
-template <> CV_EXPORTS void Ptr<cv::gpu::VideoReader_GPU::Impl>::delete_obj();
+class CV_EXPORTS VideoReader
+{
+public:
+    virtual ~VideoReader() {}
 
-}
+    virtual bool nextFrame(OutputArray frame) = 0;
+
+    virtual FormatInfo format() const = 0;
+};
+
+class CV_EXPORTS RawVideoSource
+{
+public:
+    virtual ~RawVideoSource() {}
+
+    virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;
+
+    virtual FormatInfo format() const = 0;
+};
+
+CV_EXPORTS Ptr<VideoReader> createVideoReader(const String& filename);
+CV_EXPORTS Ptr<VideoReader> createVideoReader(const Ptr<RawVideoSource>& source);
+
+}} // namespace cv { namespace gpucodec {
 
 #endif /* __OPENCV_GPUCODEC_HPP__ */
diff --git a/modules/gpucodec/perf/perf_video.cpp b/modules/gpucodec/perf/perf_video.cpp
index 8f5e1700ea..f389605d05 100644
--- a/modules/gpucodec/perf/perf_video.cpp
+++ b/modules/gpucodec/perf/perf_video.cpp
@@ -74,12 +74,11 @@ PERF_TEST_P(FileName, VideoReader, Values("gpu/video/768x576.avi", "gpu/video/19
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::VideoReader_GPU d_reader(inputFile);
-        ASSERT_TRUE( d_reader.isOpened() );
+        cv::Ptr<cv::gpucodec::VideoReader> d_reader = cv::gpucodec::createVideoReader(inputFile);
 
         cv::gpu::GpuMat frame;
 
-        TEST_CYCLE_N(10) d_reader.read(frame);
+        TEST_CYCLE_N(10) d_reader->nextFrame(frame);
 
         GPU_SANITY_CHECK(frame);
     }
@@ -119,7 +118,7 @@ PERF_TEST_P(FileName, VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/19
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::VideoWriter_GPU d_writer;
+        cv::Ptr<cv::gpucodec::VideoWriter> d_writer;
 
         cv::gpu::GpuMat d_frame;
 
@@ -130,11 +129,11 @@ PERF_TEST_P(FileName, VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/19
 
             d_frame.upload(frame);
 
-            if (!d_writer.isOpened())
-                d_writer.open(outputFile, frame.size(), FPS);
+            if (d_writer.empty())
+                d_writer = cv::gpucodec::createVideoWriter(outputFile, frame.size(), FPS);
 
             startTimer(); next();
-            d_writer.write(d_frame);
+            d_writer->write(d_frame);
             stopTimer();
         }
     }
diff --git a/modules/gpucodec/src/cuda/nv12_to_rgb.cu b/modules/gpucodec/src/cuda/nv12_to_rgb.cu
index 536ba2715f..1de916e5a3 100644
--- a/modules/gpucodec/src/cuda/nv12_to_rgb.cu
+++ b/modules/gpucodec/src/cuda/nv12_to_rgb.cu
@@ -51,12 +51,7 @@
 
 namespace cv { namespace gpu { namespace cudev
 {
-    __constant__ float constHueColorSpaceMat[9];
-
-    void loadHueCSC(float hueCSC[9])
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
-    }
+    __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};
 
     __device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
     {
diff --git a/modules/gpucodec/src/cuvid_video_source.cpp b/modules/gpucodec/src/cuvid_video_source.cpp
index 73d6d24263..477951e931 100644
--- a/modules/gpucodec/src/cuvid_video_source.cpp
+++ b/modules/gpucodec/src/cuvid_video_source.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -44,7 +45,11 @@
 
 #ifdef HAVE_NVCUVID
 
-cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
+cv::gpucodec::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
 {
     CUVIDSOURCEPARAMS params;
     std::memset(&params, 0, sizeof(CUVIDSOURCEPARAMS));
@@ -55,51 +60,51 @@ cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
     params.pfnAudioDataHandler = 0;
 
     // now create the actual source
-    CUresult res = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
-    if (res == CUDA_ERROR_INVALID_SOURCE)
-        throw std::runtime_error("Unsupported video source");
-    cuSafeCall( res );
+    CUresult cuRes = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
+    if (cuRes == CUDA_ERROR_INVALID_SOURCE)
+        throw std::runtime_error("");
+    cuSafeCall( cuRes );
 
     CUVIDEOFORMAT vidfmt;
     cuSafeCall( cuvidGetSourceVideoFormat(videoSource_, &vidfmt, 0) );
 
-    format_.codec = static_cast<VideoReader_GPU::Codec>(vidfmt.codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(vidfmt.chroma_format);
+    format_.codec = static_cast<Codec>(vidfmt.codec);
+    format_.chromaFormat = static_cast<ChromaFormat>(vidfmt.chroma_format);
     format_.width = vidfmt.coded_width;
     format_.height = vidfmt.coded_height;
 }
 
-cv::gpu::detail::CuvidVideoSource::~CuvidVideoSource()
+cv::gpucodec::detail::CuvidVideoSource::~CuvidVideoSource()
 {
     cuvidDestroyVideoSource(videoSource_);
 }
 
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::CuvidVideoSource::format() const
+FormatInfo cv::gpucodec::detail::CuvidVideoSource::format() const
 {
     return format_;
 }
 
-void cv::gpu::detail::CuvidVideoSource::start()
+void cv::gpucodec::detail::CuvidVideoSource::start()
 {
     cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Started) );
 }
 
-void cv::gpu::detail::CuvidVideoSource::stop()
+void cv::gpucodec::detail::CuvidVideoSource::stop()
 {
     cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Stopped) );
 }
 
-bool cv::gpu::detail::CuvidVideoSource::isStarted() const
+bool cv::gpucodec::detail::CuvidVideoSource::isStarted() const
 {
     return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Started);
 }
 
-bool cv::gpu::detail::CuvidVideoSource::hasError() const
+bool cv::gpucodec::detail::CuvidVideoSource::hasError() const
 {
     return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Error);
 }
 
-int CUDAAPI cv::gpu::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
+int CUDAAPI cv::gpucodec::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
 {
     CuvidVideoSource* thiz = static_cast<CuvidVideoSource*>(userData);
 
diff --git a/modules/gpucodec/src/cuvid_video_source.h b/modules/gpucodec/src/cuvid_video_source.hpp
similarity index 88%
rename from modules/gpucodec/src/cuvid_video_source.h
rename to modules/gpucodec/src/cuvid_video_source.hpp
index a4a0e85211..c2f0e2f571 100644
--- a/modules/gpucodec/src/cuvid_video_source.h
+++ b/modules/gpucodec/src/cuvid_video_source.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,25 +41,25 @@
 //
 //M*/
 
-#ifndef __CUVUD_VIDEO_SOURCE_H__
-#define __CUVUD_VIDEO_SOURCE_H__
-
-#include "opencv2/core/private.gpu.hpp"
-#include "opencv2/gpucodec.hpp"
-#include "thread.h"
+#ifndef __CUVID_VIDEO_SOURCE_HPP__
+#define __CUVID_VIDEO_SOURCE_HPP__
 
 #include <nvcuvid.h>
 
-namespace cv { namespace gpu { namespace detail
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "video_source.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
 {
 
-class CuvidVideoSource : public VideoReader_GPU::VideoSource
+class CuvidVideoSource : public VideoSource
 {
 public:
     explicit CuvidVideoSource(const String& fname);
     ~CuvidVideoSource();
 
-    VideoReader_GPU::FormatInfo format() const;
+    FormatInfo format() const;
     void start();
     void stop();
     bool isStarted() const;
@@ -78,9 +79,9 @@ private:
     static int CUDAAPI HandleVideoData(void* pUserData, CUVIDSOURCEDATAPACKET* pPacket);
 
     CUvideosource videoSource_;
-    VideoReader_GPU::FormatInfo format_;
+    FormatInfo format_;
 };
 
 }}}
 
-#endif // __CUVUD_VIDEO_SOURCE_H__
+#endif // __CUVID_VIDEO_SOURCE_HPP__
diff --git a/modules/gpucodec/src/ffmpeg_video_source.cpp b/modules/gpucodec/src/ffmpeg_video_source.cpp
index 6ba09284dc..b5a73875bc 100644
--- a/modules/gpucodec/src/ffmpeg_video_source.cpp
+++ b/modules/gpucodec/src/ffmpeg_video_source.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -48,6 +49,10 @@
     #include "../src/cap_ffmpeg_impl.hpp"
 #endif
 
+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
 namespace
 {
     Create_InputMediaStream_FFMPEG_Plugin create_InputMediaStream_FFMPEG_p = 0;
@@ -94,7 +99,7 @@ namespace
     }
 }
 
-cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
+cv::gpucodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
     stream_(0)
 {
     CV_Assert( init_MediaStream_FFMPEG() );
@@ -106,75 +111,33 @@ cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
 
     stream_ = create_InputMediaStream_FFMPEG_p(fname.c_str(), &codec, &chroma_format, &width, &height);
     if (!stream_)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");
 
-    format_.codec = static_cast<VideoReader_GPU::Codec>(codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(chroma_format);
+    format_.codec = static_cast<Codec>(codec);
+    format_.chromaFormat = static_cast<ChromaFormat>(chroma_format);
     format_.width = width;
     format_.height = height;
 }
 
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::FFmpegVideoSource::format() const
+cv::gpucodec::detail::FFmpegVideoSource::~FFmpegVideoSource()
+{
+    if (stream_)
+        release_InputMediaStream_FFMPEG_p(stream_);
+}
+
+FormatInfo cv::gpucodec::detail::FFmpegVideoSource::format() const
 {
     return format_;
 }
 
-void cv::gpu::detail::FFmpegVideoSource::start()
+bool cv::gpucodec::detail::FFmpegVideoSource::getNextPacket(unsigned char** data, int* size, bool* bEndOfFile)
 {
-    stop_ = false;
-    hasError_ = false;
-    thread_ = new Thread(readLoop, this);
-}
+    int endOfFile;
 
-void cv::gpu::detail::FFmpegVideoSource::stop()
-{
-    stop_ = true;
-    thread_->wait();
-    thread_.release();
-}
+    int res = read_InputMediaStream_FFMPEG_p(stream_, data, size, &endOfFile);
 
-bool cv::gpu::detail::FFmpegVideoSource::isStarted() const
-{
-    return !stop_;
-}
-
-bool cv::gpu::detail::FFmpegVideoSource::hasError() const
-{
-    return hasError_;
-}
-
-void cv::gpu::detail::FFmpegVideoSource::readLoop(void* userData)
-{
-    FFmpegVideoSource* thiz = static_cast<FFmpegVideoSource*>(userData);
-
-    for (;;)
-    {
-        unsigned char* data;
-        int size;
-        int endOfFile;
-
-        if (!read_InputMediaStream_FFMPEG_p(thiz->stream_, &data, &size, &endOfFile))
-        {
-            thiz->hasError_ = !endOfFile;
-            break;
-        }
-
-        if (!thiz->parseVideoData(data, size))
-        {
-            thiz->hasError_ = true;
-            break;
-        }
-
-        if (thiz->stop_)
-            break;
-    }
-
-    thiz->parseVideoData(0, 0, true);
-}
-
-template <> void cv::Ptr<InputMediaStream_FFMPEG>::delete_obj()
-{
-    if (obj) release_InputMediaStream_FFMPEG_p(obj);
+    *bEndOfFile = (endOfFile != 0);
+    return res != 0;
 }
 
 #endif // HAVE_CUDA
diff --git a/modules/gpucodec/src/ffmpeg_video_source.h b/modules/gpucodec/src/ffmpeg_video_source.hpp
similarity index 75%
rename from modules/gpucodec/src/ffmpeg_video_source.h
rename to modules/gpucodec/src/ffmpeg_video_source.hpp
index d097785d77..6ea59ddac1 100644
--- a/modules/gpucodec/src/ffmpeg_video_source.h
+++ b/modules/gpucodec/src/ffmpeg_video_source.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,43 +41,31 @@
 //
 //M*/
 
-#ifndef __FFMPEG_VIDEO_SOURCE_H__
-#define __FFMPEG_VIDEO_SOURCE_H__
+#ifndef __FFMPEG_VIDEO_SOURCE_HPP__
+#define __FFMPEG_VIDEO_SOURCE_HPP__
 
 #include "opencv2/gpucodec.hpp"
-#include "thread.h"
 
 struct InputMediaStream_FFMPEG;
 
-namespace cv { namespace gpu { namespace detail {
+namespace cv { namespace gpucodec { namespace detail {
 
-class FFmpegVideoSource : public VideoReader_GPU::VideoSource
+class FFmpegVideoSource : public RawVideoSource
 {
 public:
     FFmpegVideoSource(const String& fname);
+    ~FFmpegVideoSource();
 
-    VideoReader_GPU::FormatInfo format() const;
-    void start();
-    void stop();
-    bool isStarted() const;
-    bool hasError() const;
+    bool getNextPacket(unsigned char** data, int* size, bool* endOfFile);
+
+    FormatInfo format() const;
 
 private:
-    VideoReader_GPU::FormatInfo format_;
+    FormatInfo format_;
 
-    cv::Ptr<InputMediaStream_FFMPEG> stream_;
-
-    cv::Ptr<Thread> thread_;
-    volatile bool stop_;
-    volatile bool hasError_;
-
-    static void readLoop(void* userData);
+    InputMediaStream_FFMPEG* stream_;
 };
 
 }}}
 
-namespace cv {
-    template <> void Ptr<InputMediaStream_FFMPEG>::delete_obj();
-}
-
-#endif // __FFMPEG_VIDEO_SOURCE_H__
+#endif // __FFMPEG_VIDEO_SOURCE_HPP__
diff --git a/modules/gpucodec/src/frame_queue.cpp b/modules/gpucodec/src/frame_queue.cpp
index 2c5045500d..f9141d84f5 100644
--- a/modules/gpucodec/src/frame_queue.cpp
+++ b/modules/gpucodec/src/frame_queue.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -44,16 +45,16 @@
 
 #ifdef HAVE_NVCUVID
 
-cv::gpu::detail::FrameQueue::FrameQueue() :
+cv::gpucodec::detail::FrameQueue::FrameQueue() :
     endOfDecode_(0),
     framesInQueue_(0),
     readPosition_(0)
 {
     std::memset(displayQueue_, 0, sizeof(displayQueue_));
-    std::memset((void*)isFrameInUse_, 0, sizeof(isFrameInUse_));
+    std::memset((void*) isFrameInUse_, 0, sizeof(isFrameInUse_));
 }
 
-bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
+bool cv::gpucodec::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
 {
     while (isInUse(pictureIndex))
     {
@@ -67,7 +68,7 @@ bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
     return true;
 }
 
-void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
+void cv::gpucodec::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
 {
     // Mark the frame as 'in-use' so we don't re-use it for decoding until it is no longer needed
     // for display
@@ -98,7 +99,7 @@ void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
     } while (!isEndOfDecode());
 }
 
-bool cv::gpu::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
+bool cv::gpucodec::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
 {
     AutoLock autoLock(mtx_);
 
diff --git a/modules/gpucodec/src/frame_queue.h b/modules/gpucodec/src/frame_queue.hpp
similarity index 93%
rename from modules/gpucodec/src/frame_queue.h
rename to modules/gpucodec/src/frame_queue.hpp
index d9a4433b3d..c3b427b74f 100644
--- a/modules/gpucodec/src/frame_queue.h
+++ b/modules/gpucodec/src/frame_queue.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,15 +41,15 @@
 //
 //M*/
 
-#ifndef __FRAME_QUEUE_H__
-#define __FRAME_QUEUE_H__
+#ifndef __FRAME_QUEUE_HPP__
+#define __FRAME_QUEUE_HPP__
 
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.gpu.hpp"
 
 #include <nvcuvid.h>
 
-namespace cv { namespace gpu { namespace detail
+namespace cv { namespace gpucodec { namespace detail
 {
 
 class FrameQueue
@@ -94,4 +95,4 @@ private:
 
 }}}
 
-#endif // __FRAME_QUEUE_H__
+#endif // __FRAME_QUEUE_HPP__
diff --git a/modules/gpucodec/src/precomp.hpp b/modules/gpucodec/src/precomp.hpp
index c8580c9fe3..7cef1b7a94 100644
--- a/modules/gpucodec/src/precomp.hpp
+++ b/modules/gpucodec/src/precomp.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -66,12 +67,13 @@
         #include <unistd.h>
     #endif
 
-    #include "thread.h"
-    #include "ffmpeg_video_source.h"
-    #include "cuvid_video_source.h"
-    #include "frame_queue.h"
-    #include "video_decoder.h"
-    #include "video_parser.h"
+    #include "thread.hpp"
+    #include "video_source.hpp"
+    #include "ffmpeg_video_source.hpp"
+    #include "cuvid_video_source.hpp"
+    #include "frame_queue.hpp"
+    #include "video_decoder.hpp"
+    #include "video_parser.hpp"
 
     #include "../src/cap_ffmpeg_api.hpp"
 #endif
diff --git a/modules/gpucodec/src/thread.cpp b/modules/gpucodec/src/thread.cpp
index db9f3de39b..b936d8e21a 100644
--- a/modules/gpucodec/src/thread.cpp
+++ b/modules/gpucodec/src/thread.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -44,7 +45,7 @@
 
 #ifdef HAVE_NVCUVID
 
-using namespace cv::gpu::detail;
+using namespace cv::gpucodec::detail;
 
 #ifdef WIN32
 
@@ -66,7 +67,7 @@ namespace
     }
 }
 
-class cv::gpu::detail::Thread::Impl
+class cv::gpucodec::detail::Thread::Impl
 {
 public:
     Impl(Thread::Func func, void* userData)
@@ -119,7 +120,7 @@ namespace
     }
 }
 
-class cv::gpu::detail::Thread::Impl
+class cv::gpucodec::detail::Thread::Impl
 {
 public:
     Impl(Thread::Func func, void* userData)
@@ -147,17 +148,17 @@ private:
 
 #endif
 
-cv::gpu::detail::Thread::Thread(Func func, void* userData) :
+cv::gpucodec::detail::Thread::Thread(Func func, void* userData) :
     impl_(new Impl(func, userData))
 {
 }
 
-void cv::gpu::detail::Thread::wait()
+void cv::gpucodec::detail::Thread::wait()
 {
     impl_->wait();
 }
 
-void cv::gpu::detail::Thread::sleep(int ms)
+void cv::gpucodec::detail::Thread::sleep(int ms)
 {
 #ifdef WIN32
     ::Sleep(ms);
@@ -166,7 +167,7 @@ void cv::gpu::detail::Thread::sleep(int ms)
 #endif
 }
 
-template <> void cv::Ptr<cv::gpu::detail::Thread::Impl>::delete_obj()
+template <> void cv::Ptr<cv::gpucodec::detail::Thread::Impl>::delete_obj()
 {
     if (obj) delete obj;
 }
diff --git a/modules/gpucodec/src/thread.h b/modules/gpucodec/src/thread.hpp
similarity index 87%
rename from modules/gpucodec/src/thread.h
rename to modules/gpucodec/src/thread.hpp
index 1489f5830b..ccda5b5c7c 100644
--- a/modules/gpucodec/src/thread.h
+++ b/modules/gpucodec/src/thread.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,12 +41,12 @@
 //
 //M*/
 
-#ifndef __THREAD_WRAPPERS_H__
-#define __THREAD_WRAPPERS_H__
+#ifndef __THREAD_WRAPPERS_HPP__
+#define __THREAD_WRAPPERS_HPP__
 
 #include "opencv2/core.hpp"
 
-namespace cv { namespace gpu { namespace detail {
+namespace cv { namespace gpucodec { namespace detail {
 
 class Thread
 {
@@ -67,7 +68,7 @@ private:
 }}}
 
 namespace cv {
-    template <> void Ptr<cv::gpu::detail::Thread::Impl>::delete_obj();
+    template <> void Ptr<cv::gpucodec::detail::Thread::Impl>::delete_obj();
 }
 
-#endif // __THREAD_WRAPPERS_H__
+#endif // __THREAD_WRAPPERS_HPP__
diff --git a/modules/gpucodec/src/video_decoder.cpp b/modules/gpucodec/src/video_decoder.cpp
index 7e28e872bb..d734ef363a 100644
--- a/modules/gpucodec/src/video_decoder.cpp
+++ b/modules/gpucodec/src/video_decoder.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -44,7 +45,7 @@
 
 #ifdef HAVE_NVCUVID
 
-void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& videoFormat)
+void cv::gpucodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
 {
     release();
 
@@ -103,7 +104,7 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
     cuSafeCall( cuvidCreateDecoder(&decoder_, &createInfo_) );
 }
 
-void cv::gpu::detail::VideoDecoder::release()
+void cv::gpucodec::detail::VideoDecoder::release()
 {
     if (decoder_)
     {
diff --git a/modules/gpucodec/src/video_decoder.h b/modules/gpucodec/src/video_decoder.hpp
similarity index 85%
rename from modules/gpucodec/src/video_decoder.h
rename to modules/gpucodec/src/video_decoder.hpp
index 7a36335cc3..05a92f2664 100644
--- a/modules/gpucodec/src/video_decoder.h
+++ b/modules/gpucodec/src/video_decoder.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,21 +41,21 @@
 //
 //M*/
 
-#ifndef __VIDEO_DECODER_H__
-#define __VIDEO_DECODER_H__
+#ifndef __VIDEO_DECODER_HPP__
+#define __VIDEO_DECODER_HPP__
+
+#include <nvcuvid.h>
 
 #include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpucodec.hpp"
 
-#include <nvcuvid.h>
-
-namespace cv { namespace gpu { namespace detail
+namespace cv { namespace gpucodec { namespace detail
 {
 
 class VideoDecoder
 {
 public:
-    VideoDecoder(const VideoReader_GPU::FormatInfo& videoFormat, CUvideoctxlock lock) : lock_(lock), decoder_(0)
+    VideoDecoder(const FormatInfo& videoFormat, CUvideoctxlock lock) : lock_(lock), decoder_(0)
     {
         create(videoFormat);
     }
@@ -64,7 +65,7 @@ public:
         release();
     }
 
-    void create(const VideoReader_GPU::FormatInfo& videoFormat);
+    void create(const FormatInfo& videoFormat);
     void release();
 
     // Get the code-type currently used.
@@ -84,17 +85,17 @@ public:
         return cuvidDecodePicture(decoder_, picParams) == CUDA_SUCCESS;
     }
 
-    cv::gpu::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
+    gpu::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
     {
         CUdeviceptr ptr;
         unsigned int pitch;
 
         cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) );
 
-        return GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch);
+        return gpu::GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch);
     }
 
-    void unmapFrame(cv::gpu::GpuMat& frame)
+    void unmapFrame(gpu::GpuMat& frame)
     {
         cuSafeCall( cuvidUnmapVideoFrame(decoder_, (CUdeviceptr) frame.data) );
         frame.release();
@@ -108,4 +109,4 @@ private:
 
 }}}
 
-#endif // __VIDEO_DECODER_H__
+#endif // __VIDEO_DECODER_HPP__
diff --git a/modules/gpucodec/src/video_parser.cpp b/modules/gpucodec/src/video_parser.cpp
index 620f85fe8f..66aab62ad1 100644
--- a/modules/gpucodec/src/video_parser.cpp
+++ b/modules/gpucodec/src/video_parser.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -44,11 +45,11 @@
 
 #ifdef HAVE_NVCUVID
 
-cv::gpu::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue* frameQueue) :
+cv::gpucodec::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue* frameQueue) :
     videoDecoder_(videoDecoder), frameQueue_(frameQueue), unparsedPackets_(0), hasError_(false)
 {
     CUVIDPARSERPARAMS params;
-    memset(&params, 0, sizeof(CUVIDPARSERPARAMS));
+    std::memset(&params, 0, sizeof(CUVIDPARSERPARAMS));
 
     params.CodecType              = videoDecoder->codec();
     params.ulMaxNumDecodeSurfaces = videoDecoder->maxDecodeSurfaces();
@@ -61,7 +62,7 @@ cv::gpu::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue
     cuSafeCall( cuvidCreateVideoParser(&parser_, &params) );
 }
 
-bool cv::gpu::detail::VideoParser::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
+bool cv::gpucodec::detail::VideoParser::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
 {
     CUVIDSOURCEDATAPACKET packet;
     std::memset(&packet, 0, sizeof(CUVIDSOURCEDATAPACKET));
@@ -95,7 +96,7 @@ bool cv::gpu::detail::VideoParser::parseVideoData(const unsigned char* data, siz
     return !frameQueue_->isEndOfDecode();
 }
 
-int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CUVIDEOFORMAT* format)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandleVideoSequence(void* userData, CUVIDEOFORMAT* format)
 {
     VideoParser* thiz = static_cast<VideoParser*>(userData);
 
@@ -106,10 +107,10 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CU
         format->coded_height  != thiz->videoDecoder_->frameHeight() ||
         format->chroma_format != thiz->videoDecoder_->chromaFormat())
     {
-        VideoReader_GPU::FormatInfo newFormat;
+        FormatInfo newFormat;
 
-        newFormat.codec = static_cast<VideoReader_GPU::Codec>(format->codec);
-        newFormat.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(format->chroma_format);
+        newFormat.codec = static_cast<Codec>(format->codec);
+        newFormat.chromaFormat = static_cast<ChromaFormat>(format->chroma_format);
         newFormat.width = format->coded_width;
         newFormat.height = format->coded_height;
 
@@ -127,7 +128,7 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CU
     return true;
 }
 
-int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDecode(void* userData, CUVIDPICPARAMS* picParams)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandlePictureDecode(void* userData, CUVIDPICPARAMS* picParams)
 {
     VideoParser* thiz = static_cast<VideoParser*>(userData);
 
@@ -147,7 +148,7 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDecode(void* userData, CU
     return true;
 }
 
-int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDisplay(void* userData, CUVIDPARSERDISPINFO* picParams)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandlePictureDisplay(void* userData, CUVIDPARSERDISPINFO* picParams)
 {
     VideoParser* thiz = static_cast<VideoParser*>(userData);
 
diff --git a/modules/gpucodec/src/video_parser.h b/modules/gpucodec/src/video_parser.hpp
similarity index 92%
rename from modules/gpucodec/src/video_parser.h
rename to modules/gpucodec/src/video_parser.hpp
index e11b7eff6a..b4dddb3895 100644
--- a/modules/gpucodec/src/video_parser.h
+++ b/modules/gpucodec/src/video_parser.hpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,17 +41,17 @@
 //
 //M*/
 
-#ifndef __VIDEO_PARSER_H__
-#define __VIDEO_PARSER_H__
-
-#include "opencv2/core/private.gpu.hpp"
-#include "opencv2/gpucodec.hpp"
-#include "frame_queue.h"
-#include "video_decoder.h"
+#ifndef __VIDEO_PARSER_HPP__
+#define __VIDEO_PARSER_HPP__
 
 #include <nvcuvid.h>
 
-namespace cv { namespace gpu { namespace detail
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "frame_queue.hpp"
+#include "video_decoder.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
 {
 
 class VideoParser
@@ -91,4 +92,4 @@ private:
 
 }}}
 
-#endif // __VIDEO_PARSER_H__
+#endif // __VIDEO_PARSER_HPP__
diff --git a/modules/gpucodec/src/video_reader.cpp b/modules/gpucodec/src/video_reader.cpp
index dbb4bbcf2b..67e9cd1078 100644
--- a/modules/gpucodec/src/video_reader.cpp
+++ b/modules/gpucodec/src/video_reader.cpp
@@ -42,88 +42,77 @@
 
 #include "precomp.hpp"
 
+using namespace cv;
+using namespace cv::gpu;
+using namespace cv::gpucodec;
+
 #ifndef HAVE_NVCUVID
 
-class cv::gpu::VideoReader_GPU::Impl
-{
-};
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU() { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const String&) { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>&) { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::~VideoReader_GPU() { }
-void cv::gpu::VideoReader_GPU::open(const String&) { throw_no_cuda(); }
-void cv::gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>&) { throw_no_cuda(); }
-bool cv::gpu::VideoReader_GPU::isOpened() const { return false; }
-void cv::gpu::VideoReader_GPU::close() { }
-bool cv::gpu::VideoReader_GPU::read(GpuMat&) { throw_no_cuda(); return false; }
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::VideoReader_GPU::format() const { throw_no_cuda(); FormatInfo format_ = {MPEG1,Monochrome,0,0}; return format_; }
-bool cv::gpu::VideoReader_GPU::VideoSource::parseVideoData(const unsigned char*, size_t, bool) { throw_no_cuda(); return false; }
-void cv::gpu::VideoReader_GPU::dumpFormat(std::ostream&) { throw_no_cuda(); }
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const String&) { throw_no_cuda(); return Ptr<VideoReader>(); }
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const Ptr<RawVideoSource>&) { throw_no_cuda(); return Ptr<VideoReader>(); }
 
 #else // HAVE_NVCUVID
 
-class cv::gpu::VideoReader_GPU::Impl
-{
-public:
-    explicit Impl(const cv::Ptr<cv::gpu::VideoReader_GPU::VideoSource>& source);
-    ~Impl();
-
-    bool grab(cv::gpu::GpuMat& frame);
-
-    cv::gpu::VideoReader_GPU::FormatInfo format() const { return videoSource_->format(); }
-
-private:
-    cv::Ptr<cv::gpu::VideoReader_GPU::VideoSource> videoSource_;
-
-    cv::Ptr<cv::gpu::detail::FrameQueue> frameQueue_;
-    cv::Ptr<cv::gpu::detail::VideoDecoder> videoDecoder_;
-    cv::Ptr<cv::gpu::detail::VideoParser> videoParser_;
-
-    CUvideoctxlock lock_;
-
-    std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
-};
-
-cv::gpu::VideoReader_GPU::Impl::Impl(const cv::Ptr<VideoSource>& source) :
-    videoSource_(source),
-    lock_(0)
-{
-    // init context
-    GpuMat temp(1, 1, CV_8UC1);
-    temp.release();
-
-    DeviceInfo devInfo;
-    CV_Assert( devInfo.supports(FEATURE_SET_COMPUTE_11) );
-
-    CUcontext ctx;
-    cuSafeCall( cuCtxGetCurrent(&ctx) );
-    cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
-
-    frameQueue_ = new detail::FrameQueue;
-    videoDecoder_ = new detail::VideoDecoder(videoSource_->format(), lock_);
-    videoParser_ = new detail::VideoParser(videoDecoder_, frameQueue_);
-
-    videoSource_->setFrameQueue(frameQueue_);
-    videoSource_->setVideoParser(videoParser_);
-
-    videoSource_->start();
-}
-
-cv::gpu::VideoReader_GPU::Impl::~Impl()
-{
-    frameQueue_->endDecode();
-    videoSource_->stop();
-}
-
 namespace cv { namespace gpu { namespace cudev
 {
-    void loadHueCSC(float hueCSC[9]);
     void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream = 0);
 }}}
 
 namespace
 {
+    class VideoReaderImpl : public VideoReader
+    {
+    public:
+        explicit VideoReaderImpl(const Ptr<detail::VideoSource>& source);
+        ~VideoReaderImpl();
+
+        bool nextFrame(OutputArray frame);
+
+        FormatInfo format() const;
+
+    private:
+        Ptr<detail::VideoSource> videoSource_;
+
+        Ptr<detail::FrameQueue> frameQueue_;
+        Ptr<detail::VideoDecoder> videoDecoder_;
+        Ptr<detail::VideoParser> videoParser_;
+
+        CUvideoctxlock lock_;
+
+        std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
+    };
+
+    FormatInfo VideoReaderImpl::format() const
+    {
+        return videoSource_->format();
+    }
+
+    VideoReaderImpl::VideoReaderImpl(const Ptr<detail::VideoSource>& source) :
+        videoSource_(source),
+        lock_(0)
+    {
+        // init context
+        GpuMat temp(1, 1, CV_8UC1);
+        temp.release();
+
+        CUcontext ctx;
+        cuSafeCall( cuCtxGetCurrent(&ctx) );
+        cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
+
+        frameQueue_ = new detail::FrameQueue;
+        videoDecoder_ = new detail::VideoDecoder(videoSource_->format(), lock_);
+        videoParser_ = new detail::VideoParser(videoDecoder_, frameQueue_);
+
+        videoSource_->setVideoParser(videoParser_);
+        videoSource_->start();
+    }
+
+    VideoReaderImpl::~VideoReaderImpl()
+    {
+        frameQueue_->endDecode();
+        videoSource_->stop();
+    }
+
     class VideoCtxAutoLock
     {
     public:
@@ -134,259 +123,114 @@ namespace
         CUvideoctxlock m_lock;
     };
 
-    enum ColorSpace
-    {
-        ITU601 = 1,
-        ITU709 = 2
-    };
-
-    void setColorSpaceMatrix(ColorSpace CSC, float hueCSC[9], float hue)
-    {
-        float hueSin = std::sin(hue);
-        float hueCos = std::cos(hue);
-
-        if (CSC == ITU601)
-        {
-            //CCIR 601
-            hueCSC[0] = 1.1644f;
-            hueCSC[1] = hueSin * 1.5960f;
-            hueCSC[2] = hueCos * 1.5960f;
-            hueCSC[3] = 1.1644f;
-            hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
-            hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
-            hueCSC[6] = 1.1644f;
-            hueCSC[7] = hueCos *  2.0172f;
-            hueCSC[8] = hueSin * -2.0172f;
-        }
-        else if (CSC == ITU709)
-        {
-            //CCIR 709
-            hueCSC[0] = 1.0f;
-            hueCSC[1] = hueSin * 1.57480f;
-            hueCSC[2] = hueCos * 1.57480f;
-            hueCSC[3] = 1.0;
-            hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
-            hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
-            hueCSC[6] = 1.0f;
-            hueCSC[7] = hueCos *  1.85560f;
-            hueCSC[8] = hueSin * -1.85560f;
-        }
-    }
-
-    void cudaPostProcessFrame(const cv::gpu::GpuMat& decodedFrame, cv::gpu::GpuMat& interopFrame, int width, int height)
+    void cudaPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
     {
         using namespace cv::gpu::cudev;
 
-        static bool updateCSC = true;
-        static float hueColorSpaceMat[9];
-
-        // Upload the Color Space Conversion Matrices
-        if (updateCSC)
-        {
-            const ColorSpace colorSpace = ITU601;
-            const float hue = 0.0f;
-
-            // CCIR 601/709
-            setColorSpaceMatrix(colorSpace, hueColorSpaceMat, hue);
-
-            updateCSC = false;
-        }
-
         // Final Stage: NV12toARGB color space conversion
 
-        interopFrame.create(height, width, CV_8UC4);
+        _outFrame.create(height, width, CV_8UC4);
+        GpuMat outFrame = _outFrame.getGpuMat();
 
-        loadHueCSC(hueColorSpaceMat);
-
-        NV12_to_RGB(decodedFrame, interopFrame);
+        NV12_to_RGB(decodedFrame, outFrame);
     }
-}
 
-bool cv::gpu::VideoReader_GPU::Impl::grab(GpuMat& frame)
-{
-    if (videoSource_->hasError() || videoParser_->hasError())
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
-
-    if (!videoSource_->isStarted() || frameQueue_->isEndOfDecode())
-        return false;
-
-    if (frames_.empty())
+    bool VideoReaderImpl::nextFrame(OutputArray frame)
     {
-        CUVIDPARSERDISPINFO displayInfo;
+        if (videoSource_->hasError() || videoParser_->hasError())
+            CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");
 
-        for (;;)
+        if (!videoSource_->isStarted() || frameQueue_->isEndOfDecode())
+            return false;
+
+        if (frames_.empty())
         {
-            if (frameQueue_->dequeue(displayInfo))
-                break;
+            CUVIDPARSERDISPINFO displayInfo;
 
-            if (videoSource_->hasError() || videoParser_->hasError())
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
+            for (;;)
+            {
+                if (frameQueue_->dequeue(displayInfo))
+                    break;
 
-            if (frameQueue_->isEndOfDecode())
-                return false;
+                if (videoSource_->hasError() || videoParser_->hasError())
+                    CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");
 
-            // Wait a bit
-            detail::Thread::sleep(1);
+                if (frameQueue_->isEndOfDecode())
+                    return false;
+
+                // Wait a bit
+                detail::Thread::sleep(1);
+            }
+
+            bool isProgressive = displayInfo.progressive_frame != 0;
+            const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
+
+            for (int active_field = 0; active_field < num_fields; ++active_field)
+            {
+                CUVIDPROCPARAMS videoProcParams;
+                std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS));
+
+                videoProcParams.progressive_frame = displayInfo.progressive_frame;
+                videoProcParams.second_field      = active_field;
+                videoProcParams.top_field_first   = displayInfo.top_field_first;
+                videoProcParams.unpaired_field    = (num_fields == 1);
+
+                frames_.push_back(std::make_pair(displayInfo, videoProcParams));
+            }
         }
 
-        bool isProgressive = displayInfo.progressive_frame != 0;
-        const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
+        if (frames_.empty())
+            return false;
+
+        std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> frameInfo = frames_.front();
+        frames_.pop_front();
 
-        for (int active_field = 0; active_field < num_fields; ++active_field)
         {
-            CUVIDPROCPARAMS videoProcParams;
-            std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS));
+            VideoCtxAutoLock autoLock(lock_);
 
-            videoProcParams.progressive_frame = displayInfo.progressive_frame;
-            videoProcParams.second_field      = active_field;
-            videoProcParams.top_field_first   = displayInfo.top_field_first;
-            videoProcParams.unpaired_field    = (num_fields == 1);
+            // map decoded video frame to CUDA surface
+            GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
 
-            frames_.push_back(std::make_pair(displayInfo, videoProcParams));
+            // perform post processing on the CUDA surface (performs color space conversion and post processing)
+            // comment this out if we include the line of code seen above
+            cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
+
+            // unmap video frame
+            // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
+            videoDecoder_->unmapFrame(decodedFrame);
         }
+
+        // release the frame, so it can be re-used in decoder
+        if (frames_.empty())
+            frameQueue_->releaseFrame(frameInfo.first);
+
+        return true;
     }
-
-    if (frames_.empty())
-        return false;
-
-    std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> frameInfo = frames_.front();
-    frames_.pop_front();
-
-    {
-        VideoCtxAutoLock autoLock(lock_);
-
-        // map decoded video frame to CUDA surface
-        cv::gpu::GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
-
-        // perform post processing on the CUDA surface (performs colors space conversion and post processing)
-        // comment this out if we inclue the line of code seen above
-        cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
-
-        // unmap video frame
-        // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
-        videoDecoder_->unmapFrame(decodedFrame);
-    }
-
-    // release the frame, so it can be re-used in decoder
-    if (frames_.empty())
-        frameQueue_->releaseFrame(frameInfo.first);
-
-    return true;
 }
 
-////////////////////////////////////////////////////////////////////////////
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU()
-{
-}
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const String& filename)
-{
-    open(filename);
-}
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>& source)
-{
-    open(source);
-}
-
-cv::gpu::VideoReader_GPU::~VideoReader_GPU()
-{
-    close();
-}
-
-void cv::gpu::VideoReader_GPU::open(const String& filename)
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const String& filename)
 {
     CV_Assert( !filename.empty() );
 
-#ifndef __APPLE__
+    Ptr<detail::VideoSource> videoSource;
+
     try
     {
-        cv::Ptr<VideoSource> source(new detail::CuvidVideoSource(filename));
-        open(source);
+        videoSource = new detail::CuvidVideoSource(filename);
     }
-    catch (const std::runtime_error&)
-#endif
+    catch (...)
     {
-        cv::Ptr<VideoSource> source(new cv::gpu::detail::FFmpegVideoSource(filename));
-        open(source);
-    }
-}
-
-void cv::gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>& source)
-{
-    CV_Assert( !source.empty() );
-    close();
-    impl_ = new Impl(source);
-}
-
-bool cv::gpu::VideoReader_GPU::isOpened() const
-{
-    return !impl_.empty();
-}
-
-void cv::gpu::VideoReader_GPU::close()
-{
-    impl_.release();
-}
-
-bool cv::gpu::VideoReader_GPU::read(GpuMat& image)
-{
-    if (!isOpened())
-        return false;
-
-    if (!impl_->grab(image))
-    {
-        close();
-        return false;
+        Ptr<RawVideoSource> source(new detail::FFmpegVideoSource(filename));
+        videoSource = new detail::RawVideoSourceWrapper(source);
     }
 
-    return true;
+    return new VideoReaderImpl(videoSource);
 }
 
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::VideoReader_GPU::format() const
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const Ptr<RawVideoSource>& source)
 {
-    CV_Assert( isOpened() );
-    return impl_->format();
-}
-
-bool cv::gpu::VideoReader_GPU::VideoSource::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
-{
-    return videoParser_->parseVideoData(data, size, endOfStream);
-}
-
-void cv::gpu::VideoReader_GPU::dumpFormat(std::ostream& st)
-{
-    static const char* codecs[] =
-    {
-        "MPEG1",
-        "MPEG2",
-        "MPEG4",
-        "VC1",
-        "H264",
-        "JPEG",
-        "H264_SVC",
-        "H264_MVC"
-    };
-
-    static const char* chromas[] =
-    {
-        "Monochrome",
-        "YUV420",
-        "YUV422",
-        "YUV444"
-    };
-
-    FormatInfo _format = this->format();
-
-    st << "Frame Size    : " << _format.width << "x" << _format.height << std::endl;
-    st << "Codec         : " << (_format.codec <= H264_MVC ? codecs[_format.codec] : "Uncompressed YUV") << std::endl;
-    st << "Chroma Format : " << chromas[_format.chromaFormat] << std::endl;
+    Ptr<detail::VideoSource> videoSource(new detail::RawVideoSourceWrapper(source));
+    return new VideoReaderImpl(videoSource);
 }
 
 #endif // HAVE_NVCUVID
-
-template <> void cv::Ptr<cv::gpu::VideoReader_GPU::Impl>::delete_obj()
-{
-    if (obj) delete obj;
-}
diff --git a/modules/gpucodec/src/video_source.cpp b/modules/gpucodec/src/video_source.cpp
new file mode 100644
index 0000000000..ce6a1bd8c0
--- /dev/null
+++ b/modules/gpucodec/src/video_source.cpp
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_NVCUVID
+
+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
+bool cv::gpucodec::detail::VideoSource::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
+{
+    return videoParser_->parseVideoData(data, size, endOfStream); // requires a parser attached via setVideoParser() beforehand
+}
+
+cv::gpucodec::detail::RawVideoSourceWrapper::RawVideoSourceWrapper(const Ptr<RawVideoSource>& source) :
+    source_(source), stop_(true), hasError_(false) // not started and error-free until start() is called
+{
+    CV_Assert( !source_.empty() ); // a wrapper without a source would fail on first use in format()/readLoop
+}
+
+cv::gpucodec::FormatInfo cv::gpucodec::detail::RawVideoSourceWrapper::format() const
+{
+    return source_->format(); // delegate to the wrapped RawVideoSource
+}
+
+void cv::gpucodec::detail::RawVideoSourceWrapper::start()
+{
+    stop_ = false; // reset both flags before the reader thread is created
+    hasError_ = false;
+    thread_ = new Thread(readLoop, this); // readLoop casts its userData argument back to this wrapper
+}
+
+void cv::gpucodec::detail::RawVideoSourceWrapper::stop()
+{
+    stop_ = true; // polled by readLoop after each packet it has parsed
+    if (!thread_.empty()) thread_->wait(); // no reader thread exists if start() never ran or stop() already ran
+    thread_.release();
+}
+
+bool cv::gpucodec::detail::RawVideoSourceWrapper::isStarted() const
+{
+    return !stop_; // readLoop never updates stop_, so this still reports true after the reader thread exits on its own
+}
+
+bool cv::gpucodec::detail::RawVideoSourceWrapper::hasError() const
+{
+    return hasError_; // set by readLoop when fetching a packet or parsing it fails
+}
+
+void cv::gpucodec::detail::RawVideoSourceWrapper::readLoop(void* userData)
+{
+    RawVideoSourceWrapper* thiz = static_cast<RawVideoSourceWrapper*>(userData); // the wrapper handed over in start()
+
+    for (;;)
+    {
+        unsigned char* data = 0;
+        int size = 0;
+        bool endOfFile = false; // default so a source that fails without writing the flag is reported as an error
+
+        if (!thiz->source_->getNextPacket(&data, &size, &endOfFile))
+        {
+            thiz->hasError_ = !endOfFile; // running out of packets at end of file is not an error
+            break;
+        }
+
+        if (!thiz->parseVideoData(data, size))
+        {
+            thiz->hasError_ = true;
+            break;
+        }
+
+        if (thiz->stop_) // stop() was requested
+            break;
+    }
+
+    thiz->parseVideoData(0, 0, true); // always signal end-of-stream to the parser, whichever way the loop ended
+}
+
+#endif // HAVE_NVCUVID
diff --git a/modules/gpucodec/src/video_source.hpp b/modules/gpucodec/src/video_source.hpp
new file mode 100644
index 0000000000..b4d930ee09
--- /dev/null
+++ b/modules/gpucodec/src/video_source.hpp
@@ -0,0 +1,99 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __GPUCODEC_VIDEO_SOURCE_H__
+#define __GPUCODEC_VIDEO_SOURCE_H__
+
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "thread.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
+{
+
+class VideoParser;
+
+class VideoSource // abstract source of video data that forwards what it reads to a VideoParser
+{
+public:
+    VideoSource() : videoParser_(0) {} // start with no parser attached instead of an indeterminate pointer
+    virtual ~VideoSource() {}
+    virtual FormatInfo format() const = 0;
+    virtual void start() = 0;
+    virtual void stop() = 0;
+    virtual bool isStarted() const = 0;
+    virtual bool hasError() const = 0;
+
+    void setVideoParser(detail::VideoParser* videoParser) { videoParser_ = videoParser; } // stores the pointer only; no ownership is taken here
+
+protected:
+    bool parseVideoData(const uchar* data, size_t size, bool endOfStream = false); // forwards to videoParser_; for use by derived sources
+
+private:
+    detail::VideoParser* videoParser_; // null until setVideoParser() is called
+};
+
+class RawVideoSourceWrapper : public VideoSource // adapts a RawVideoSource to the internal VideoSource interface
+{
+public:
+    RawVideoSourceWrapper(const Ptr<RawVideoSource>& source); // asserts that source is not empty
+
+    FormatInfo format() const;
+    void start(); // clears the flags and creates the reader thread
+    void stop(); // sets the stop flag and waits for the reader thread
+    bool isStarted() const;
+    bool hasError() const;
+
+private:
+    Ptr<RawVideoSource> source_;
+
+    Ptr<Thread> thread_; // reader thread; created in start(), released in stop()
+    volatile bool stop_; // written by start()/stop(), polled by readLoop
+    volatile bool hasError_; // written by readLoop, read through hasError()
+
+    static void readLoop(void* userData); // thread entry point; userData is the wrapper instance
+};
+
+}}}
+
+#endif // __GPUCODEC_VIDEO_SOURCE_H__
diff --git a/modules/gpucodec/src/video_writer.cpp b/modules/gpucodec/src/video_writer.cpp
index 94100c0b8c..6ffb7c12d7 100644
--- a/modules/gpucodec/src/video_writer.cpp
+++ b/modules/gpucodec/src/video_writer.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -42,36 +43,32 @@
 
 #include "precomp.hpp"
 
+using namespace cv;
+using namespace cv::gpu;
+using namespace cv::gpucodec;
+
 #if !defined(HAVE_NVCUVID) || !defined(WIN32)
 
-class cv::gpu::VideoWriter_GPU::Impl
-{
-};
+cv::gpucodec::EncoderParams::EncoderParams() { throw_no_cuda(); }
+cv::gpucodec::EncoderParams::EncoderParams(const String&) { throw_no_cuda(); }
+void cv::gpucodec::EncoderParams::load(const String&) { throw_no_cuda(); }
+void cv::gpucodec::EncoderParams::save(const String&) const { throw_no_cuda(); }
 
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU() { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const String&, cv::Size, double, SurfaceFormat) { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const String&, cv::Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>&, cv::Size, double, SurfaceFormat) { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>&, cv::Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::~VideoWriter_GPU() {}
-void cv::gpu::VideoWriter_GPU::open(const String&, cv::Size, double, SurfaceFormat) { throw_no_cuda(); }
-void cv::gpu::VideoWriter_GPU::open(const String&, cv::Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); }
-void cv::gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>&, cv::Size, double, SurfaceFormat) { throw_no_cuda(); }
-void cv::gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>&, cv::Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); }
-bool cv::gpu::VideoWriter_GPU::isOpened() const { return false; }
-void cv::gpu::VideoWriter_GPU::close() {}
-void cv::gpu::VideoWriter_GPU::write(const cv::gpu::GpuMat&, bool) { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::EncoderParams cv::gpu::VideoWriter_GPU::getParams() const { EncoderParams params; throw_no_cuda(); return params; }
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const String&, Size, double, SurfaceFormat) { throw_no_cuda(); return Ptr<VideoWriter>(); }
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const String&, Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); return Ptr<VideoWriter>(); }
 
-cv::gpu::VideoWriter_GPU::EncoderParams::EncoderParams() { throw_no_cuda(); }
-cv::gpu::VideoWriter_GPU::EncoderParams::EncoderParams(const String&) { throw_no_cuda(); }
-void cv::gpu::VideoWriter_GPU::EncoderParams::load(const String&) { throw_no_cuda(); }
-void cv::gpu::VideoWriter_GPU::EncoderParams::save(const String&) const { throw_no_cuda(); }
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const Ptr<EncoderCallBack>&, Size, double, SurfaceFormat) { throw_no_cuda(); return Ptr<VideoWriter>(); }
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const Ptr<EncoderCallBack>&, Size, double, const EncoderParams&, SurfaceFormat) { throw_no_cuda(); return Ptr<VideoWriter>(); }
 
 #else // !defined HAVE_CUDA || !defined WIN32
 
+namespace cv { namespace gpu { namespace cudev
+{
+    void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream = 0);
+}}}
+
 ///////////////////////////////////////////////////////////////////////////
-// VideoWriter_GPU::Impl
+// VideoWriterImpl
 
 namespace
 {
@@ -84,7 +81,7 @@ namespace
 
             err = NVGetHWEncodeCaps();
             if (err)
-                CV_Error(cv::Error::GpuNotSupported, "No CUDA capability present");
+                CV_Error(Error::GpuNotSupported, "No CUDA capability present");
 
             // Create the Encoder API Interface
             err = NVCreateEncoder(&encoder_);
@@ -108,405 +105,395 @@ namespace
 
     enum CodecType
     {
-        MPEG1, //not supported yet
-        MPEG2, //not supported yet
-        MPEG4, //not supported yet
+        MPEG1, // not supported yet
+        MPEG2, // not supported yet
+        MPEG4, // not supported yet
         H264
     };
-}
 
-class cv::gpu::VideoWriter_GPU::Impl
-{
-public:
-    Impl(const cv::Ptr<EncoderCallBack>& callback, cv::Size frameSize, double fps, SurfaceFormat format, CodecType codec = H264);
-    Impl(const cv::Ptr<EncoderCallBack>& callback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format, CodecType codec = H264);
-
-    void write(const cv::gpu::GpuMat& image, bool lastFrame);
-
-    EncoderParams getParams() const;
-
-private:
-    Impl(const Impl&);
-    Impl& operator=(const Impl&);
-
-    void initEncoder(double fps);
-    void setEncodeParams(const EncoderParams& params);
-    void initGpuMemory();
-    void initCallBacks();
-    void createHWEncoder();
-
-    cv::Ptr<EncoderCallBack> callback_;
-    cv::Size frameSize_;
-
-    CodecType codec_;
-    SurfaceFormat inputFormat_;
-    NVVE_SurfaceFormat surfaceFormat_;
-
-    NVEncoderWrapper encoder_;
-
-    cv::gpu::GpuMat videoFrame_;
-    CUvideoctxlock cuCtxLock_;
-
-    // CallBacks
-
-    static unsigned char* NVENCAPI HandleAcquireBitStream(int* pBufferSize, void* pUserdata);
-    static void NVENCAPI HandleReleaseBitStream(int nBytesInBuffer, unsigned char* cb, void* pUserdata);
-    static void NVENCAPI HandleOnBeginFrame(const NVVE_BeginFrameInfo* pbfi, void* pUserdata);
-    static void NVENCAPI HandleOnEndFrame(const NVVE_EndFrameInfo* pefi, void* pUserdata);
-};
-
-cv::gpu::VideoWriter_GPU::Impl::Impl(const cv::Ptr<EncoderCallBack>& callback, cv::Size frameSize, double fps, SurfaceFormat format, CodecType codec) :
-    callback_(callback),
-    frameSize_(frameSize),
-    codec_(codec),
-    inputFormat_(format),
-    cuCtxLock_(0)
-{
-    surfaceFormat_ = inputFormat_ == SF_BGR ? YV12 : static_cast<NVVE_SurfaceFormat>(inputFormat_);
-
-    initEncoder(fps);
-
-    initGpuMemory();
-
-    initCallBacks();
-
-    createHWEncoder();
-}
-
-cv::gpu::VideoWriter_GPU::Impl::Impl(const cv::Ptr<EncoderCallBack>& callback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format, CodecType codec) :
-    callback_(callback),
-    frameSize_(frameSize),
-    codec_(codec),
-    inputFormat_(format),
-    cuCtxLock_(0)
-{
-    surfaceFormat_ = inputFormat_ == SF_BGR ? YV12 : static_cast<NVVE_SurfaceFormat>(inputFormat_);
-
-    initEncoder(fps);
-
-    setEncodeParams(params);
-
-    initGpuMemory();
-
-    initCallBacks();
-
-    createHWEncoder();
-}
-
-void cv::gpu::VideoWriter_GPU::Impl::initEncoder(double fps)
-{
-    int err;
-
-    // Set codec
-
-    static const unsigned long codecs_id[] =
+    class VideoWriterImpl : public VideoWriter
     {
-        NV_CODEC_TYPE_MPEG1, NV_CODEC_TYPE_MPEG2, NV_CODEC_TYPE_MPEG4, NV_CODEC_TYPE_H264, NV_CODEC_TYPE_VC1
-    };
-    err = NVSetCodec(encoder_, codecs_id[codec_]);
-    if (err)
-        CV_Error(cv::Error::StsNotImplemented, "Codec format is not supported");
+    public:
+        VideoWriterImpl(const Ptr<EncoderCallBack>& callback, Size frameSize, double fps, SurfaceFormat format, CodecType codec = H264);
+        VideoWriterImpl(const Ptr<EncoderCallBack>& callback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format, CodecType codec = H264);
 
-    // Set default params
+        void write(InputArray frame, bool lastFrame = false);
 
-    err = NVSetDefaultParam(encoder_);
-    CV_Assert( err == 0 );
+        EncoderParams getEncoderParams() const;
 
-    // Set some common params
+    private:
+        void initEncoder(double fps);
+        void setEncodeParams(const EncoderParams& params);
+        void initGpuMemory();
+        void initCallBacks();
+        void createHWEncoder();
 
-    int inputSize[] = { frameSize_.width, frameSize_.height };
-    err = NVSetParamValue(encoder_, NVVE_IN_SIZE, &inputSize);
-    CV_Assert( err == 0 );
-    err = NVSetParamValue(encoder_, NVVE_OUT_SIZE, &inputSize);
-    CV_Assert( err == 0 );
+        Ptr<EncoderCallBack> callback_;
+        Size frameSize_;
 
-    int aspectRatio[] = { frameSize_.width, frameSize_.height, ASPECT_RATIO_DAR };
-    err = NVSetParamValue(encoder_, NVVE_ASPECT_RATIO, &aspectRatio);
-    CV_Assert( err == 0 );
+        CodecType codec_;
+        SurfaceFormat inputFormat_;
+        NVVE_SurfaceFormat surfaceFormat_;
 
-    // FPS
+        NVEncoderWrapper encoder_;
 
-    int frame_rate = static_cast<int>(fps + 0.5);
-    int frame_rate_base = 1;
-    while (fabs(static_cast<double>(frame_rate) / frame_rate_base) - fps > 0.001)
-    {
-        frame_rate_base *= 10;
-        frame_rate = static_cast<int>(fps*frame_rate_base + 0.5);
-    }
-    int FrameRate[] = { frame_rate, frame_rate_base };
-    err = NVSetParamValue(encoder_, NVVE_FRAME_RATE, &FrameRate);
-    CV_Assert( err == 0 );
+        GpuMat videoFrame_;
+        CUvideoctxlock cuCtxLock_;
 
-    // Select device for encoding
+        // CallBacks
 
-    int gpuID = cv::gpu::getDevice();
-    err = NVSetParamValue(encoder_, NVVE_FORCE_GPU_SELECTION, &gpuID);
-    CV_Assert( err == 0 );
-}
-
-void cv::gpu::VideoWriter_GPU::Impl::setEncodeParams(const EncoderParams& params)
-{
-    int err;
-
-    int P_Interval = params.P_Interval;
-    err = NVSetParamValue(encoder_, NVVE_P_INTERVAL, &P_Interval);
-    CV_Assert( err == 0 );
-
-    int IDR_Period = params.IDR_Period;
-    err = NVSetParamValue(encoder_, NVVE_IDR_PERIOD, &IDR_Period);
-    CV_Assert( err == 0 );
-
-    int DynamicGOP = params.DynamicGOP;
-    err = NVSetParamValue(encoder_, NVVE_DYNAMIC_GOP, &DynamicGOP);
-    CV_Assert( err == 0 );
-
-    NVVE_RateCtrlType RCType = static_cast<NVVE_RateCtrlType>(params.RCType);
-    err = NVSetParamValue(encoder_, NVVE_RC_TYPE, &RCType);
-    CV_Assert( err == 0 );
-
-    int AvgBitrate = params.AvgBitrate;
-    err = NVSetParamValue(encoder_, NVVE_AVG_BITRATE, &AvgBitrate);
-    CV_Assert( err == 0 );
-
-    int PeakBitrate = params.PeakBitrate;
-    err = NVSetParamValue(encoder_, NVVE_PEAK_BITRATE, &PeakBitrate);
-    CV_Assert( err == 0 );
-
-    int QP_Level_Intra = params.QP_Level_Intra;
-    err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTRA, &QP_Level_Intra);
-    CV_Assert( err == 0 );
-
-    int QP_Level_InterP = params.QP_Level_InterP;
-    err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTER_P, &QP_Level_InterP);
-    CV_Assert( err == 0 );
-
-    int QP_Level_InterB = params.QP_Level_InterB;
-    err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTER_B, &QP_Level_InterB);
-    CV_Assert( err == 0 );
-
-    int DeblockMode = params.DeblockMode;
-    err = NVSetParamValue(encoder_, NVVE_DEBLOCK_MODE, &DeblockMode);
-    CV_Assert( err == 0 );
-
-    int ProfileLevel = params.ProfileLevel;
-    err = NVSetParamValue(encoder_, NVVE_PROFILE_LEVEL, &ProfileLevel);
-    CV_Assert( err == 0 );
-
-    int ForceIntra = params.ForceIntra;
-    err = NVSetParamValue(encoder_, NVVE_FORCE_INTRA, &ForceIntra);
-    CV_Assert( err == 0 );
-
-    int ForceIDR = params.ForceIDR;
-    err = NVSetParamValue(encoder_, NVVE_FORCE_IDR, &ForceIDR);
-    CV_Assert( err == 0 );
-
-    int ClearStat = params.ClearStat;
-    err = NVSetParamValue(encoder_, NVVE_CLEAR_STAT, &ClearStat);
-    CV_Assert( err == 0 );
-
-    NVVE_DI_MODE DIMode = static_cast<NVVE_DI_MODE>(params.DIMode);
-    err = NVSetParamValue(encoder_, NVVE_SET_DEINTERLACE, &DIMode);
-    CV_Assert( err == 0 );
-
-    if (params.Presets != -1)
-    {
-        NVVE_PRESETS_TARGET Presets = static_cast<NVVE_PRESETS_TARGET>(params.Presets);
-        err = NVSetParamValue(encoder_, NVVE_PRESETS, &Presets);
-        CV_Assert ( err == 0 );
-    }
-
-    int DisableCabac = params.DisableCabac;
-    err = NVSetParamValue(encoder_, NVVE_DISABLE_CABAC, &DisableCabac);
-    CV_Assert ( err == 0 );
-
-    int NaluFramingType = params.NaluFramingType;
-    err = NVSetParamValue(encoder_, NVVE_CONFIGURE_NALU_FRAMING_TYPE, &NaluFramingType);
-    CV_Assert ( err == 0 );
-
-    int DisableSPSPPS = params.DisableSPSPPS;
-    err = NVSetParamValue(encoder_, NVVE_DISABLE_SPS_PPS, &DisableSPSPPS);
-    CV_Assert ( err == 0 );
-}
-
-cv::gpu::VideoWriter_GPU::EncoderParams cv::gpu::VideoWriter_GPU::Impl::getParams() const
-{
-    int err;
-
-    EncoderParams params;
-
-    int P_Interval;
-    err = NVGetParamValue(encoder_, NVVE_P_INTERVAL, &P_Interval);
-    CV_Assert( err == 0 );
-    params.P_Interval = P_Interval;
-
-    int IDR_Period;
-    err = NVGetParamValue(encoder_, NVVE_IDR_PERIOD, &IDR_Period);
-    CV_Assert( err == 0 );
-    params.IDR_Period = IDR_Period;
-
-    int DynamicGOP;
-    err = NVGetParamValue(encoder_, NVVE_DYNAMIC_GOP, &DynamicGOP);
-    CV_Assert( err == 0 );
-    params.DynamicGOP = DynamicGOP;
-
-    NVVE_RateCtrlType RCType;
-    err = NVGetParamValue(encoder_, NVVE_RC_TYPE, &RCType);
-    CV_Assert( err == 0 );
-    params.RCType = RCType;
-
-    int AvgBitrate;
-    err = NVGetParamValue(encoder_, NVVE_AVG_BITRATE, &AvgBitrate);
-    CV_Assert( err == 0 );
-    params.AvgBitrate = AvgBitrate;
-
-    int PeakBitrate;
-    err = NVGetParamValue(encoder_, NVVE_PEAK_BITRATE, &PeakBitrate);
-    CV_Assert( err == 0 );
-    params.PeakBitrate = PeakBitrate;
-
-    int QP_Level_Intra;
-    err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTRA, &QP_Level_Intra);
-    CV_Assert( err == 0 );
-    params.QP_Level_Intra = QP_Level_Intra;
-
-    int QP_Level_InterP;
-    err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTER_P, &QP_Level_InterP);
-    CV_Assert( err == 0 );
-    params.QP_Level_InterP = QP_Level_InterP;
-
-    int QP_Level_InterB;
-    err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTER_B, &QP_Level_InterB);
-    CV_Assert( err == 0 );
-    params.QP_Level_InterB = QP_Level_InterB;
-
-    int DeblockMode;
-    err = NVGetParamValue(encoder_, NVVE_DEBLOCK_MODE, &DeblockMode);
-    CV_Assert( err == 0 );
-    params.DeblockMode = DeblockMode;
-
-    int ProfileLevel;
-    err = NVGetParamValue(encoder_, NVVE_PROFILE_LEVEL, &ProfileLevel);
-    CV_Assert( err == 0 );
-    params.ProfileLevel = ProfileLevel;
-
-    int ForceIntra;
-    err = NVGetParamValue(encoder_, NVVE_FORCE_INTRA, &ForceIntra);
-    CV_Assert( err == 0 );
-    params.ForceIntra = ForceIntra;
-
-    int ForceIDR;
-    err = NVGetParamValue(encoder_, NVVE_FORCE_IDR, &ForceIDR);
-    CV_Assert( err == 0 );
-    params.ForceIDR = ForceIDR;
-
-    int ClearStat;
-    err = NVGetParamValue(encoder_, NVVE_CLEAR_STAT, &ClearStat);
-    CV_Assert( err == 0 );
-    params.ClearStat = ClearStat;
-
-    NVVE_DI_MODE DIMode;
-    err = NVGetParamValue(encoder_, NVVE_SET_DEINTERLACE, &DIMode);
-    CV_Assert( err == 0 );
-    params.DIMode = DIMode;
-
-    params.Presets = -1;
-
-    int DisableCabac;
-    err = NVGetParamValue(encoder_, NVVE_DISABLE_CABAC, &DisableCabac);
-    CV_Assert ( err == 0 );
-    params.DisableCabac = DisableCabac;
-
-    int NaluFramingType;
-    err = NVGetParamValue(encoder_, NVVE_CONFIGURE_NALU_FRAMING_TYPE, &NaluFramingType);
-    CV_Assert ( err == 0 );
-    params.NaluFramingType = NaluFramingType;
-
-    int DisableSPSPPS;
-    err = NVGetParamValue(encoder_, NVVE_DISABLE_SPS_PPS, &DisableSPSPPS);
-    CV_Assert ( err == 0 );
-    params.DisableSPSPPS = DisableSPSPPS;
-
-    return params;
-}
-
-void cv::gpu::VideoWriter_GPU::Impl::initGpuMemory()
-{
-    int err;
-    CUresult cuRes;
-
-    // initialize context
-    cv::gpu::GpuMat temp(1, 1, CV_8U);
-    temp.release();
-
-    static const int bpp[] =
-    {
-        16, // UYVY, 4:2:2
-        16, // YUY2, 4:2:2
-        12, // YV12, 4:2:0
-        12, // NV12, 4:2:0
-        12, // IYUV, 4:2:0
+        static unsigned char* NVENCAPI HandleAcquireBitStream(int* pBufferSize, void* pUserdata);
+        static void NVENCAPI HandleReleaseBitStream(int nBytesInBuffer, unsigned char* cb, void* pUserdata);
+        static void NVENCAPI HandleOnBeginFrame(const NVVE_BeginFrameInfo* pbfi, void* pUserdata);
+        static void NVENCAPI HandleOnEndFrame(const NVVE_EndFrameInfo* pefi, void* pUserdata);
     };
 
-    CUcontext cuContext;
-    cuRes = cuCtxGetCurrent(&cuContext);
-    CV_Assert( cuRes == CUDA_SUCCESS );
+    VideoWriterImpl::VideoWriterImpl(const Ptr<EncoderCallBack>& callback, Size frameSize, double fps, SurfaceFormat format, CodecType codec) :
+        callback_(callback),
+        frameSize_(frameSize),
+        codec_(codec),
+        inputFormat_(format),
+        cuCtxLock_(0)
+    {
+        surfaceFormat_ = (inputFormat_ == SF_BGR ? YV12 : static_cast<NVVE_SurfaceFormat>(inputFormat_));
 
-    // Allocate the CUDA memory Pitched Surface
-    if (surfaceFormat_ == UYVY || surfaceFormat_ == YUY2)
-        videoFrame_.create(frameSize_.height, (frameSize_.width * bpp[surfaceFormat_]) / 8, CV_8UC1);
-    else
-        videoFrame_.create((frameSize_.height * bpp[surfaceFormat_]) / 8, frameSize_.width, CV_8UC1);
+        initEncoder(fps);
 
-    // Create the Video Context Lock (used for synchronization)
-    cuRes = cuvidCtxLockCreate(&cuCtxLock_, cuContext);
-    CV_Assert( cuRes == CUDA_SUCCESS );
+        initGpuMemory();
 
-    // If we are using GPU Device Memory with NVCUVENC, it is necessary to create a
-    // CUDA Context with a Context Lock cuvidCtxLock.  The Context Lock needs to be passed to NVCUVENC
+        initCallBacks();
 
-    int iUseDeviceMem = 1;
-    err = NVSetParamValue(encoder_, NVVE_DEVICE_MEMORY_INPUT, &iUseDeviceMem);
-    CV_Assert ( err == 0 );
+        createHWEncoder();
+    }
 
-    err = NVSetParamValue(encoder_, NVVE_DEVICE_CTX_LOCK, &cuCtxLock_);
-    CV_Assert ( err == 0 );
-}
+    VideoWriterImpl::VideoWriterImpl(const Ptr<EncoderCallBack>& callback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format, CodecType codec) :
+        callback_(callback),
+        frameSize_(frameSize),
+        codec_(codec),
+        inputFormat_(format),
+        cuCtxLock_(0)
+    {
+        surfaceFormat_ = (inputFormat_ == SF_BGR ? YV12 : static_cast<NVVE_SurfaceFormat>(inputFormat_));
 
-void cv::gpu::VideoWriter_GPU::Impl::initCallBacks()
-{
-    NVVE_CallbackParams cb;
-    memset(&cb, 0, sizeof(NVVE_CallbackParams));
+        initEncoder(fps);
 
-    cb.pfnacquirebitstream = HandleAcquireBitStream;
-    cb.pfnonbeginframe     = HandleOnBeginFrame;
-    cb.pfnonendframe       = HandleOnEndFrame;
-    cb.pfnreleasebitstream = HandleReleaseBitStream;
+        setEncodeParams(params);
 
-    NVRegisterCB(encoder_, cb, this);
-}
+        initGpuMemory();
 
-void cv::gpu::VideoWriter_GPU::Impl::createHWEncoder()
-{
-    int err;
+        initCallBacks();
 
-    // Create the NVIDIA HW resources for Encoding on NVIDIA hardware
-    err = NVCreateHWEncoder(encoder_);
-    CV_Assert( err == 0 );
-}
+        createHWEncoder();
+    }
+
+    void VideoWriterImpl::initEncoder(double fps)
+    {
+        // Configures encoder_ for codec_/frameSize_: codec, library defaults, sizes, aspect ratio, frame rate, GPU.
+        int err;
+
+        // Set codec (codec_ indexes this table; NOTE(review): order presumably mirrors the CodecType enum - confirm)
+        static const unsigned long codecs_id[] =
+        {
+            NV_CODEC_TYPE_MPEG1, NV_CODEC_TYPE_MPEG2, NV_CODEC_TYPE_MPEG4, NV_CODEC_TYPE_H264, NV_CODEC_TYPE_VC1
+        };
+        err = NVSetCodec(encoder_, codecs_id[codec_]);
+        if (err)
+            CV_Error(Error::StsNotImplemented, "Codec format is not supported");
+
+        // Set default params
+
+        err = NVSetDefaultParam(encoder_);
+        CV_Assert( err == 0 );
+
+        // Set some common params
+
+        int inputSize[] = { frameSize_.width, frameSize_.height };
+        err = NVSetParamValue(encoder_, NVVE_IN_SIZE, &inputSize);
+        CV_Assert( err == 0 );
+        err = NVSetParamValue(encoder_, NVVE_OUT_SIZE, &inputSize);
+        CV_Assert( err == 0 );
+
+        int aspectRatio[] = { frameSize_.width, frameSize_.height, ASPECT_RATIO_DAR };
+        err = NVSetParamValue(encoder_, NVVE_ASPECT_RATIO, &aspectRatio);
+        CV_Assert( err == 0 );
+
+        // FPS: express fps as the rational frame_rate / frame_rate_base, multiplying the base by ten until the
+        // rational is within 0.001 of fps. The error is taken on both sides (fabs of the difference, not of the ratio).
+        int frame_rate = static_cast<int>(fps + 0.5);
+        int frame_rate_base = 1;
+        while (fabs(static_cast<double>(frame_rate) / frame_rate_base - fps) > 0.001)
+        {
+            frame_rate_base *= 10;
+            frame_rate = static_cast<int>(fps*frame_rate_base + 0.5);
+        }
+        int FrameRate[] = { frame_rate, frame_rate_base };
+        err = NVSetParamValue(encoder_, NVVE_FRAME_RATE, &FrameRate);
+        CV_Assert( err == 0 );
+
+        // Select device for encoding
+
+        int gpuID = getDevice();
+        err = NVSetParamValue(encoder_, NVVE_FORCE_GPU_SELECTION, &gpuID);
+        CV_Assert( err == 0 );
+    }
+
+    void VideoWriterImpl::setEncodeParams(const EncoderParams& params)
+    {
+        // Sends every EncoderParams field to encoder_ with NVSetParamValue; each status is asserted to be 0.
+        int err;
+        int P_Interval = params.P_Interval;
+        err = NVSetParamValue(encoder_, NVVE_P_INTERVAL, &P_Interval);
+        CV_Assert( err == 0 );
+
+        int IDR_Period = params.IDR_Period;
+        err = NVSetParamValue(encoder_, NVVE_IDR_PERIOD, &IDR_Period);
+        CV_Assert( err == 0 );
+
+        int DynamicGOP = params.DynamicGOP;
+        err = NVSetParamValue(encoder_, NVVE_DYNAMIC_GOP, &DynamicGOP);
+        CV_Assert( err == 0 );
+
+        NVVE_RateCtrlType RCType = static_cast<NVVE_RateCtrlType>(params.RCType);
+        err = NVSetParamValue(encoder_, NVVE_RC_TYPE, &RCType);
+        CV_Assert( err == 0 );
+
+        int AvgBitrate = params.AvgBitrate;
+        err = NVSetParamValue(encoder_, NVVE_AVG_BITRATE, &AvgBitrate);
+        CV_Assert( err == 0 );
+
+        int PeakBitrate = params.PeakBitrate;
+        err = NVSetParamValue(encoder_, NVVE_PEAK_BITRATE, &PeakBitrate);
+        CV_Assert( err == 0 );
+
+        int QP_Level_Intra = params.QP_Level_Intra;
+        err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTRA, &QP_Level_Intra);
+        CV_Assert( err == 0 );
+
+        int QP_Level_InterP = params.QP_Level_InterP;
+        err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTER_P, &QP_Level_InterP);
+        CV_Assert( err == 0 );
+
+        int QP_Level_InterB = params.QP_Level_InterB;
+        err = NVSetParamValue(encoder_, NVVE_QP_LEVEL_INTER_B, &QP_Level_InterB);
+        CV_Assert( err == 0 );
+
+        int DeblockMode = params.DeblockMode;
+        err = NVSetParamValue(encoder_, NVVE_DEBLOCK_MODE, &DeblockMode);
+        CV_Assert( err == 0 );
+
+        int ProfileLevel = params.ProfileLevel;
+        err = NVSetParamValue(encoder_, NVVE_PROFILE_LEVEL, &ProfileLevel);
+        CV_Assert( err == 0 );
+
+        int ForceIntra = params.ForceIntra;
+        err = NVSetParamValue(encoder_, NVVE_FORCE_INTRA, &ForceIntra);
+        CV_Assert( err == 0 );
+
+        int ForceIDR = params.ForceIDR;
+        err = NVSetParamValue(encoder_, NVVE_FORCE_IDR, &ForceIDR);
+        CV_Assert( err == 0 );
+
+        int ClearStat = params.ClearStat;
+        err = NVSetParamValue(encoder_, NVVE_CLEAR_STAT, &ClearStat);
+        CV_Assert( err == 0 );
+
+        NVVE_DI_MODE DIMode = static_cast<NVVE_DI_MODE>(params.DIMode);
+        err = NVSetParamValue(encoder_, NVVE_SET_DEINTERLACE, &DIMode);
+        CV_Assert( err == 0 );
+        // NVVE_PRESETS is only sent when params.Presets is not -1; with -1 the preset is left untouched.
+        if (params.Presets != -1)
+        {
+            NVVE_PRESETS_TARGET Presets = static_cast<NVVE_PRESETS_TARGET>(params.Presets);
+            err = NVSetParamValue(encoder_, NVVE_PRESETS, &Presets);
+            CV_Assert( err == 0 );
+        }
+
+        int DisableCabac = params.DisableCabac;
+        err = NVSetParamValue(encoder_, NVVE_DISABLE_CABAC, &DisableCabac);
+        CV_Assert( err == 0 );
+
+        int NaluFramingType = params.NaluFramingType;
+        err = NVSetParamValue(encoder_, NVVE_CONFIGURE_NALU_FRAMING_TYPE, &NaluFramingType);
+        CV_Assert( err == 0 );
+
+        int DisableSPSPPS = params.DisableSPSPPS;
+        err = NVSetParamValue(encoder_, NVVE_DISABLE_SPS_PPS, &DisableSPSPPS);
+        CV_Assert( err == 0 );
+    }
+
+    EncoderParams VideoWriterImpl::getEncoderParams() const
+    {
+        // Reads each setting back from encoder_ with NVGetParamValue (status asserted to be 0) and returns them.
+        int err;
+        EncoderParams params;
+
+        int P_Interval;
+        err = NVGetParamValue(encoder_, NVVE_P_INTERVAL, &P_Interval);
+        CV_Assert( err == 0 );
+        params.P_Interval = P_Interval;
+
+        int IDR_Period;
+        err = NVGetParamValue(encoder_, NVVE_IDR_PERIOD, &IDR_Period);
+        CV_Assert( err == 0 );
+        params.IDR_Period = IDR_Period;
+
+        int DynamicGOP;
+        err = NVGetParamValue(encoder_, NVVE_DYNAMIC_GOP, &DynamicGOP);
+        CV_Assert( err == 0 );
+        params.DynamicGOP = DynamicGOP;
+
+        NVVE_RateCtrlType RCType;
+        err = NVGetParamValue(encoder_, NVVE_RC_TYPE, &RCType);
+        CV_Assert( err == 0 );
+        params.RCType = RCType;
+
+        int AvgBitrate;
+        err = NVGetParamValue(encoder_, NVVE_AVG_BITRATE, &AvgBitrate);
+        CV_Assert( err == 0 );
+        params.AvgBitrate = AvgBitrate;
+
+        int PeakBitrate;
+        err = NVGetParamValue(encoder_, NVVE_PEAK_BITRATE, &PeakBitrate);
+        CV_Assert( err == 0 );
+        params.PeakBitrate = PeakBitrate;
+
+        int QP_Level_Intra;
+        err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTRA, &QP_Level_Intra);
+        CV_Assert( err == 0 );
+        params.QP_Level_Intra = QP_Level_Intra;
+
+        int QP_Level_InterP;
+        err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTER_P, &QP_Level_InterP);
+        CV_Assert( err == 0 );
+        params.QP_Level_InterP = QP_Level_InterP;
+
+        int QP_Level_InterB;
+        err = NVGetParamValue(encoder_, NVVE_QP_LEVEL_INTER_B, &QP_Level_InterB);
+        CV_Assert( err == 0 );
+        params.QP_Level_InterB = QP_Level_InterB;
+
+        int DeblockMode;
+        err = NVGetParamValue(encoder_, NVVE_DEBLOCK_MODE, &DeblockMode);
+        CV_Assert( err == 0 );
+        params.DeblockMode = DeblockMode;
+
+        int ProfileLevel;
+        err = NVGetParamValue(encoder_, NVVE_PROFILE_LEVEL, &ProfileLevel);
+        CV_Assert( err == 0 );
+        params.ProfileLevel = ProfileLevel;
+
+        int ForceIntra;
+        err = NVGetParamValue(encoder_, NVVE_FORCE_INTRA, &ForceIntra);
+        CV_Assert( err == 0 );
+        params.ForceIntra = ForceIntra;
+
+        int ForceIDR;
+        err = NVGetParamValue(encoder_, NVVE_FORCE_IDR, &ForceIDR);
+        CV_Assert( err == 0 );
+        params.ForceIDR = ForceIDR;
+
+        int ClearStat;
+        err = NVGetParamValue(encoder_, NVVE_CLEAR_STAT, &ClearStat);
+        CV_Assert( err == 0 );
+        params.ClearStat = ClearStat;
+
+        NVVE_DI_MODE DIMode;
+        err = NVGetParamValue(encoder_, NVVE_SET_DEINTERLACE, &DIMode);
+        CV_Assert( err == 0 );
+        params.DIMode = DIMode;
+        // Presets is not read back from the encoder; it is always reported as -1.
+        params.Presets = -1;
+
+        int DisableCabac;
+        err = NVGetParamValue(encoder_, NVVE_DISABLE_CABAC, &DisableCabac);
+        CV_Assert( err == 0 );
+        params.DisableCabac = DisableCabac;
+
+        int NaluFramingType;
+        err = NVGetParamValue(encoder_, NVVE_CONFIGURE_NALU_FRAMING_TYPE, &NaluFramingType);
+        CV_Assert( err == 0 );
+        params.NaluFramingType = NaluFramingType;
+
+        int DisableSPSPPS;
+        err = NVGetParamValue(encoder_, NVVE_DISABLE_SPS_PPS, &DisableSPSPPS);
+        CV_Assert( err == 0 );
+        params.DisableSPSPPS = DisableSPSPPS;
+
+        return params;
+    }
+
+    void VideoWriterImpl::initGpuMemory()
+    {
+        // Allocates videoFrame_, creates the context lock cuCtxLock_ and switches the encoder to device-memory input.
+        int err;
+        // initialize context
+        GpuMat temp(1, 1, CV_8U);
+        temp.release();
+        // Bits per pixel for each surface format; indexed by surfaceFormat_ below.
+        static const int bpp[] =
+        {
+            16, // UYVY, 4:2:2
+            16, // YUY2, 4:2:2
+            12, // YV12, 4:2:0
+            12, // NV12, 4:2:0
+            12, // IYUV, 4:2:0
+        };
+
+        CUcontext cuContext;
+        cuSafeCall( cuCtxGetCurrent(&cuContext) );
+
+        // Allocate the CUDA memory Pitched Surface: UYVY/YUY2 get wider rows, the other formats get extra rows
+        if (surfaceFormat_ == UYVY || surfaceFormat_ == YUY2)
+            videoFrame_.create(frameSize_.height, (frameSize_.width * bpp[surfaceFormat_]) / 8, CV_8UC1);
+        else
+            videoFrame_.create((frameSize_.height * bpp[surfaceFormat_]) / 8, frameSize_.width, CV_8UC1);
+
+        // Create the Video Context Lock (used for synchronization)
+        cuSafeCall( cuvidCtxLockCreate(&cuCtxLock_, cuContext) );
+
+        // If we are using GPU Device Memory with NVCUVENC, it is necessary to create a
+        // CUDA Context with a Context Lock cuvidCtxLock.  The Context Lock needs to be passed to NVCUVENC
+
+        int iUseDeviceMem = 1;
+        err = NVSetParamValue(encoder_, NVVE_DEVICE_MEMORY_INPUT, &iUseDeviceMem);
+        CV_Assert( err == 0 );
+
+        err = NVSetParamValue(encoder_, NVVE_DEVICE_CTX_LOCK, &cuCtxLock_);
+        CV_Assert( err == 0 );
+    }
+
+    void VideoWriterImpl::initCallBacks()
+    {
+        NVVE_CallbackParams cb;
+        memset(&cb, 0, sizeof(NVVE_CallbackParams));
+        // Point every encoder callback at the matching static Handle* member.
+        cb.pfnacquirebitstream = HandleAcquireBitStream;
+        cb.pfnonbeginframe     = HandleOnBeginFrame;
+        cb.pfnonendframe       = HandleOnEndFrame;
+        cb.pfnreleasebitstream = HandleReleaseBitStream;
+        // 'this' is passed as the user-data argument; the Handle* members cast pUserdata back to VideoWriterImpl*.
+        NVRegisterCB(encoder_, cb, this);
+    }
+
+    void VideoWriterImpl::createHWEncoder()
+    {
+        // Last construction step: both constructors call this after initCallBacks(); a non-zero status trips CV_Assert.
+        int err;
+        // Create the NVIDIA HW resources for Encoding on NVIDIA hardware
+        err = NVCreateHWEncoder(encoder_);
+        CV_Assert( err == 0 );
+    }
 
-namespace
-{
     // UYVY/YUY2 are both 4:2:2 formats (16bpc)
     // Luma, U, V are interleaved, chroma is subsampled (w/2,h)
-    void copyUYVYorYUY2Frame(cv::Size frameSize, const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
+    void copyUYVYorYUY2Frame(Size frameSize, const GpuMat& src, GpuMat& dst)
     {
-        CUresult res;
-
         // Source is YUVY/YUY2 4:2:2, the YUV data in a packed and interleaved
 
         // YUV Copy setup
         CUDA_MEMCPY2D stCopyYUV422;
-        memset((void*)&stCopyYUV422, 0, sizeof(stCopyYUV422));
+        memset(&stCopyYUV422, 0, sizeof(CUDA_MEMCPY2D));
+
         stCopyYUV422.srcXInBytes          = 0;
         stCopyYUV422.srcY                 = 0;
         stCopyYUV422.srcMemoryType        = CU_MEMORYTYPE_DEVICE;
@@ -527,21 +514,19 @@ namespace
         stCopyYUV422.Height               = frameSize.height;
 
         // DMA Luma/Chroma
-        res = cuMemcpy2D(&stCopyYUV422);
-        CV_Assert( res == CUDA_SUCCESS );
+        cuSafeCall( cuMemcpy2D(&stCopyYUV422) );
     }
 
     // YV12/IYUV are both 4:2:0 planar formats (12bpc)
     // Luma, U, V chroma planar (12bpc), chroma is subsampled (w/2,h/2)
-    void copyYV12orIYUVFrame(cv::Size frameSize, const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
+    void copyYV12orIYUVFrame(Size frameSize, const GpuMat& src, GpuMat& dst)
     {
-        CUresult res;
-
         // Source is YV12/IYUV, this native format is converted to NV12 format by the video encoder
 
         // (1) luma copy setup
         CUDA_MEMCPY2D stCopyLuma;
-        memset((void*)&stCopyLuma, 0, sizeof(stCopyLuma));
+        memset(&stCopyLuma, 0, sizeof(CUDA_MEMCPY2D));
+
         stCopyLuma.srcXInBytes          = 0;
         stCopyLuma.srcY                 = 0;
         stCopyLuma.srcMemoryType        = CU_MEMORYTYPE_DEVICE;
@@ -563,7 +548,8 @@ namespace
 
         // (2) chroma copy setup, U/V can be done together
         CUDA_MEMCPY2D stCopyChroma;
-        memset((void*)&stCopyChroma, 0, sizeof(stCopyChroma));
+        memset(&stCopyChroma, 0, sizeof(CUDA_MEMCPY2D));
+
         stCopyChroma.srcXInBytes        = 0;
         stCopyChroma.srcY               = frameSize.height << 1; // U/V chroma offset
         stCopyChroma.srcMemoryType      = CU_MEMORYTYPE_DEVICE;
@@ -584,26 +570,23 @@ namespace
         stCopyChroma.Height             = frameSize.height; // U/V are sent together
 
         // DMA Luma
-        res = cuMemcpy2D(&stCopyLuma);
-        CV_Assert( res == CUDA_SUCCESS );
+        cuSafeCall( cuMemcpy2D(&stCopyLuma) );
 
         // DMA Chroma channels (UV side by side)
-        res = cuMemcpy2D(&stCopyChroma);
-        CV_Assert( res == CUDA_SUCCESS );
+        cuSafeCall( cuMemcpy2D(&stCopyChroma) );
     }
 
     // NV12 is 4:2:0 format (12bpc)
     // Luma followed by U/V chroma interleaved (12bpc), chroma is subsampled (w/2,h/2)
-    void copyNV12Frame(cv::Size frameSize, const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
+    void copyNV12Frame(Size frameSize, const GpuMat& src, GpuMat& dst)
     {
-        CUresult res;
-
         // Source is NV12 in pitch linear memory
         // Because we are assume input is NV12 (if we take input in the native format), the encoder handles NV12 as a native format in pitch linear memory
 
         // Luma/Chroma can be done in a single transfer
         CUDA_MEMCPY2D stCopyNV12;
-        memset((void*)&stCopyNV12, 0, sizeof(stCopyNV12));
+        memset(&stCopyNV12, 0, sizeof(CUDA_MEMCPY2D));
+
         stCopyNV12.srcXInBytes          = 0;
         stCopyNV12.srcY                 = 0;
         stCopyNV12.srcMemoryType        = CU_MEMORYTYPE_DEVICE;
@@ -621,141 +604,137 @@ namespace
         stCopyNV12.dstPitch             = dst.step;
 
         stCopyNV12.WidthInBytes         = frameSize.width;
-        stCopyNV12.Height               =(frameSize.height * 3) >> 1;
+        stCopyNV12.Height               = (frameSize.height * 3) >> 1;
 
         // DMA Luma/Chroma
-        res = cuMemcpy2D(&stCopyNV12);
-        CV_Assert( res == CUDA_SUCCESS );
-    }
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream = 0);
-}}}
-
-void cv::gpu::VideoWriter_GPU::Impl::write(const cv::gpu::GpuMat& frame, bool lastFrame)
-{
-    if (inputFormat_ == SF_BGR)
-    {
-        CV_Assert( frame.size() == frameSize_ );
-        CV_Assert( frame.type() == CV_8UC1 || frame.type() == CV_8UC3 || frame.type() == CV_8UC4 );
-    }
-    else
-    {
-        CV_Assert( frame.size() == videoFrame_.size() );
-        CV_Assert( frame.type() == videoFrame_.type() );
+        cuSafeCall( cuMemcpy2D(&stCopyNV12) );
     }
 
-    NVVE_EncodeFrameParams efparams;
-    efparams.Width = frameSize_.width;
-    efparams.Height = frameSize_.height;
-    efparams.Pitch = static_cast<int>(videoFrame_.step);
-    efparams.SurfFmt = surfaceFormat_;
-    efparams.PictureStruc = FRAME_PICTURE;
-    efparams.topfieldfirst =  0;
-    efparams.repeatFirstField = 0;
-    efparams.progressiveFrame = (surfaceFormat_ == NV12) ? 1 : 0;
-    efparams.bLast = lastFrame;
-    efparams.picBuf = 0; // Must be set to NULL in order to support device memory input
-
-    // Don't forget we need to lock/unlock between memcopies
-    CUresult res = cuvidCtxLock(cuCtxLock_, 0);
-    CV_Assert( res == CUDA_SUCCESS );
-
-    if (inputFormat_ == SF_BGR)
-        cv::gpu::cudev::RGB_to_YV12(frame, frame.channels(), videoFrame_);
-    else
+    void VideoWriterImpl::write(InputArray _frame, bool lastFrame)
     {
-        switch (surfaceFormat_)
+        GpuMat frame = _frame.getGpuMat();
+
+        if (inputFormat_ == SF_BGR)
         {
-        case UYVY: // UYVY (4:2:2)
-        case YUY2: // YUY2 (4:2:2)
-            copyUYVYorYUY2Frame(frameSize_, frame, videoFrame_);
-            break;
-
-        case YV12: // YV12 (4:2:0), Y V U
-        case IYUV: // IYUV (4:2:0), Y U V
-            copyYV12orIYUVFrame(frameSize_, frame, videoFrame_);
-            break;
-
-        case NV12: // NV12 (4:2:0)
-            copyNV12Frame(frameSize_, frame, videoFrame_);
-            break;
+            CV_Assert( frame.size() == frameSize_ );
+            CV_Assert( frame.type() == CV_8UC1 || frame.type() == CV_8UC3 || frame.type() == CV_8UC4 );
         }
+        else
+        {
+            CV_Assert( frame.size() == videoFrame_.size() );
+            CV_Assert( frame.type() == videoFrame_.type() );
+        }
+
+        NVVE_EncodeFrameParams efparams;
+        efparams.Width = frameSize_.width;
+        efparams.Height = frameSize_.height;
+        efparams.Pitch = static_cast<int>(videoFrame_.step);
+        efparams.SurfFmt = surfaceFormat_;
+        efparams.PictureStruc = FRAME_PICTURE;
+        efparams.topfieldfirst =  0;
+        efparams.repeatFirstField = 0;
+        efparams.progressiveFrame = (surfaceFormat_ == NV12) ? 1 : 0;
+        efparams.bLast = lastFrame;
+        efparams.picBuf = 0; // Must be set to NULL in order to support device memory input
+
+        // Don't forget we need to lock/unlock between memcopies
+        cuSafeCall( cuvidCtxLock(cuCtxLock_, 0) );
+
+        if (inputFormat_ == SF_BGR)
+        {
+            cudev::RGB_to_YV12(frame, frame.channels(), videoFrame_);
+        }
+        else
+        {
+            switch (surfaceFormat_)
+            {
+            case UYVY: // UYVY (4:2:2)
+            case YUY2: // YUY2 (4:2:2)
+                copyUYVYorYUY2Frame(frameSize_, frame, videoFrame_);
+                break;
+
+            case YV12: // YV12 (4:2:0), Y V U
+            case IYUV: // IYUV (4:2:0), Y U V
+                copyYV12orIYUVFrame(frameSize_, frame, videoFrame_);
+                break;
+
+            case NV12: // NV12 (4:2:0)
+                copyNV12Frame(frameSize_, frame, videoFrame_);
+                break;
+            }
+        }
+
+        cuSafeCall( cuvidCtxUnlock(cuCtxLock_, 0) );
+
+        int err = NVEncodeFrame(encoder_, &efparams, 0, videoFrame_.data);
+        CV_Assert( err == 0 );
     }
 
-    res = cuvidCtxUnlock(cuCtxLock_, 0);
-    CV_Assert( res == CUDA_SUCCESS );
-
-    int err = NVEncodeFrame(encoder_, &efparams, 0, videoFrame_.data);
-    CV_Assert( err == 0 );
-}
-
-unsigned char* NVENCAPI cv::gpu::VideoWriter_GPU::Impl::HandleAcquireBitStream(int* pBufferSize, void* pUserdata)
-{
-    Impl* thiz = static_cast<Impl*>(pUserdata);
-
-    return thiz->callback_->acquireBitStream(pBufferSize);
-}
-
-void NVENCAPI cv::gpu::VideoWriter_GPU::Impl::HandleReleaseBitStream(int nBytesInBuffer, unsigned char* cb, void* pUserdata)
-{
-    Impl* thiz = static_cast<Impl*>(pUserdata);
-
-    thiz->callback_->releaseBitStream(cb, nBytesInBuffer);
-}
-
-void NVENCAPI cv::gpu::VideoWriter_GPU::Impl::HandleOnBeginFrame(const NVVE_BeginFrameInfo* pbfi, void* pUserdata)
-{
-    Impl* thiz = static_cast<Impl*>(pUserdata);
-
-    thiz->callback_->onBeginFrame(pbfi->nFrameNumber, static_cast<EncoderCallBack::PicType>(pbfi->nPicType));
-}
-
-void NVENCAPI cv::gpu::VideoWriter_GPU::Impl::HandleOnEndFrame(const NVVE_EndFrameInfo* pefi, void* pUserdata)
-{
-    Impl* thiz = static_cast<Impl*>(pUserdata);
-
-    thiz->callback_->onEndFrame(pefi->nFrameNumber, static_cast<EncoderCallBack::PicType>(pefi->nPicType));
-}
-
-///////////////////////////////////////////////////////////////////////////
-// FFMPEG
-
-class EncoderCallBackFFMPEG : public cv::gpu::VideoWriter_GPU::EncoderCallBack
-{
-public:
-    EncoderCallBackFFMPEG(const cv::String& fileName, cv::Size frameSize, double fps);
-    ~EncoderCallBackFFMPEG();
-
-    unsigned char* acquireBitStream(int* bufferSize);
-    void releaseBitStream(unsigned char* data, int size);
-    void onBeginFrame(int frameNumber, PicType picType);
-    void onEndFrame(int frameNumber, PicType picType);
-
-private:
-    EncoderCallBackFFMPEG(const EncoderCallBackFFMPEG&);
-    EncoderCallBackFFMPEG& operator=(const EncoderCallBackFFMPEG&);
-
-    struct OutputMediaStream_FFMPEG* stream_;
-    std::vector<uchar> buf_;
-    bool isKeyFrame_;
-};
-
-namespace
-{
-    Create_OutputMediaStream_FFMPEG_Plugin create_OutputMediaStream_FFMPEG_p = 0;
-    Release_OutputMediaStream_FFMPEG_Plugin release_OutputMediaStream_FFMPEG_p = 0;
-    Write_OutputMediaStream_FFMPEG_Plugin write_OutputMediaStream_FFMPEG_p = 0;
-
-    bool init_MediaStream_FFMPEG()
+    unsigned char* NVENCAPI VideoWriterImpl::HandleAcquireBitStream(int* pBufferSize, void* pUserdata)
     {
-        static bool initialized = 0;
+        VideoWriterImpl* thiz = static_cast<VideoWriterImpl*>(pUserdata);
+
+        return thiz->callback_->acquireBitStream(pBufferSize);
+    }
+
+    void NVENCAPI VideoWriterImpl::HandleReleaseBitStream(int nBytesInBuffer, unsigned char* cb, void* pUserdata)
+    {
+        // Recover the writer from pUserdata and forward to its callback_ (buffer first, then byte count).
+        VideoWriterImpl* thiz = static_cast<VideoWriterImpl*>(pUserdata);
+        thiz->callback_->releaseBitStream(cb, nBytesInBuffer);
+    }
+
+    void NVENCAPI VideoWriterImpl::HandleOnBeginFrame(const NVVE_BeginFrameInfo* pbfi, void* pUserdata)
+    {
+        // Recover the writer from pUserdata and forward the frame number and picture type (cast to EncoderCallBack::PicType).
+        VideoWriterImpl* thiz = static_cast<VideoWriterImpl*>(pUserdata);
+        thiz->callback_->onBeginFrame(pbfi->nFrameNumber, static_cast<EncoderCallBack::PicType>(pbfi->nPicType));
+    }
+
+    void NVENCAPI VideoWriterImpl::HandleOnEndFrame(const NVVE_EndFrameInfo* pefi, void* pUserdata)
+    {
+        // Recover the writer from pUserdata and forward the frame number and picture type (cast to EncoderCallBack::PicType).
+        VideoWriterImpl* thiz = static_cast<VideoWriterImpl*>(pUserdata);
+        thiz->callback_->onEndFrame(pefi->nFrameNumber, static_cast<EncoderCallBack::PicType>(pefi->nPicType));
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // FFMPEG
+
+    class EncoderCallBackFFMPEG : public EncoderCallBack // writes the encoded bitstream through the FFMPEG output-stream functions
+    {
+    public:
+        EncoderCallBackFFMPEG(const String& fileName, Size frameSize, double fps);
+        ~EncoderCallBackFFMPEG();
+
+        unsigned char* acquireBitStream(int* bufferSize);
+        void releaseBitStream(unsigned char* data, int size);
+        void onBeginFrame(int frameNumber, PicType picType);
+        void onEndFrame(int frameNumber, PicType picType);
+
+    private:
+        static bool init_MediaStream_FFMPEG();
+        // stream_ is released in the destructor. NOTE(review): copying is not disabled here, so a copy would release it twice - confirm.
+        struct OutputMediaStream_FFMPEG* stream_;
+        std::vector<uchar> buf_; // buffer handed to the encoder by acquireBitStream()
+        bool isKeyFrame_;
+        // FFMPEG entry points shared by all instances; filled in by init_MediaStream_FFMPEG().
+        static Create_OutputMediaStream_FFMPEG_Plugin create_OutputMediaStream_FFMPEG_p;
+        static Release_OutputMediaStream_FFMPEG_Plugin release_OutputMediaStream_FFMPEG_p;
+        static Write_OutputMediaStream_FFMPEG_Plugin write_OutputMediaStream_FFMPEG_p;
+    };
+    // Static member definitions: all three pointers start out null.
+    Create_OutputMediaStream_FFMPEG_Plugin EncoderCallBackFFMPEG::create_OutputMediaStream_FFMPEG_p = 0;
+    Release_OutputMediaStream_FFMPEG_Plugin EncoderCallBackFFMPEG::release_OutputMediaStream_FFMPEG_p = 0;
+    Write_OutputMediaStream_FFMPEG_Plugin EncoderCallBackFFMPEG::write_OutputMediaStream_FFMPEG_p = 0;
+
+    bool EncoderCallBackFFMPEG::init_MediaStream_FFMPEG()
+    {
+        static bool initialized = false;
 
         if (!initialized)
         {
-            #if defined WIN32 || defined _WIN32
+            #if defined(WIN32) || defined(_WIN32)
                 const char* module_name = "opencv_ffmpeg"
                     CVAUX_STR(CV_VERSION_EPOCH) CVAUX_STR(CV_VERSION_MAJOR) CVAUX_STR(CV_VERSION_MINOR)
                 #if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__)
@@ -776,7 +755,7 @@ namespace
 
                     initialized = create_OutputMediaStream_FFMPEG_p != 0 && release_OutputMediaStream_FFMPEG_p != 0 && write_OutputMediaStream_FFMPEG_p != 0;
                 }
-            #elif defined HAVE_FFMPEG
+            #elif defined(HAVE_FFMPEG)
                 create_OutputMediaStream_FFMPEG_p = create_OutputMediaStream_FFMPEG;
                 release_OutputMediaStream_FFMPEG_p = release_OutputMediaStream_FFMPEG;
                 write_OutputMediaStream_FFMPEG_p = write_OutputMediaStream_FFMPEG;
@@ -787,134 +766,52 @@ namespace
 
         return initialized;
     }
-}
 
-EncoderCallBackFFMPEG::EncoderCallBackFFMPEG(const cv::String& fileName, cv::Size frameSize, double fps) :
-    stream_(0), isKeyFrame_(false)
-{
-    int buf_size = std::max(frameSize.area() * 4, 1024 * 1024);
-    buf_.resize(buf_size);
+    EncoderCallBackFFMPEG::EncoderCallBackFFMPEG(const String& fileName, Size frameSize, double fps) :
+        stream_(0), isKeyFrame_(false)
+    {
+        int buf_size = std::max(frameSize.area() * 4, 1024 * 1024);
+        buf_.resize(buf_size);
 
-    CV_Assert( init_MediaStream_FFMPEG() );
+        CV_Assert( init_MediaStream_FFMPEG() );
 
-    stream_ = create_OutputMediaStream_FFMPEG_p(fileName.c_str(), frameSize.width, frameSize.height, fps);
-    CV_Assert( stream_ != 0 );
-}
+        stream_ = create_OutputMediaStream_FFMPEG_p(fileName.c_str(), frameSize.width, frameSize.height, fps);
+        CV_Assert( stream_ != 0 );
+    }
 
-EncoderCallBackFFMPEG::~EncoderCallBackFFMPEG()
-{
-    release_OutputMediaStream_FFMPEG_p(stream_);
-}
+    EncoderCallBackFFMPEG::~EncoderCallBackFFMPEG()
+    {
+        release_OutputMediaStream_FFMPEG_p(stream_);
+    }
 
-unsigned char* EncoderCallBackFFMPEG::acquireBitStream(int* bufferSize)
-{
-    *bufferSize = static_cast<int>(buf_.size());
-    return &buf_[0];
-}
+    unsigned char* EncoderCallBackFFMPEG::acquireBitStream(int* bufferSize)
+    {
+        *bufferSize = static_cast<int>(buf_.size());
+        return &buf_[0];
+    }
 
-void EncoderCallBackFFMPEG::releaseBitStream(unsigned char* data, int size)
-{
-    write_OutputMediaStream_FFMPEG_p(stream_, data, size, isKeyFrame_);
-}
+    void EncoderCallBackFFMPEG::releaseBitStream(unsigned char* data, int size)
+    {
+        write_OutputMediaStream_FFMPEG_p(stream_, data, size, isKeyFrame_);
+    }
 
-void EncoderCallBackFFMPEG::onBeginFrame(int frameNumber, PicType picType)
-{
-    (void) frameNumber;
-    isKeyFrame_ = picType == IFRAME;
-}
+    void EncoderCallBackFFMPEG::onBeginFrame(int frameNumber, PicType picType)
+    {
+        (void) frameNumber;
+        isKeyFrame_ = (picType == IFRAME);
+    }
 
-void EncoderCallBackFFMPEG::onEndFrame(int frameNumber, PicType picType)
-{
-    (void) frameNumber;
-    (void) picType;
+    void EncoderCallBackFFMPEG::onEndFrame(int frameNumber, PicType picType)
+    {
+        (void) frameNumber;
+        (void) picType;
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////
-// VideoWriter_GPU
+// EncoderParams
 
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU()
-{
-}
-
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format)
-{
-    open(fileName, frameSize, fps, format);
-}
-
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
-{
-    open(fileName, frameSize, fps, params, format);
-}
-
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format)
-{
-    open(encoderCallback, frameSize, fps, format);
-}
-
-cv::gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
-{
-    open(encoderCallback, frameSize, fps, params, format);
-}
-
-cv::gpu::VideoWriter_GPU::~VideoWriter_GPU()
-{
-    close();
-}
-
-void cv::gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format)
-{
-    close();
-    cv::Ptr<EncoderCallBack> encoderCallback(new EncoderCallBackFFMPEG(fileName, frameSize, fps));
-    open(encoderCallback, frameSize, fps, format);
-}
-
-void cv::gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
-{
-    close();
-    cv::Ptr<EncoderCallBack> encoderCallback(new EncoderCallBackFFMPEG(fileName, frameSize, fps));
-    open(encoderCallback, frameSize, fps, params, format);
-}
-
-void cv::gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format)
-{
-    close();
-    impl_ = new Impl(encoderCallback, frameSize, fps, format);
-}
-
-void cv::gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
-{
-    close();
-    impl_ = new Impl(encoderCallback, frameSize, fps, params, format);
-}
-
-bool cv::gpu::VideoWriter_GPU::isOpened() const
-{
-    return !impl_.empty();
-}
-
-void cv::gpu::VideoWriter_GPU::close()
-{
-    impl_.release();
-}
-
-void cv::gpu::VideoWriter_GPU::write(const cv::gpu::GpuMat& image, bool lastFrame)
-{
-    CV_Assert( isOpened() );
-
-    impl_->write(image, lastFrame);
-}
-
-cv::gpu::VideoWriter_GPU::EncoderParams cv::gpu::VideoWriter_GPU::getParams() const
-{
-    CV_Assert( isOpened() );
-
-    return impl_->getParams();
-}
-
-///////////////////////////////////////////////////////////////////////////
-// VideoWriter_GPU::EncoderParams
-
-cv::gpu::VideoWriter_GPU::EncoderParams::EncoderParams()
+cv::gpucodec::EncoderParams::EncoderParams()
 {
     P_Interval = 3;
     IDR_Period = 15;
@@ -937,66 +834,86 @@ cv::gpu::VideoWriter_GPU::EncoderParams::EncoderParams()
     DisableSPSPPS = 0;
 }
 
-cv::gpu::VideoWriter_GPU::EncoderParams::EncoderParams(const String& configFile)
+cv::gpucodec::EncoderParams::EncoderParams(const String& configFile)
 {
     load(configFile);
 }
 
-void cv::gpu::VideoWriter_GPU::EncoderParams::load(const String& configFile)
+void cv::gpucodec::EncoderParams::load(const String& configFile)
 {
-    cv::FileStorage fs(configFile, cv::FileStorage::READ);
+    FileStorage fs(configFile, FileStorage::READ);
     CV_Assert( fs.isOpened() );
 
-    cv::read(fs["P_Interval"     ], P_Interval, 3);
-    cv::read(fs["IDR_Period"     ], IDR_Period, 15);
-    cv::read(fs["DynamicGOP"     ], DynamicGOP, 0);
-    cv::read(fs["RCType"         ], RCType, 1);
-    cv::read(fs["AvgBitrate"     ], AvgBitrate, 4000000);
-    cv::read(fs["PeakBitrate"    ], PeakBitrate, 10000000);
-    cv::read(fs["QP_Level_Intra" ], QP_Level_Intra, 25);
-    cv::read(fs["QP_Level_InterP"], QP_Level_InterP, 28);
-    cv::read(fs["QP_Level_InterB"], QP_Level_InterB, 31);
-    cv::read(fs["DeblockMode"    ], DeblockMode, 1);
-    cv::read(fs["ProfileLevel"   ], ProfileLevel, 65357);
-    cv::read(fs["ForceIntra"     ], ForceIntra, 0);
-    cv::read(fs["ForceIDR"       ], ForceIDR, 0);
-    cv::read(fs["ClearStat"      ], ClearStat, 0);
-    cv::read(fs["DIMode"         ], DIMode, 1);
-    cv::read(fs["Presets"        ], Presets, 2);
-    cv::read(fs["DisableCabac"   ], DisableCabac, 0);
-    cv::read(fs["NaluFramingType"], NaluFramingType, 0);
-    cv::read(fs["DisableSPSPPS"  ], DisableSPSPPS, 0);
+    read(fs["P_Interval"     ], P_Interval, 3);
+    read(fs["IDR_Period"     ], IDR_Period, 15);
+    read(fs["DynamicGOP"     ], DynamicGOP, 0);
+    read(fs["RCType"         ], RCType, 1);
+    read(fs["AvgBitrate"     ], AvgBitrate, 4000000);
+    read(fs["PeakBitrate"    ], PeakBitrate, 10000000);
+    read(fs["QP_Level_Intra" ], QP_Level_Intra, 25);
+    read(fs["QP_Level_InterP"], QP_Level_InterP, 28);
+    read(fs["QP_Level_InterB"], QP_Level_InterB, 31);
+    read(fs["DeblockMode"    ], DeblockMode, 1);
+    read(fs["ProfileLevel"   ], ProfileLevel, 65357);
+    read(fs["ForceIntra"     ], ForceIntra, 0);
+    read(fs["ForceIDR"       ], ForceIDR, 0);
+    read(fs["ClearStat"      ], ClearStat, 0);
+    read(fs["DIMode"         ], DIMode, 1);
+    read(fs["Presets"        ], Presets, 2);
+    read(fs["DisableCabac"   ], DisableCabac, 0);
+    read(fs["NaluFramingType"], NaluFramingType, 0);
+    read(fs["DisableSPSPPS"  ], DisableSPSPPS, 0);
 }
 
-void cv::gpu::VideoWriter_GPU::EncoderParams::save(const String& configFile) const
+void cv::gpucodec::EncoderParams::save(const String& configFile) const
 {
-    cv::FileStorage fs(configFile, cv::FileStorage::WRITE);
+    FileStorage fs(configFile, FileStorage::WRITE);
     CV_Assert( fs.isOpened() );
 
-    cv::write(fs, "P_Interval"     , P_Interval);
-    cv::write(fs, "IDR_Period"     , IDR_Period);
-    cv::write(fs, "DynamicGOP"     , DynamicGOP);
-    cv::write(fs, "RCType"         , RCType);
-    cv::write(fs, "AvgBitrate"     , AvgBitrate);
-    cv::write(fs, "PeakBitrate"    , PeakBitrate);
-    cv::write(fs, "QP_Level_Intra" , QP_Level_Intra);
-    cv::write(fs, "QP_Level_InterP", QP_Level_InterP);
-    cv::write(fs, "QP_Level_InterB", QP_Level_InterB);
-    cv::write(fs, "DeblockMode"    , DeblockMode);
-    cv::write(fs, "ProfileLevel"   , ProfileLevel);
-    cv::write(fs, "ForceIntra"     , ForceIntra);
-    cv::write(fs, "ForceIDR"       , ForceIDR);
-    cv::write(fs, "ClearStat"      , ClearStat);
-    cv::write(fs, "DIMode"         , DIMode);
-    cv::write(fs, "Presets"        , Presets);
-    cv::write(fs, "DisableCabac"   , DisableCabac);
-    cv::write(fs, "NaluFramingType", NaluFramingType);
-    cv::write(fs, "DisableSPSPPS"  , DisableSPSPPS);
+    write(fs, "P_Interval"     , P_Interval);
+    write(fs, "IDR_Period"     , IDR_Period);
+    write(fs, "DynamicGOP"     , DynamicGOP);
+    write(fs, "RCType"         , RCType);
+    write(fs, "AvgBitrate"     , AvgBitrate);
+    write(fs, "PeakBitrate"    , PeakBitrate);
+    write(fs, "QP_Level_Intra" , QP_Level_Intra);
+    write(fs, "QP_Level_InterP", QP_Level_InterP);
+    write(fs, "QP_Level_InterB", QP_Level_InterB);
+    write(fs, "DeblockMode"    , DeblockMode);
+    write(fs, "ProfileLevel"   , ProfileLevel);
+    write(fs, "ForceIntra"     , ForceIntra);
+    write(fs, "ForceIDR"       , ForceIDR);
+    write(fs, "ClearStat"      , ClearStat);
+    write(fs, "DIMode"         , DIMode);
+    write(fs, "Presets"        , Presets);
+    write(fs, "DisableCabac"   , DisableCabac);
+    write(fs, "NaluFramingType", NaluFramingType);
+    write(fs, "DisableSPSPPS"  , DisableSPSPPS);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// createVideoWriter - factory overloads returning Ptr<VideoWriter>: file-name variants and custom-callback variants
+
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format)
+{
+    Ptr<EncoderCallBack> encoderCallback(new EncoderCallBackFFMPEG(fileName, frameSize, fps)); // file output goes through the FFMPEG-backed callback
+    return createVideoWriter(encoderCallback, frameSize, fps, format); // delegate to the callback overload that takes no EncoderParams
+}
+
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
+{
+    Ptr<EncoderCallBack> encoderCallback(new EncoderCallBackFFMPEG(fileName, frameSize, fps)); // same FFMPEG-backed callback as the overload without params
+    return createVideoWriter(encoderCallback, frameSize, fps, params, format); // delegate to the callback overload, forwarding the caller's params
+}
+
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format)
+{
+    return new VideoWriterImpl(encoderCallback, frameSize, fps, format); // no EncoderParams passed; presumably VideoWriterImpl chooses defaults - TODO confirm
+}
+
+Ptr<VideoWriter> cv::gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format)
+{
+    return new VideoWriterImpl(encoderCallback, frameSize, fps, params, format); // caller-supplied callback and params are forwarded unchanged
 }
 
 #endif // !defined HAVE_CUDA || !defined WIN32
-
-template <> void cv::Ptr<cv::gpu::VideoWriter_GPU::Impl>::delete_obj()
-{
-    if (obj) delete obj;
-}
diff --git a/modules/gpucodec/test/test_video.cpp b/modules/gpucodec/test/test_video.cpp
index 55fc3f87c6..26bcc02d58 100644
--- a/modules/gpucodec/test/test_video.cpp
+++ b/modules/gpucodec/test/test_video.cpp
@@ -57,19 +57,15 @@ GPU_TEST_P(Video, Reader)
 
     const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
 
-    cv::gpu::VideoReader_GPU reader(inputFile);
-    ASSERT_TRUE(reader.isOpened());
+    cv::Ptr<cv::gpucodec::VideoReader> reader = cv::gpucodec::createVideoReader(inputFile);
 
     cv::gpu::GpuMat frame;
 
     for (int i = 0; i < 10; ++i)
     {
-        ASSERT_TRUE(reader.read(frame));
+        ASSERT_TRUE(reader->nextFrame(frame));
         ASSERT_FALSE(frame.empty());
     }
-
-    reader.close();
-    ASSERT_FALSE(reader.isOpened());
 }
 
 //////////////////////////////////////////////////////
@@ -89,7 +85,7 @@ GPU_TEST_P(Video, Writer)
     cv::VideoCapture reader(inputFile);
     ASSERT_TRUE(reader.isOpened());
 
-    cv::gpu::VideoWriter_GPU d_writer;
+    cv::Ptr<cv::gpucodec::VideoWriter> d_writer;
 
     cv::Mat frame;
     cv::gpu::GpuMat d_frame;
@@ -101,14 +97,14 @@ GPU_TEST_P(Video, Writer)
 
         d_frame.upload(frame);
 
-        if (!d_writer.isOpened())
-            d_writer.open(outputFile, frame.size(), FPS);
+        if (d_writer.empty())
+            d_writer = cv::gpucodec::createVideoWriter(outputFile, frame.size(), FPS);
 
-        d_writer.write(d_frame);
+        d_writer->write(d_frame);
     }
 
     reader.release();
-    d_writer.close();
+    d_writer.release();
 
     reader.open(outputFile);
     ASSERT_TRUE(reader.isOpened());
diff --git a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
index 0c821745f9..cc73da9d9e 100644
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@@ -351,7 +351,7 @@ private:
 
     FAST_GPU fastDetector_;
 
-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;
 
     GpuMat d_keypoints_;
 };
diff --git a/modules/gpufeatures2d/src/orb.cpp b/modules/gpufeatures2d/src/orb.cpp
index 495ca3f6ef..7cb1cbecc1 100644
--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg
 
     pattern_.upload(h_pattern);
 
-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
 
     blurForDescriptor = false;
 }
@@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
         {
             // preprocess the resized image
             ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
         }
 
         computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
diff --git a/modules/gpufilters/CMakeLists.txt b/modules/gpufilters/CMakeLists.txt
index 18f6d7f7b6..640de8c115 100644
--- a/modules/gpufilters/CMakeLists.txt
+++ b/modules/gpufilters/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Filtering")
 
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
 
-ocv_define_module(gpufilters opencv_imgproc OPTIONAL opencv_gpuarithm)
+ocv_define_module(gpufilters opencv_imgproc opencv_gpuarithm)
diff --git a/modules/gpufilters/doc/filtering.rst b/modules/gpufilters/doc/filtering.rst
index 348a42510e..925b05f2cf 100644
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@@ -7,346 +7,236 @@ Functions and classes described in this section are used to perform various line
 
 
 
-gpu::BaseRowFilter_GPU
-----------------------
-.. ocv:class:: gpu::BaseRowFilter_GPU
+gpu::Filter
+-----------
+.. ocv:class:: gpu::Filter
 
-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+Common interface for all GPU filters ::
 
-    class BaseRowFilter_GPU
+    class CV_EXPORTS Filter : public Algorithm
     {
     public:
-        BaseRowFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
+        virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
     };
 
 
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
 
-
-
-gpu::BaseColumnFilter_GPU
--------------------------
-.. ocv:class:: gpu::BaseColumnFilter_GPU
-
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-
-    class BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::BaseFilter_GPU
--------------------
-.. ocv:class:: gpu::BaseFilter_GPU
-
-Base class for non-separable 2D filters. ::
-
-    class CV_EXPORTS BaseFilter_GPU
-    {
-    public:
-        BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        Size ksize;
-        Point anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::FilterEngine_GPU
----------------------
-.. ocv:class:: gpu::FilterEngine_GPU
-
-Base class for the Filter Engine. ::
-
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst,
-                           Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-    };
-
-
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
-
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        // Allocate and release buffers at each iterations
-        gpu::GaussianBlur(src, dst, ksize, sigma1);
-    }
-
-    // Allocate buffers only once
-    cv::Ptr<gpu::FilterEngine_GPU> filter =
-        gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-
-
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-
-.. note:: The GPU filters do not support the in-place mode.
-
-.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
-
-
-
-gpu::createFilter2D_GPU
----------------------------
-Creates a non-separable filter engine with the specified filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-
-    :param filter2D: Non-separable 2D filter.
-
-    :param srcType: Input image type. It must be supported by  ``filter2D`` .
-
-    :param dstType: Output image type. It must be supported by  ``filter2D`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
-
-
-
-gpu::createSeparableFilter_GPU
-----------------------------------
-Creates a separable filter engine with the specified filters.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-
-    :param rowFilter: "Horizontal" 1D filter.
-
-    :param columnFilter: "Vertical" 1D filter.
-
-    :param srcType: Input image type. It must be supported by  ``rowFilter`` .
-
-    :param bufType: Buffer image type. It must be supported by  ``rowFilter``  and  ``columnFilter`` .
-
-    :param dstType: Output image type. It must be supported by  ``columnFilter`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
-
-
-
-gpu::getRowSumFilter_GPU
-----------------------------
-Creates a horizontal 1D box filter.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
-
-    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::getColumnSumFilter_GPU
--------------------------------
-Creates a vertical 1D box filter.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
-
-    :param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::createBoxFilter_GPU
-----------------------------
-Creates a normalized 2D box filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
-
-    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
-
-    :param dstType: Output image type.  It supports only the same values as the source type.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`boxFilter`
-
-
-
-gpu::boxFilter
+gpu::Filter::apply
 ------------------
-Smooths the image using the normalized box filter.
+Applies the specified filter to the image.
 
-.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::Filter::apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0
 
-    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+    :param src: Input image.
 
-    :param dst: Output image type. The size and type is the same as ``src`` .
-
-    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+    :param dst: Output image.
 
     :param stream: Stream for the asynchronous version.
 
-.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+
+gpu::createBoxFilter
+--------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported for now.
+
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 .. seealso:: :ocv:func:`boxFilter`
 
 
 
-gpu::blur
--------------
-Acts as a synonym for the normalized box filter.
+gpu::createLinearFilter
+-----------------------
+Creates a non-separable linear 2D filter.
 
-.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: Ptr<Filter> gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.
 
-    :param dst: Output image type with the same size and type as  ``src`` .
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.
 
-    :param ksize: Kernel size.
+    :param kernel: 2D array of filter coefficients.
 
     :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
 
-    :param stream: Stream for the asynchronous version.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderVal: Default border value.
 
-.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
+.. seealso:: :ocv:func:`filter2D`
 
 
 
-gpu::createMorphologyFilter_GPU
------------------------------------
+gpu::createLaplacianFilter
+--------------------------
+Creates a Laplacian operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.
+
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.
+
+    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
+
+    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+.. seealso:: :ocv:func:`Laplacian`
+
+
+
+gpu::createSeparableLinearFilter
+--------------------------------
+Creates a separable linear filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source array type.
+
+    :param dstType: Destination array type.
+
+    :param rowKernel: Horizontal filter coefficients. Supports kernels with ``size <= 32`` .
+
+    :param columnKernel: Vertical filter coefficients. Supports kernels with ``size <= 32`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`sepFilter2D`
+
+
+
+gpu::createDerivFilter
+----------------------
+Creates a generalized Deriv operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize = false, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param normalize: Flag indicating whether to normalize (scale down) the filter coefficients or not. See  :ocv:func:`getDerivKernels` for details.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+
+
+gpu::createSobelFilter
+----------------------
+Creates a Sobel operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Sobel`
+
+
+
+gpu::createScharrFilter
+-----------------------
+Creates a vertical or horizontal Scharr operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Scharr`
+
+
+
+gpu::createGaussianFilter
+-------------------------
+Creates a Gaussian filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2 = 0, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`GaussianBlur`
+
+
+
+gpu::createMorphologyFilter
+---------------------------
 Creates a 2D morphological filter.
 
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
-
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`createMorphologyFilter`
-
-
-
-gpu::erode
---------------
-Erodes an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC1``  and  ``CV_8UC4``  types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param kernel: Structuring element used for erosion. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times erosion to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`erode`
-
-
-
-gpu::dilate
----------------
-Dilates an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-
-    :param dst: Destination image with the same size and type as ``src``.
-
-    :param kernel: Structuring element used for dilation. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times dilation to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`dilate`
-
-
-
-gpu::morphologyEx
----------------------
-Applies an advanced morphological operation to an image.
-
-.. ocv:function::  void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
+.. ocv:function:: Ptr<Filter> gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1)
 
     :param op: Type of morphological operation. The following types are possible:
 
+        * **MORPH_ERODE** erode
+
+        * **MORPH_DILATE** dilate
+
         * **MORPH_OPEN** opening
 
         * **MORPH_CLOSE** closing
@@ -357,363 +247,88 @@ Applies an advanced morphological operation to an image.
 
         * **MORPH_BLACKHAT** "black hat"
 
-    :param kernel: Structuring element.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
 
-    :param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
 
     :param iterations: Number of times erosion and dilation to be applied.
 
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
 .. seealso:: :ocv:func:`morphologyEx`
 
 
 
-gpu::createLinearFilter_GPU
--------------------------------
-Creates a non-separable linear filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dstType: Output image type. The same type as ``src`` is supported.
-
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`gpu::convolve`.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-.. seealso:: :ocv:func:`createLinearFilter`
-
-
-
-gpu::filter2D
------------------
-Applies the non-separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`filter2D`, :ocv:func:`gpu::convolve`
-
-
-
-gpu::Laplacian
-------------------
-Applies the Laplacian operator to an image.
-
-.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image. The size and number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
-
-    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
-
-    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
-
-
-
-gpu::getLinearRowFilter_GPU
--------------------------------
-Creates a primitive row filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
-
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-
-
-
-gpu::getLinearColumnFilter_GPU
-----------------------------------
-Creates a primitive column filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
-
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` . For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside image.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::createSeparableLinearFilter_GPU
-----------------------------------------
-Creates a separable linear filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU`, cpp:ocv:func:`gpu::getLinearColumnFilter_GPU`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::sepFilter2D
---------------------
-Applies a separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param kernelX: Horizontal filter coefficients.
-
-    :param kernelY: Vertical filter coefficients.
-
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
-
-
-
-gpu::createDerivFilter_GPU
-------------------------------
-Creates a filter engine for the generalized Sobel operator.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
-
-
-
-gpu::Sobel
---------------
-Applies the generalized Sobel operator to an image.
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as source image.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
-
-
-
-gpu::Scharr
----------------
-Calculates the first x- or y- image derivative using the Scharr operator.
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` has.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Order of the derivative x.
-
-    :param dy: Order of the derivative y.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
-
-
-
-gpu::createGaussianFilter_GPU
----------------------------------
-Creates a Gaussian filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
-
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
-
-
-
-gpu::GaussianBlur
----------------------
-Smooths an image using the Gaussian filter.
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param ksize: Gaussian kernel size.  ``ksize.width``  and  ``ksize.height``  can differ but they both must be positive and odd. If they are zeros, they are computed from  ``sigma1``  and  ``sigma2`` .
-
-    :param sigma1: Gaussian kernel standard deviation in X direction.
-
-    :param sigma2: Gaussian kernel standard deviation in Y direction. If  ``sigma2``  is zero, it is set to be equal to  ``sigma1`` . If they are both zeros, they are computed from  ``ksize.width``  and  ``ksize.height``, respectively. See  :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of  ``ksize`` , ``sigma1`` , and  ``sigma2`` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
-
-
-
-gpu::getMaxFilter_GPU
--------------------------
+gpu::createBoxMaxFilter
+-----------------------
 Creates the maximum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
 
 
 
-gpu::getMinFilter_GPU
--------------------------
+gpu::createBoxMinFilter
+-----------------------
 Creates the minimum filter.
 
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
 
-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
 
     :param ksize: Kernel size.
 
     :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
 
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createRowSumFilter
+-----------------------
+Creates a horizontal 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createColumnSumFilter
+--------------------------
+Creates a vertical 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 582c55d999..b0ebfd73c5 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -48,221 +48,101 @@
 #endif
 
 #include "opencv2/core/gpu.hpp"
-#include "opencv2/core/base.hpp"
+#include "opencv2/imgproc.hpp"
 
 namespace cv { namespace gpu {
 
-/*!
-The Base Class for 1D or Row-wise Filters
-
-This is the base class for linear or non-linear filters that process 1D data.
-In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseRowFilter_GPU
+class CV_EXPORTS Filter : public Algorithm
 {
 public:
-    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseRowFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };
 
-/*!
-The Base Class for Column-wise Filters
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box Filter
 
-This is the base class for linear or non-linear filters that process columns of 2D arrays.
-Such filters are used for the "vertical" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseColumnFilter_GPU
-{
-public:
-    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseColumnFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-
-/*!
-The Base Class for Non-Separable 2D Filters.
-
-This is the base class for linear or non-linear 2D filters.
-*/
-class CV_EXPORTS BaseFilter_GPU
-{
-public:
-    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    Size ksize;
-    Point anchor;
-};
-
-/*!
-The Base Class for Filter Engine.
-
-The class can be used to apply an arbitrary filtering operation to an image.
-It contains all the necessary intermediate buffers.
-*/
-class CV_EXPORTS FilterEngine_GPU
-{
-public:
-    virtual ~FilterEngine_GPU() {}
-
-    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-};
-
-//! returns the non-separable filter engine with the specified filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
-
-//! returns the separable filter engine with the specified filters
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
-
-//! returns horizontal 1D box filter
-//! supports only CV_8UC1 source type and CV_32FC1 sum type
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
-
-//! returns vertical 1D box filter
-//! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-
-//! returns 2D box filter
-//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
-
-//! returns box filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-    const Point& anchor = Point(-1,-1));
-
-//! returns 2D morphological filter
-//! only MORPH_ERODE and MORPH_DILATE are supported
-//! supports CV_8UC1 and CV_8UC4 types
-//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-    Point anchor=Point(-1,-1));
-
-//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-
-//! returns 2D filter with the specified kernel
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-//! returns the non-separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
-
-//! returns the primitive row filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the primitive column filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-
-//! returns filter engine for the generalized Sobel operator
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns the Gaussian filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns maximum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! returns minimum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! smooths the image using the normalized box filter
+//! creates a normalized 2D box filter
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
+                                       int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-{
-    boxFilter(src, dst, -1, ksize, anchor, stream);
-}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
 
-//! erodes the image (applies the local minimum operator)
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                      Point anchor = Point(-1, -1), int iterations = 1,
-                      Stream& stream = Stream::Null());
+//! Creates a non-separable linear 2D filter
+//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! dilates the image (applies the local maximum operator)
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                       Point anchor = Point(-1, -1), int iterations = 1,
-                       Stream& stream = Stream::Null());
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
 
-//! applies an advanced morphological operation to the image
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
-                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-
-//! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
-//! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
-
-//! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies Laplacian operator to the image
+//! creates a Laplacian operator
 //! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
+                                             int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
+
+//! creates a separable linear filter
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+//! creates a generalized Deriv operator
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a Sobel operator
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a vertical or horizontal Scharr operator
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+//! creates a Gaussian filter
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
+                                            double sigma1, double sigma2 = 0,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+//! creates a 2D morphological filter
+//! supports CV_8UC1 and CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Image Rank Filter
+
+//! result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! result pixel value is the minimum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
+
+//! creates a horizontal 1D box filter
+//! supports only CV_8UC1 source type and CV_32FC1 dst type
+CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! creates a vertical 1D box filter
+//! supports only CV_8UC1 source type and CV_32FC1 dst type
+CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index 64cf4cc5db..6ad0998a5b 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -70,7 +70,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
+        cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(d_src.type(), -1, cv::Size(ksize, ksize));
+
+        TEST_CYCLE() blurFilter->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst, 1);
     }
@@ -84,6 +86,79 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+
+        TEST_CYCLE() filter2D->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(d_src.type(), -1, ksize);
+
+        TEST_CYCLE() laplacian->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -102,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+
+        TEST_CYCLE() sobel->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -135,9 +211,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+
+        TEST_CYCLE() scharr->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -169,9 +246,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+
+        TEST_CYCLE() gauss->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -185,39 +263,6 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -237,9 +282,10 @@ PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), ker);
+
+        TEST_CYCLE() erode->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -272,9 +318,10 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), ker);
+
+        TEST_CYCLE() dilate->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -312,10 +359,10 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf1;
-        cv::gpu::GpuMat d_buf2;
 
-        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
+        cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), ker);
+
+        TEST_CYCLE() morph->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -328,39 +375,3 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
         CPU_SANITY_CHECK(dst);
     }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    declare.in(kernel, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
diff --git a/modules/gpufilters/src/cuda/filter2d.cu b/modules/gpufilters/src/cuda/filter2d.cu
index 80c93c54ed..4e913124df 100644
--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@@ -48,111 +48,104 @@
 
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst,
+                             const float* __restrict__ kernel,
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY)
     {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
 
-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        sum_t res = VecTraits<sum_t>::all(0);
+        int kInd = 0;
+
+        for (int i = 0; i < kHeight; ++i)
         {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
-
-            for (int i = 0; i < kHeight; ++i)
-            {
-                for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
-            }
-
-            dst(y, x) = saturate_cast<D>(res);
+            for (int j = 0; j < kWidth; ++j)
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++];
         }
 
-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(16, 16); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    if (stream == 0) \
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
-
-        #undef IMPLEMENT_FILTER2D_TEX_READER
-
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                Filter2DCaller<T, D, BrdConstant>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
-                Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
-            };
-
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        dst(y, x) = saturate_cast<D>(res);
     }
+
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+            } \
+        }; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+        { \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+            { \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                dim3 block(16, 16); \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
+                cudaSafeCall( cudaGetLastError() ); \
+                if (stream == 0) \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+            } \
+        };
+
+    IMPLEMENT_FILTER2D_TEX_READER(uchar);
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(float);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+    #undef IMPLEMENT_FILTER2D_TEX_READER
+
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
+        };
+
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+    }
+
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index d40293d4ac..5a852c9234 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -47,286 +47,45 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
-void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
-void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, int, Stream&) { throw_no_cuda(); }
+Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int, int, InputArray, InputArray, Point, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createDerivFilter(int, int, int, int, int, bool, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createSobelFilter(int, int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createMorphologyFilter(int, int, InputArray, Point, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createBoxMinFilter(int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createRowSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createColumnSumFilter(int, int, int, int, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
 #else
 
 namespace
 {
-    inline void normalizeAnchor(int& anchor, int ksize)
+    void normalizeAnchor(int& anchor, int ksize)
     {
         if (anchor < 0)
             anchor = ksize >> 1;
 
-        CV_Assert(0 <= anchor && anchor < ksize);
+        CV_Assert( 0 <= anchor && anchor < ksize );
     }
 
-    inline void normalizeAnchor(Point& anchor, const Size& ksize)
+    void normalizeAnchor(Point& anchor, Size ksize)
     {
         normalizeAnchor(anchor.x, ksize.width);
         normalizeAnchor(anchor.y, ksize.height);
     }
-
-    inline void normalizeROI(Rect& roi, const Size& ksize, const Point& anchor, const Size& src_size)
-    {
-        if (roi == Rect(0,0,-1,-1))
-            roi = Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height);
-
-        CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
-    }
-
-    inline void normalizeKernel(const Mat& kernel, GpuMat& gpu_krnl, int type = CV_8U, int* nDivisor = 0, bool reverse = false)
-    {
-        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
-        if (nDivisor) *nDivisor = scale;
-
-        Mat temp(kernel.size(), type);
-        kernel.convertTo(temp, type, scale);
-        Mat cont_krnl = temp.reshape(1, 1);
-
-        if (reverse)
-        {
-            int count = cont_krnl.cols >> 1;
-            for (int i = 0; i < count; ++i)
-            {
-                std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-            }
-        }
-
-        gpu_krnl.upload(cont_krnl);
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-namespace
-{
-    struct Filter2DEngine_GPU : public FilterEngine_GPU
-    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
-            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
-        {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-        int srcType, dstType;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-
-namespace
-{
-    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
-    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf;
-        }
-
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_,
-                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf_;
-        }
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            ensureSizeIsEnough(src_size, bufType, *pbuf);
-
-            normalizeROI(roi, ksize, anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-            GpuMat bufROI = (*pbuf)(roi);
-
-            (*rowFilter)(srcROI, bufROI, stream);
-            (*columnFilter)(bufROI, dstROI, stream);
-        }
-
-        Ptr<BaseRowFilter_GPU> rowFilter;
-        Ptr<BaseColumnFilter_GPU> columnFilter;
-
-        int srcType, bufType, dstType;
-
-        Size ksize;
-        Point anchor;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// 1D Sum Filter
-
-namespace
-{
-    struct NppRowSumFilter : public BaseRowFilter_GPU
-    {
-        NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor)
-{
-    CV_Assert(srcType == CV_8UC1 && sumType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new NppRowSumFilter(ksize, anchor));
-}
-
-namespace
-{
-    struct NppColumnSumFilter : public BaseColumnFilter_GPU
-    {
-        NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor)
-{
-    CV_Assert(sumType == CV_8UC1 && dstType == CV_32FC1);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new NppColumnSumFilter(ksize, anchor));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -334,328 +93,83 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy
 
 namespace
 {
-    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPBoxFilter : public BaseFilter_GPU
+    class NPPBoxFilter : public Filter
     {
-        NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
+    public:
+        NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+    private:
+        typedef NppStatus (*nppFilterBox_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep,
+                                            NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor);
 
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterBox_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
-{
-    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPBoxFilter(ksize, anchor, nppFilterBox_callers[CV_MAT_CN(srcType)]));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor)
-{
-    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor);
-    return createFilter2D_GPU(boxFilter, srcType, dstType);
-}
-
-void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
-{
-    int sdepth = src.depth(), cn = src.channels();
-    if( ddepth < 0 )
-        ddepth = sdepth;
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
-
-    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
-    f->apply(src, dst, Rect(0,0,-1,-1), stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Morphology Filter
-
-namespace
-{
-    typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);
-
-    struct NPPMorphFilter : public BaseFilter_GPU
-    {
-        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        nppMorfFilter_t func;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
-{
-    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
-    {
-        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
-        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
+        Size ksize_;
+        Point anchor_;
+        int type_;
+        nppFilterBox_t func_;
+        int borderMode_;
+        Scalar borderVal_;
+        GpuMat srcBorder_;
     };
 
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
-    CV_Assert(type == CV_8UC1 || type == CV_8UC4);
-
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl);
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
-}
-
-namespace
-{
-    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
+    NPPBoxFilter::NPPBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal) :
+        ksize_(ksize), anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
     {
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf;
-        }
+        static const nppFilterBox_t funcs[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};
 
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
-            filter2D(filter2D_), type(type_), iters(iters_)
-        {
-            pbuf = &buf_;
-        }
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+        CV_Assert( dstType == srcType );
 
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == type);
+        normalizeAnchor(anchor_, ksize);
 
-            Size src_size = src.size();
-
-            dst.create(src_size, type);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            if (iters > 1)
-                pbuf->create(src_size, type);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-
-            for(int i = 1; i < iters; ++i)
-            {
-                dst.swap((*pbuf));
-
-                dstROI = dst(roi);
-                GpuMat bufROI = (*pbuf)(roi);
-
-                (*filter2D)(bufROI, dstROI, stream);
-            }
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-
-        int type;
-        int iters;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
-}
-
-namespace
-{
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
-    {
-        Mat kernel;
-        Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
-
-        normalizeAnchor(anchor, ksize);
-
-        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
-        {
-            src.copyTo(dst, stream);
-            return;
-        }
-
-        dst.create(src.size(), src.type());
-
-        if (!_kernel.data)
-        {
-            kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
-            anchor = Point(iterations, iterations);
-            iterations = 1;
-        }
-        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
-        {
-            anchor = Point(anchor.x * iterations, anchor.y * iterations);
-            kernel = getStructuringElement(MORPH_RECT,
-                                           Size(ksize.width + (iterations - 1) * (ksize.width - 1),
-                                                ksize.height + (iterations - 1) * (ksize.height - 1)),
-                                           anchor);
-            iterations = 1;
-        }
-        else
-            kernel = _kernel;
-
-        Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);
-
-        f->apply(src, dst, Rect(0,0,-1,-1), stream);
+        func_ = funcs[CV_MAT_CN(srcType)];
     }
 
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
+    void NPPBoxFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
     {
-        GpuMat buf;
-        morphOp(op, src, dst, _kernel, buf, anchor, iterations);
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream);
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows));
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream);
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                           dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor, int borderMode, Scalar borderVal)
 {
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
-}
+    if (dstType < 0)
+        dstType = srcType;
 
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
-}
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
 
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
-}
-
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
-{
-    GpuMat buf1;
-    GpuMat buf2;
-    morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
-}
-
-void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
-{
-    switch( op )
-    {
-    case MORPH_ERODE:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_DILATE:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_OPEN:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-    case MORPH_CLOSE:
-        dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
-        erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
-        break;
-
-#ifdef HAVE_OPENCV_GPUARITHM
-    case MORPH_GRADIENT:
-        erode(src, buf2, kernel, buf1, anchor, iterations, stream);
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(dst, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_TOPHAT:
-        erode(src, dst, kernel, buf1, anchor, iterations, stream);
-        dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(src, buf2, dst, GpuMat(), -1, stream);
-        break;
-
-    case MORPH_BLACKHAT:
-        dilate(src, dst, kernel, buf1, anchor, iterations, stream);
-        erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
-        gpu::subtract(buf2, src, dst, GpuMat(), -1, stream);
-        break;
-#endif
-
-    default:
-        CV_Error(cv::Error::StsBadArg, "unknown morphological operation");
-    }
+    return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -663,165 +177,127 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
 
 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream);
-    }
+    template <typename T, typename D> // declaration only; LinearFilter below stores instantiations of it in a function pointer
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, // LinearFilter::apply passes kernel_.ptr<float>() as kernel
+                  int kWidth, int kHeight, int anchorX, int anchorY, // LinearFilter::apply passes kernel_.cols/rows and anchor_.x/y
+                  int borderMode, const float* borderValue, cudaStream_t stream); // LinearFilter::apply passes borderVal_.val and the raw CUDA stream
 }}}
 
 namespace
 {
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
-
-    struct NPPLinearFilter : public BaseFilter_GPU
+    class LinearFilter : public Filter // 2D linear filter: kernel, anchor, border handling and the per-type function are fixed by the constructor
     {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+    public:
+        LinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal); // asserts on the type/border arguments and uploads the kernel as float
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); // asserts src has the construction-time type, then calls func_
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter2D_t func;
-    };
-
-    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+    private:
+        typedef void (*filter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, // pointer type matching the cudev::filter2D<T, D> declaration
+                                   int kWidth, int kHeight, int anchorX, int anchorY,
                                    int borderMode, const float* borderValue, cudaStream_t stream);
 
-    struct GpuFilter2D : public BaseFilter_GPU
-    {
-        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
-            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
-        {
-        }
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-        {
-            using namespace cv::gpu::cudev::imgproc;
-
-            Point ofs;
-            Size wholeSize;
-            src.locateROI(wholeSize, ofs);
-            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-
-            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
-            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
-        }
-
-        gpuFilter2D_t func;
-        GpuMat kernel;
-        int brd_type;
+        GpuMat kernel_; // CV_32FC1 copy of the kernel, uploaded by the constructor
+        Point anchor_; // anchor after the constructor's normalizeAnchor() call
+        int type_; // srcType given at construction; apply() asserts against it
+        filter2D_t func_; // cudev::filter2D instantiation chosen by the constructor's switch on srcType
+        int borderMode_; // one of the five BORDER_* modes the constructor accepts
+        Scalar_<float> borderVal_; // border value as floats; its val array is handed to func_
     };
-}
 
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    int sdepth = CV_MAT_DEPTH(srcType);
-    int scn = CV_MAT_CN(srcType);
-
-    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
-    CV_Assert(scn == 1 || scn == 4);
-    CV_Assert(dstType == srcType);
-    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
-
-    Size ksize = kernel.size();
-
-#if 0
-    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
+    LinearFilter::LinearFilter(int srcType, int dstType, InputArray _kernel, Point anchor, int borderMode, Scalar borderVal) : // validates arguments, uploads the kernel, selects func_
+        anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
     {
-        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int scn = CV_MAT_CN(srcType);
 
-        GpuMat gpu_krnl;
-        int nDivisor;
-        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
+        Mat kernel = _kernel.getMat();
 
-        normalizeAnchor(anchor, ksize);
+        CV_Assert( sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F ); // together with the channel check this leaves exactly the six types handled by the switch below
+        CV_Assert( scn == 1 || scn == 4 );
+        CV_Assert( dstType == srcType ); // no depth conversion: output type must equal input type
+        CV_Assert( kernel.channels() == 1 );
+        CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
 
-        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
-    }
-#endif
+        Mat kernel32F;
+        kernel.convertTo(kernel32F, CV_32F); // func_ takes the kernel as const float*, so convert whatever depth was supplied
 
-    CV_Assert(ksize.width * ksize.height <= 16 * 16);
+        kernel_ = gpu::createContinuous(kernel.size(), CV_32FC1); // apply() passes kernel_.ptr<float>() with cols/rows only, no step
+        kernel_.upload(kernel32F);
 
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl, CV_32F);
+        normalizeAnchor(anchor_, kernel.size()); // helper defined elsewhere; presumably resolves a (-1,-1) anchor to the kernel centre (confirm)
 
-    normalizeAnchor(anchor, ksize);
-
-    gpuFilter2D_t func = 0;
-
-    switch (srcType)
-    {
-    case CV_8UC1:
-        func = filter2D_gpu<uchar, uchar>;
-        break;
-    case CV_8UC4:
-        func = filter2D_gpu<uchar4, uchar4>;
-        break;
-    case CV_16UC1:
-        func = filter2D_gpu<ushort, ushort>;
-        break;
-    case CV_16UC4:
-        func = filter2D_gpu<ushort4, ushort4>;
-        break;
-    case CV_32FC1:
-        func = filter2D_gpu<float, float>;
-        break;
-    case CV_32FC4:
-        func = filter2D_gpu<float4, float4>;
-        break;
+        switch (srcType) // no default branch: the asserts above already reject every other type, so func_ is always assigned
+        {
+        case CV_8UC1:
+            func_ = cudev::filter2D<uchar, uchar>;
+            break;
+        case CV_8UC4:
+            func_ = cudev::filter2D<uchar4, uchar4>;
+            break;
+        case CV_16UC1:
+            func_ = cudev::filter2D<ushort, ushort>;
+            break;
+        case CV_16UC4:
+            func_ = cudev::filter2D<ushort4, ushort4>;
+            break;
+        case CV_32FC1:
+            func_ = cudev::filter2D<float, float>;
+            break;
+        case CV_32FC4:
+            func_ = cudev::filter2D<float4, float4>;
+            break;
+        }
     }
 
-    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, brd_type));
+    void LinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ ); // func_ was selected for exactly this type by the constructor
+        // Filters _src into _dst (created with the input's size and type) on _stream.
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+        // func_ receives the whole parent matrix plus the ROI offset rather than the ROI itself.
+        Point ofs;
+        Size wholeSize;
+        src.locateROI(wholeSize, ofs);
+        // Wrap the parent with its real row pitch: the default step would assume tightly packed rows.
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
+
+        func_(srcWhole, ofs.x, ofs.y, dst, kernel_.ptr<float>(),
+              kernel_.cols, kernel_.rows, anchor_.x, anchor_.y,
+              borderMode_, borderVal_.val, StreamAccessor::getStream(_stream));
+    }
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
+Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal) // factory wrapping the file-local LinearFilter class
 {
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
+    if (dstType < 0) // a negative dstType means "same as the source type"
+        dstType = srcType;
 
-    return createFilter2D_GPU(linearFilter, srcType, dstType);
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType)); // keep only the requested depth; the channel count always follows the source
+
+    return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal); // the constructor asserts dstType == srcType
 }
 
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter
+
+Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize, double scale, int borderMode, Scalar borderVal) // builds a 3x3 kernel and defers to createLinearFilter
 {
-    if (ddepth < 0)
-        ddepth = src.depth();
+    CV_Assert( ksize == 1 || ksize == 3 ); // only the two coefficient tables below are available
 
-    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
+    static const float K[2][9] = // row 0 is used for ksize == 1, row 1 for ksize == 3
+    {
+        {0.0f, 1.0f, 0.0f, 1.0f, -4.0f, 1.0f, 0.0f, 1.0f, 0.0f},
+        {2.0f, 0.0f, 2.0f, 0.0f, -8.0f, 0.0f, 2.0f, 0.0f, 2.0f}
+    };
 
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
+    Mat kernel = Mat(3, 3, CV_32FC1, (void*)K[ksize == 3]).clone(); // own a private copy: the in-place scaling below must never write into the shared const table
+    if (scale != 1)
+        kernel *= scale;
 
-    dst.create(src.size(), dst_type);
-
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -838,294 +314,130 @@ namespace filter
 
 namespace
 {
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
-        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-
-    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    struct NppLinearRowFilter : public BaseRowFilter_GPU
+    class SeparableLinearFilter : public Filter // two-pass filter: a row kernel followed by a column kernel, through an intermediate buffer
     {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+    public:
+        SeparableLinearFilter(int srcType, int dstType, // validates arguments, uploads both kernels, selects the two pass functions
+                              InputArray rowKernel, InputArray columnKernel,
+                              Point anchor, int rowBorderMode, int columnBorderMode);
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); // runs rowFilter_ into buf_, then columnFilter_ into dst
+
+    private:
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); // shared signature of filter::linearRow and filter::linearColumn
+
+        int srcType_, bufType_, dstType_; // bufType_ is set by the constructor to CV_32F with the source channel count
+        GpuMat rowKernel_, columnKernel_; // float kernels reshaped to a single row before upload
+        func_t rowFilter_, columnFilter_; // looked up by source depth and destination depth respectively
+        Point anchor_; // x applies to the row kernel, y to the column kernel
+        int rowBorderMode_, columnBorderMode_;
+
+        GpuMat buf_; // intermediate image between the two passes; a member, so it persists across apply() calls
+    };
+
+    SeparableLinearFilter::SeparableLinearFilter(int srcType, int dstType, // validates arguments, uploads both kernels, selects the row/column pass functions
+                                                 InputArray _rowKernel, InputArray _columnKernel,
+                                                 Point anchor, int rowBorderMode, int columnBorderMode) :
+        srcType_(srcType), dstType_(dstType), anchor_(anchor), rowBorderMode_(rowBorderMode), columnBorderMode_(columnBorderMode)
+    {
+        static const func_t rowFilterFuncs[7][4] = // indexed [source depth][channels - 1]; 0 marks an unsupported combination
         {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+            {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
+            {0, 0, 0, 0},
+            {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
+            {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
+            {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
+            {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+            {0, 0, 0, 0}
+        };
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearRowFilter : public BaseRowFilter_GPU
-    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
+        static const func_t columnFilterFuncs[7][4] = // indexed [destination depth][channels - 1]; 0 marks an unsupported combination
         {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
+            {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+            {0, 0, 0, 0},
+            {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
+            {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+            {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
+            {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+            {0, 0, 0, 0}
+        };
 
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int cn = CV_MAT_CN(srcType);
+        const int ddepth = CV_MAT_DEPTH(dstType);
 
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        {0, 0, 0, 0},
-        {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-        {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
-        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
-    };
+        Mat rowKernel = _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.getMat();
 
-    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
+        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F && cn <= 4 ); // both depths index 7-row tables below, so bound both before any lookup
+        CV_Assert( rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
+        CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
 
-        GpuMat gpu_row_krnl;
-        int nDivisor;
-        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
+        Mat kernel32F; // scratch matrix reused for both float conversions
 
-        const int ksize = gpu_row_krnl.cols;
-        normalizeAnchor(anchor, ksize);
+        rowKernel.convertTo(kernel32F, CV_32F);
+        rowKernel_.upload(kernel32F.reshape(1, 1)); // stored as a single row, so .cols is the kernel length
 
-        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, npp_funcs[CV_MAT_CN(srcType)]));
+        columnKernel.convertTo(kernel32F, CV_32F);
+        columnKernel_.upload(kernel32F.reshape(1, 1)); // the column kernel is flattened to a single row as well
+
+        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
+        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
+
+        normalizeAnchor(anchor_.x, rowKernel_.cols);
+        normalizeAnchor(anchor_.y, columnKernel_.cols);
+
+        bufType_ = CV_MAKE_TYPE(CV_32F, cn); // the intermediate image between the passes is always float
+
+        rowFilter_ = rowFilterFuncs[sdepth][cn - 1];
+        CV_Assert( rowFilter_ != 0 );
+
+        columnFilter_ = columnFilterFuncs[ddepth][cn - 1];
+        CV_Assert( columnFilter_ != 0 );
     }
 
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int sdepth = CV_MAT_DEPTH(srcType);
-    const int cn = CV_MAT_CN(srcType);
-    CV_Assert( sdepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    const gpuFilter1D_t func = funcs[sdepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_row_krnl;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);
-
-    const int ksize = gpu_row_krnl.cols;
-    CV_Assert( ksize > 0 && ksize <= 32 );
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, borderType));
-}
-
-namespace
-{
-    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
+    void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) // row pass from src into buf_, then column pass from buf_ into dst
     {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ ); // rowFilter_ was selected for this source type by the constructor
 
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
+        _dst.create(src.size(), dstType_); // the output uses the destination type fixed at construction
+        GpuMat dst = _dst.getGpuMat();
 
-            cudaStream_t stream = StreamAccessor::getStream(s);
+        ensureSizeIsEnough(src.size(), bufType_, buf_); // buf_ is a member; presumably reallocated only when too small (confirm against ensureSizeIsEnough)
 
-            NppStreamHandler h(stream);
+        DeviceInfo devInfo; // default-constructed; presumably describes the current device (confirm)
+        const int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion(); // version encoded as major*10 + minor, forwarded to both passes
 
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
+        cudaStream_t stream = StreamAccessor::getStream(_stream); // both passes are issued on the same stream
 
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            if (ksize > 16 && cc < 20)
-                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");
-
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
-        {0, 0, 0, 0},
-        {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
-        {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
-        {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
-        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
-    };
-
-    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_col_krnl;
-        int nDivisor;
-        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_col_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, npp_funcs[CV_MAT_CN(bufType)]));
+        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream); // first pass: source type -> float buffer
+        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream); // second pass: float buffer -> destination type
     }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int ddepth = CV_MAT_DEPTH(dstType);
-    const int cn = CV_MAT_CN(dstType);
-    CV_Assert( ddepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    gpuFilter1D_t func = funcs[ddepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_col_krnl;
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);
-
-    const int ksize = gpu_col_krnl.cols;
-    CV_Assert(ksize > 0 && ksize <= 32);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, borderType));
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor, int rowBorderMode, int columnBorderMode) // factory wrapping the file-local SeparableLinearFilter class
 {
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
+    if (dstType < 0) // a negative dstType means "same as the source type"
+        dstType = srcType;
 
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType)); // keep only the requested depth; the channel count always follows the source
 
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
+    if (columnBorderMode < 0) // a negative column border mode means "reuse the row border mode"
+        columnBorderMode = rowBorderMode;
 
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                          Point anchor, int rowBorderType, int columnBorderType)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                          Point anchor, int rowBorderType, int columnBorderType,
-                          Stream& stream)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    return new SeparableLinearFilter(srcType, dstType, rowKernel, columnKernel, anchor, rowBorderMode, columnBorderMode); // remaining validation happens in the constructor
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize, double scale, int rowBorderMode, int columnBorderMode)
 {
     Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
+    getDerivKernels(kx, ky, dx, dy, ksize, normalize, CV_32F);
 
     if (scale != 1)
     {
@@ -1137,55 +449,25 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
             ky *= scale;
     }
 
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
+    return gpu::createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1, -1), rowBorderMode, columnBorderMode);
 }
 
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize, double scale, int rowBorderMode, int columnBorderMode) // thin wrapper over createDerivFilter
 {
-    GpuMat buf;
-    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, ksize, false, scale, rowBorderMode, columnBorderMode); // 'false' is createDerivFilter's normalize argument
 }
 
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+Ptr<Filter> cv::gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale, int rowBorderMode, int columnBorderMode) // thin wrapper over createDerivFilter with a fixed ksize
 {
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-
-    if( scale != 1 )
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    CV_Assert(ksize == 1 || ksize == 3);
-
-    static const int K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
-    if (scale != 1)
-        kernel *= scale;
-
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, -1, false, scale, rowBorderMode, columnBorderMode); // ksize -1 is forwarded to getDerivKernels; presumably its Scharr selector (confirm), normalize is false
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2, int rowBorderMode, int columnBorderMode)
 {
-    int depth = CV_MAT_DEPTH(type);
+    const int depth = CV_MAT_DEPTH(srcType);
 
     if (sigma2 <= 0)
         sigma2 = sigma1;
@@ -1201,70 +483,298 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
     sigma1 = std::max(sigma1, 0.0);
     sigma2 = std::max(sigma2, 0.0);
 
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
+    Mat kx = getGaussianKernel(ksize.width, sigma1, CV_32F);
     Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
         ky = kx;
     else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
+        ky = getGaussianKernel(ksize.height, sigma2, CV_32F);
 
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
+    return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+namespace
 {
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    if (ksize.width == 1 && ksize.height == 1)
+    class MorphologyFilter : public Filter // erosion or dilation backed by NPP; the constructor accepts only CV_8UC1 and CV_8UC4
     {
-        src.copyTo(dst);
-        return;
+    public:
+        MorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppMorfFilter_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             const Npp8u* pMask, NppiSize oMaskSize, NppiPoint oAnchor); // common signature of the nppiErode_8u_* / nppiDilate_8u_* entry points used here
+
+        int type_; // source type fixed at construction; apply() asserts that its input matches
+        GpuMat kernel_; // structuring element uploaded as a CV_8UC1 device matrix
+        Point anchor_; // anchor passed to NPP, after normalization and any iteration folding
+        int iters_; // number of passes apply() performs; reset to 1 when iterations are folded into the kernel
+        nppMorfFilter_t func_; // NPP routine selected from op and the channel count of srcType
+
+        GpuMat srcBorder_; // padded copy of the input, kept as a member so apply() can reuse it
+        GpuMat buf_; // scratch image for the second and later passes (touched only when iters_ > 1)
+    };
+
+    MorphologyFilter::MorphologyFilter(int op, int srcType, InputArray _kernel, Point anchor, int iterations) : // validates op/type, prepares kernel and anchor, picks the NPP routine
+        type_(srcType), anchor_(anchor), iters_(iterations)
+    {
+        static const nppMorfFilter_t funcs[2][5] = // indexed [op][channel count]; zero entries mark unsupported channel counts
+        {
+            {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
+            {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
+        };
+
+        CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE );
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 );
+
+        Mat kernel = _kernel.getMat();
+        Size ksize = !kernel.empty() ? _kernel.size() : Size(3, 3); // an empty kernel is treated as 3x3 for anchor normalization
+
+        normalizeAnchor(anchor_, ksize);
+
+        if (kernel.empty()) // no kernel supplied: a single pass with a (2*iterations+1)-square rectangle replaces all iterations
+        {
+            kernel = getStructuringElement(MORPH_RECT, Size(1 + iters_ * 2, 1 + iters_ * 2));
+            anchor_ = Point(iters_, iters_);
+            iters_ = 1;
+        }
+        else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total()) // kernel has no zero entries: fold the iterations into one larger rectangle
+        {
+            anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
+            kernel = getStructuringElement(MORPH_RECT,
+                                           Size(ksize.width + (iters_ - 1) * (ksize.width - 1),
+                                                ksize.height + (iters_ - 1) * (ksize.height - 1)),
+                                           anchor_);
+            iters_ = 1;
+        }
+
+        CV_Assert( kernel.channels() == 1 );
+
+        Mat kernel8U;
+        kernel.convertTo(kernel8U, CV_8U); // the NPP mask argument is Npp8u, so the kernel is converted to 8-bit
+
+        kernel_ = gpu::createContinuous(kernel.size(), CV_8UC1);
+        kernel_.upload(kernel8U);
+
+        func_ = funcs[op][CV_MAT_CN(srcType)]; // assumes MORPH_ERODE == 0 and MORPH_DILATE == 1 - TODO confirm against the enum
     }
 
-    dst.create(src.size(), src.type());
+    void MorphologyFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) // runs func_ iters_ times; _dst gets the size and type of _src
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
 
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
+        Size ksize = kernel_.size();
+        gpu::copyMakeBorder(src, srcBorder_, ksize.height, ksize.height, ksize.width, ksize.width, BORDER_DEFAULT, Scalar(), _stream); // pad by a full kernel size on every side
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize.width, ksize.height, src.cols, src.rows)); // view of the original image area inside the padded copy
+
+        GpuMat bufRoi;
+        if (iters_ > 1) // scratch storage is needed only for the second and later passes
+        {
+            ensureSizeIsEnough(srcBorder_.size(), type_, buf_);
+            buf_.setTo(Scalar::all(0), _stream); // the scratch border is zero-filled, unlike the BORDER_DEFAULT padding of srcBorder_
+            bufRoi = buf_(Rect(ksize.width, ksize.height, src.cols, src.rows));
+        }
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream); // presumably binds NPP to this stream for the enclosing scope - TODO confirm in NppStreamHandler
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize.height;
+        oMaskSize.width = ksize.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) ); // first pass: padded source -> dst
+
+        for(int i = 1; i < iters_; ++i) // remaining passes feed the previous result back through the scratch image
+        {
+            dst.copyTo(bufRoi, _stream);
+
+            nppSafeCall( func_(bufRoi.ptr<Npp8u>(), static_cast<int>(bufRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                               oSizeROI, kernel_.ptr<Npp8u>(), oMaskSize, oAnchor) );
+        }
+
+        if (stream == 0) // stream handle 0: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
+namespace
 {
-    if (ksize.width == 1 && ksize.height == 1)
+    class MorphologyExFilter : public Filter // shared base of the compound operations: owns one erode filter, one dilate filter and a scratch image
     {
-        src.copyTo(dst);
-        return;
+    public:
+        MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+    protected:
+        Ptr<gpu::Filter> erodeFilter_, dilateFilter_; // both built from the same kernel, anchor and iteration count
+        GpuMat buf_; // intermediate image shared by the derived apply() implementations
+    };
+
+    MorphologyExFilter::MorphologyExFilter(int srcType, InputArray kernel, Point anchor, int iterations) // argument checks happen inside createMorphologyFilter / MorphologyFilter
+    {
+        erodeFilter_ = gpu::createMorphologyFilter(MORPH_ERODE, srcType, kernel, anchor, iterations);
+        dilateFilter_ = gpu::createMorphologyFilter(MORPH_DILATE, srcType, kernel, anchor, iterations);
     }
 
-    dst.create(src.size(), src.type());
+    // MORPH_OPEN
 
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
+    class MorphologyOpenFilter : public MorphologyExFilter // opening: erosion followed by dilation
+    {
+    public:
+        MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyOpenFilter::MorphologyOpenFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations) // the base class builds the erode/dilate pair
+    {
+    }
+
+    void MorphologyOpenFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream); // intermediate result is kept in buf_
+        dilateFilter_->apply(buf_, dst, stream); // dst = dilate(erode(src))
+    }
+
+    // MORPH_CLOSE
+
+    class MorphologyCloseFilter : public MorphologyExFilter // closing: dilation followed by erosion
+    {
+    public:
+        MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyCloseFilter::MorphologyCloseFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations) // the base class builds the erode/dilate pair
+    {
+    }
+
+    void MorphologyCloseFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, buf_, stream); // intermediate result is kept in buf_
+        erodeFilter_->apply(buf_, dst, stream); // dst = erode(dilate(src))
+    }
+
+    // MORPH_GRADIENT
+
+    class MorphologyGradientFilter : public MorphologyExFilter // morphological gradient: dilation minus erosion
+    {
+    public:
+        MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyGradientFilter::MorphologyGradientFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations) // the base class builds the erode/dilate pair
+    {
+    }
+
+    void MorphologyGradientFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream); // erosion goes to the scratch image, so src is read before dst is written
+        dilateFilter_->apply(src, dst, stream);
+        gpu::subtract(dst, buf_, dst, noArray(), -1, stream); // dst = dilate(src) - erode(src), computed in place
+    }
+
+    // MORPH_TOPHAT
+
+    class MorphologyTophatFilter : public MorphologyExFilter // top-hat: source minus its opening
+    {
+    public:
+        MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyTophatFilter::MorphologyTophatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations) // the base class builds the erode/dilate pair
+    {
+    }
+
+    void MorphologyTophatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        erodeFilter_->apply(src, buf_, stream); // stage in buf_, not dst: dst may alias src, which the final subtract still has to read
+        dilateFilter_->apply(buf_, buf_, stream); // opening; in place is safe because MorphologyFilter::apply copies its input into srcBorder_ before writing
+        gpu::subtract(src, buf_, dst, noArray(), -1, stream); // dst = src - open(src)
+    }
+
+    // MORPH_BLACKHAT
+
+    class MorphologyBlackhatFilter : public MorphologyExFilter // black-hat: closing minus the source
+    {
+    public:
+        MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+    };
+
+    MorphologyBlackhatFilter::MorphologyBlackhatFilter(int srcType, InputArray kernel, Point anchor, int iterations) :
+        MorphologyExFilter(srcType, kernel, anchor, iterations) // the base class builds the erode/dilate pair
+    {
+    }
+
+    void MorphologyBlackhatFilter::apply(InputArray src, OutputArray dst, Stream& stream)
+    {
+        dilateFilter_->apply(src, buf_, stream); // stage in buf_, not dst: dst may alias src, which the final subtract still has to read
+        erodeFilter_->apply(buf_, buf_, stream); // closing; in place is safe because MorphologyFilter::apply copies its input into srcBorder_ before writing
+        gpu::subtract(buf_, src, dst, noArray(), -1, stream); // dst = close(src) - src
+    }
+}
+
+// Factory for morphology filters; op selects the implementation.
+//   MORPH_ERODE / MORPH_DILATE -> a single MorphologyFilter (asserts srcType is CV_8UC1 or CV_8UC4)
+//   MORPH_OPEN / CLOSE / GRADIENT / TOPHAT / BLACKHAT -> wrappers combining an erode and a dilate filter
+//   any other op -> CV_Error(Error::StsBadArg)
+// kernel, anchor and iterations are forwarded unchanged to the chosen filter class.
+// Every case returns, so no break statements are needed.
+Ptr<Filter> cv::gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor, int iterations)
+{
+    switch( op )
+    {
+    case MORPH_ERODE:
+    case MORPH_DILATE:
+        return new MorphologyFilter(op, srcType, kernel, anchor, iterations);
+
+    case MORPH_OPEN:
+        return new MorphologyOpenFilter(srcType, kernel, anchor, iterations);
+
+    case MORPH_CLOSE:
+        return new MorphologyCloseFilter(srcType, kernel, anchor, iterations);
+
+    case MORPH_GRADIENT:
+        return new MorphologyGradientFilter(srcType, kernel, anchor, iterations);
+
+    case MORPH_TOPHAT:
+        return new MorphologyTophatFilter(srcType, kernel, anchor, iterations);
+
+    case MORPH_BLACKHAT:
+        return new MorphologyBlackhatFilter(srcType, kernel, anchor, iterations);
+
+    default:
+        CV_Error(Error::StsBadArg, "Unknown morphological operation");
+        return Ptr<Filter>();
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1272,59 +782,217 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& b
 
 namespace
 {
-    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        NppiSize oMaskSize, NppiPoint oAnchor);
-
-    struct NPPRankFilter : public BaseFilter_GPU
+    enum
     {
-        NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        nppFilterRank_t func;
+        RANK_MAX, // NPPRankFilter picks an nppiFilterMax_8u_* routine
+        RANK_MIN // any other value makes NPPRankFilter pick an nppiFilterMin_8u_* routine
     };
+
+    class NPPRankFilter : public Filter // box max/min filter backed by NPP; the constructor accepts only CV_8UC1 and CV_8UC4
+    {
+    public:
+        NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef NppStatus (*nppFilterRank_t)(const Npp8u* pSrc, Npp32s nSrcStep, Npp8u* pDst, Npp32s nDstStep, NppiSize oSizeROI,
+                                             NppiSize oMaskSize, NppiPoint oAnchor); // common signature of the nppiFilterMax_8u_* / nppiFilterMin_8u_* routines used here
+
+        int type_; // source type fixed at construction; apply() asserts that its input matches
+        Size ksize_; // window size, also used as the padding width on each side
+        Point anchor_; // anchor passed to NPP after normalizeAnchor
+        int borderMode_; // border mode forwarded to gpu::copyMakeBorder
+        Scalar borderVal_; // border value forwarded to gpu::copyMakeBorder
+        nppFilterRank_t func_; // NPP routine selected from op and the channel count of srcType
+
+        GpuMat srcBorder_; // padded copy of the input, kept as a member so apply() can reuse it
+    };
+
+    NPPRankFilter::NPPRankFilter(int op, int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) : // stores the parameters and picks the NPP routine
+        type_(srcType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        static const nppFilterRank_t maxFuncs[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R}; // indexed by channel count
+        static const nppFilterRank_t minFuncs[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R}; // indexed by channel count
+
+        CV_Assert( srcType == CV_8UC1 || srcType == CV_8UC4 ); // keeps the table lookups below on non-null entries
+
+        normalizeAnchor(anchor_, ksize_);
+
+        if (op == RANK_MAX)
+            func_ = maxFuncs[CV_MAT_CN(srcType)];
+        else // every op other than RANK_MAX is treated as a minimum filter
+            func_ = minFuncs[CV_MAT_CN(srcType)];
+    }
+
+    void NPPRankFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) // one NPP max/min pass; _dst gets the size and type of _src
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_.height, ksize_.height, ksize_.width, ksize_.width, borderMode_, borderVal_, _stream); // pad by a full window size on every side
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_.width, ksize_.height, src.cols, src.rows)); // view of the original image area inside the padded copy
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream); // presumably binds NPP to this stream for the enclosing scope - TODO confirm in NppStreamHandler
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        NppiSize oMaskSize;
+        oMaskSize.height = ksize_.height;
+        oMaskSize.width = ksize_.width;
+
+        NppiPoint oAnchor;
+        oAnchor.x = anchor_.x;
+        oAnchor.y = anchor_.y;
+
+        nppSafeCall( func_(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step),
+                           oSizeROI, oMaskSize, oAnchor) );
+
+        if (stream == 0) // stream handle 0: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+Ptr<Filter> cv::gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) // factory for the maximum variant of NPPRankFilter
 {
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};
-
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
+    return new NPPRankFilter(RANK_MAX, srcType, ksize, anchor, borderMode, borderVal); // type check and anchor normalization happen in the NPPRankFilter constructor
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor)
+Ptr<Filter> cv::gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor, int borderMode, Scalar borderVal) // factory for the minimum variant of NPPRankFilter
 {
-    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};
+    return new NPPRankFilter(RANK_MIN, srcType, ksize, anchor, borderMode, borderVal); // type check and anchor normalization happen in the NPPRankFilter constructor
+}
 
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
 
-    normalizeAnchor(anchor, ksize);
+namespace
+{
+    class NppRowSumFilter : public Filter // horizontal window sum via nppiSumWindowRow_8u32f_C1R; CV_8UC1 in, CV_32FC1 out
+    {
+    public:
+        NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
 
-    return Ptr<BaseFilter_GPU>(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)]));
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int srcType_, dstType_; // asserted in the constructor to be CV_8UC1 and CV_32FC1
+        int ksize_; // window length, also used as the left/right padding width
+        int anchor_; // anchor passed to NPP after normalizeAnchor
+        int borderMode_; // border mode forwarded to gpu::copyMakeBorder
+        Scalar borderVal_; // border value forwarded to gpu::copyMakeBorder
+
+        GpuMat srcBorder_; // padded copy of the input, kept as a member so apply() can reuse it
+    };
+
+    NppRowSumFilter::NppRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) : // stores the parameters, checks the types, normalizes the anchor
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 ); // the only input type the NPP call in apply() handles
+        CV_Assert( dstType_ == CV_32FC1 ); // the only output type the NPP call in apply() produces
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppRowSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) // _dst gets the size of _src and type dstType_
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, 0, 0, ksize_, ksize_, borderMode_, borderVal_, _stream); // pad left and right only; rows are untouched
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(ksize_, 0, src.cols, src.rows)); // view of the original image area inside the padded copy
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream); // presumably binds NPP to this stream for the enclosing scope - TODO confirm in NppStreamHandler
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowRow_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0) // stream handle 0: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) // factory for NppRowSumFilter
+{
+    return new NppRowSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal); // the constructor asserts srcType == CV_8UC1 and dstType == CV_32FC1
+}
+
+namespace
+{
+    class NppColumnSumFilter : public Filter // vertical window sum via nppiSumWindowColumn_8u32f_C1R; CV_8UC1 in, CV_32FC1 out
+    {
+    public:
+        NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        int srcType_, dstType_; // asserted in the constructor to be CV_8UC1 and CV_32FC1
+        int ksize_; // window length, also used as the top/bottom padding height
+        int anchor_; // anchor passed to NPP after normalizeAnchor
+        int borderMode_; // border mode forwarded to gpu::copyMakeBorder
+        Scalar borderVal_; // border value forwarded to gpu::copyMakeBorder
+
+        GpuMat srcBorder_; // padded copy of the input, kept as a member so apply() can reuse it
+    };
+
+    NppColumnSumFilter::NppColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) : // stores the parameters, checks the types, normalizes the anchor
+        srcType_(srcType), dstType_(dstType), ksize_(ksize), anchor_(anchor), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        CV_Assert( srcType_ == CV_8UC1 ); // the only input type the NPP call in apply() handles
+        CV_Assert( dstType_ == CV_32FC1 ); // the only output type the NPP call in apply() produces
+
+        normalizeAnchor(anchor_, ksize_);
+    }
+
+    void NppColumnSumFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream) // _dst gets the size of _src and type dstType_
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        gpu::copyMakeBorder(src, srcBorder_, ksize_, ksize_, 0, 0, borderMode_, borderVal_, _stream); // pad top and bottom only; columns are untouched
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        GpuMat srcRoi = srcBorder_(Rect(0, ksize_, src.cols, src.rows)); // view of the original image area inside the padded copy
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+        NppStreamHandler h(stream); // presumably binds NPP to this stream for the enclosing scope - TODO confirm in NppStreamHandler
+
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+
+        nppSafeCall( nppiSumWindowColumn_8u32f_C1R(srcRoi.ptr<Npp8u>(), static_cast<int>(srcRoi.step),
+                                                   dst.ptr<Npp32f>(), static_cast<int>(dst.step),
+                                                   oSizeROI, ksize_, anchor_) );
+
+        if (stream == 0) // stream handle 0: wait for the device before returning
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+Ptr<Filter> cv::gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor, int borderMode, Scalar borderVal) // factory for NppColumnSumFilter
+{
+    return new NppColumnSumFilter(srcType, dstType, ksize, anchor, borderMode, borderVal); // the constructor asserts srcType == CV_8UC1 and dstType == CV_32FC1
 }
 
 #endif
diff --git a/modules/gpufilters/src/precomp.hpp b/modules/gpufilters/src/precomp.hpp
index 3add0f2af1..c3d5e020d2 100644
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@@ -46,14 +46,9 @@
 #include <limits>
 
 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 
 #include "opencv2/core/private.gpu.hpp"
 
-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_GPUARITHM
-#  include "opencv2/gpuarithm.hpp"
-#endif
-
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 5adcd87a41..03bea05e6d 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -70,13 +70,14 @@ namespace
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Blur
 
-PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
     int type;
     cv::Size ksize;
     cv::Point anchor;
+    int borderType;
     bool useRoi;
 
     virtual void SetUp()
@@ -86,7 +87,8 @@ PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, Use
         type = GET_PARAM(2);
         ksize = GET_PARAM(3);
         anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -96,13 +98,15 @@ GPU_TEST_P(Blur, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(src.type(), -1, ksize, anchor, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::blur(loadMat(src, useRoi), dst, ksize, anchor);
+    blurFilter->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
-    cv::blur(src, dst_gold, ksize, anchor);
+    cv::blur(src, dst_gold, ksize, anchor, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
@@ -111,6 +115,173 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
     testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi) // fixture for the createLinearFilter accuracy test
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp() // unpacks the parameter tuple in declaration order and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Filter2D, Accuracy) // GPU linear filter must match cv::filter2D within a depth-dependent tolerance
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0); // random float kernel, generated with bounds 0.0 and 1.0
+
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType); // CPU reference with the same kernel, anchor and border
+
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0); // whole image is compared, borders included
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine( // argument order must match the PARAM_TEST_CASE(Filter2D, ...) tuple
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi) // fixture for the createLaplacianFilter accuracy test
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize; // the test uses only ksize.width, as the aperture size
+    bool useRoi;
+
+    virtual void SetUp() // unpacks the parameter tuple in declaration order and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Laplacian, Accuracy) // GPU Laplacian must match cv::Laplacian: exactly below CV_32F depth, within 1e-3 otherwise
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(src.type(), -1, ksize.width);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    laplacian->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width); // CPU reference with the same aperture
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine( // argument order must match the PARAM_TEST_CASE(Laplacian, ...) tuple
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))), // only the width (1 or 3) reaches the filter
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter
+
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi) // fixture for the createSeparableLinearFilter accuracy test
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    int type; // derived in SetUp from depth and cn; not a test parameter itself
+
+    virtual void SetUp() // unpacks the parameter tuple in declaration order and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+GPU_TEST_P(SeparableLinearFilter, Accuracy) // GPU separable filter must match cv::sepFilter2D within a depth-dependent tolerance
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(cv::Size(ksize.width, 1), CV_32FC1, 0.0, 1.0); // cv:: spelled out like every other Size in this file section
+    cv::Mat columnKernel = randomMat(cv::Size(ksize.height, 1), CV_32FC1, 0.0, 1.0); // column kernel is also created as a single-row matrix
+
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType); // CPU reference with the same kernels, anchor and border
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2); // whole image is compared, borders included
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine( // argument order must match the PARAM_TEST_CASE(SeparableLinearFilter, ...) tuple
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)), // non-square sizes give row and column kernels of different lengths
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@@ -218,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@@ -277,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
     double sigma1 = randomDouble(0.1, 1.0);
     double sigma2 = randomDouble(0.1, 1.0);
 
-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);
 
-        cv::Mat dst_gold;
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    gauss->apply(loadMat(src, useRoi), dst);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
-    }
+    cv::Mat dst_gold;
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@@ -327,49 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
@@ -400,8 +519,10 @@ GPU_TEST_P(Erode, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::erode(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    erode->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::erode(src, dst_gold, kernel, anchor, iterations);
@@ -449,8 +570,10 @@ GPU_TEST_P(Dilate, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::dilate(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    dilate->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::dilate(src, dst_gold, kernel, anchor, iterations);
@@ -502,8 +625,10 @@ GPU_TEST_P(MorphEx, Accuracy)
     cv::Mat src = randomMat(size, type);
     cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);
 
+    cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), kernel, anchor, iterations);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::morphologyEx(loadMat(src, useRoi), dst, morphOp, kernel, anchor, iterations);
+    morph->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::morphologyEx(src, dst_gold, morphOp, kernel, anchor, iterations);
@@ -522,56 +647,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
     testing::Values(Iterations(1), Iterations(2), Iterations(3)),
     WHOLE_SUBMAT));
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index cf1b8e6706..3fe9f82f4c 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
     GpuMat mag;
     GpuMat map;
     GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };
 
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
diff --git a/modules/gpuimgproc/src/canny.cpp b/modules/gpuimgproc/src/canny.cpp
index 8d361fe50f..9a33575648 100644
--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 
         if (apperture_size != 3)
         {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
         }
     }
 
@@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
     }
     else
     {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);
 
         calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
diff --git a/modules/gpuimgproc/src/corners.cpp b/modules/gpuimgproc/src/corners.cpp
index 44dc1505d4..824a3308ee 100644
--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@@ -70,6 +70,8 @@ namespace
 {
     void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
     {
+        (void) buf;
+
         double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
 
         if (ksize < 0)
@@ -83,16 +85,21 @@ namespace
         Dx.create(src.size(), CV_32F);
         Dy.create(src.size(), CV_32F);
 
+        Ptr<gpu::Filter> filterDx, filterDy;
+
         if (ksize > 0)
         {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
         }
         else
         {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
         }
+
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
     }
 }
 
diff --git a/modules/gpuimgproc/src/hough.cpp b/modules/gpuimgproc/src/hough.cpp
index bc0a8a400d..15e5297623 100644
--- a/modules/gpuimgproc/src/hough.cpp
+++ b/modules/gpuimgproc/src/hough.cpp
@@ -761,7 +761,7 @@ namespace
         {
             buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
                             r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            min(r_sizes, maxSize, r_sizes);
+            gpu::min(r_sizes, maxSize, r_sizes);
         }
     }
 
diff --git a/modules/gpuimgproc/src/match_template.cpp b/modules/gpuimgproc/src/match_template.cpp
index 008d3da1ce..059d41ca9f 100644
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -172,15 +172,16 @@ namespace
             return;
         }
 
-        gpu::ConvolveBuf convolve_buf;
-        convolve_buf.user_block_size = buf.user_block_size;
+        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
 
         if (image.channels() == 1)
-            gpu::convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
+        {
+            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+        }
         else
         {
             GpuMat result_;
-            gpu::convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
+            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
             extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
         }
     }
@@ -268,7 +269,7 @@ namespace
             buf.image_sums.resize(1);
             gpu::integral(image, buf.image_sums[0], stream);
 
-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
             matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
         }
         else
diff --git a/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp b/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp
index e993c64087..cb84c23ad5 100644
--- a/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp
+++ b/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp
@@ -126,7 +126,7 @@ typedef                int Ncv32s;
 typedef       unsigned int Ncv32u;
 typedef              short Ncv16s;
 typedef     unsigned short Ncv16u;
-typedef               char Ncv8s;
+typedef        signed char Ncv8s;
 typedef      unsigned char Ncv8u;
 typedef              float Ncv32f;
 typedef             double Ncv64f;
diff --git a/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp b/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp
index 6409fab941..5cf902a9f4 100644
--- a/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp
+++ b/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp
@@ -51,7 +51,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMaxVal<Ncv8u>()  {return UCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return  UINT_MAX;}
-template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  CHAR_MAX;}
+template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  SCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return  SHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return   INT_MAX;}
 template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return   FLT_MAX;}
@@ -61,7 +61,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMinVal<Ncv8u>()  {return 0;}
 template<> static inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
 template<> static inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
-template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return CHAR_MIN;}
+template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return SCHAR_MIN;}
 template<> static inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
 template<> static inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
 template<> static inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
diff --git a/modules/gpuoptflow/test/test_optflow.cpp b/modules/gpuoptflow/test/test_optflow.cpp
index c20260e197..fce07551dc 100644
--- a/modules/gpuoptflow/test/test_optflow.cpp
+++ b/modules/gpuoptflow/test/test_optflow.cpp
@@ -80,7 +80,7 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
     brox(loadMat(frame0), loadMat(frame1), u, v);
 
     std::string fname(cvtest::TS::ptr()->get_data_path());
-    if (devInfo.major() >= 2)
+    if (devInfo.majorVersion() >= 2)
         fname += "opticalflow/brox_optical_flow_cc20.bin";
     else
         fname += "opticalflow/brox_optical_flow.bin";
diff --git a/modules/gpustereo/doc/stereo.rst b/modules/gpustereo/doc/stereo.rst
index cd2add0b94..4064fe0a3e 100644
--- a/modules/gpustereo/doc/stereo.rst
+++ b/modules/gpustereo/doc/stereo.rst
@@ -5,135 +5,75 @@ Stereo Correspondence
 
 
 
-gpu::StereoBM_GPU
------------------
-.. ocv:class:: gpu::StereoBM_GPU
+gpu::StereoBM
+-------------
+.. ocv:class:: gpu::StereoBM : public cv::StereoBM
 
 Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
 
-    class StereoBM_GPU
-    {
-    public:
-        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
-
-        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-        StereoBM_GPU();
-        StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP,
-                     int winSize = DEFAULT_WINSZ);
-
-        void operator() (const GpuMat& left, const GpuMat& right,
-                         GpuMat& disparity, Stream& stream = Stream::Null());
-
-        static bool checkIfGpuCallReasonable();
-
-        int preset;
-        int ndisp;
-        int winSize;
-
-        float avergeTexThreshold;
-
-        ...
-    };
-
-
-The class also performs pre- and post-filtering steps: Sobel pre-filtering (if ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``averageTexThreshols > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
-
-.. math::
-    \sum HorizontalGradiensInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
-
-This means that the input left image is low textured.
+.. seealso:: :ocv:class:`StereoBM`
 
 
 
-gpu::StereoBM_GPU::StereoBM_GPU
------------------------------------
-Enables :ocv:class:`gpu::StereoBM_GPU` constructors.
+gpu::createStereoBM
+-------------------
+Creates a StereoBM object.
 
-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU()
+.. ocv:function:: Ptr<gpu::StereoBM> gpu::createStereoBM(int numDisparities = 64, int blockSize = 19)
 
-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
+    :param numDisparities: the disparity search range. For each pixel, the algorithm will find the best disparity from 0 (default minimum disparity) to ``numDisparities``. The search range can then be shifted by changing the minimum disparity.
 
-    :param preset: Parameter presetting:
-
-        * **BASIC_PRESET** Basic mode without pre-processing.
-
-        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
-
-    :param ndisparities: Number of disparities. It must be a multiple of 8 and less or equal to 256.
-
-    :param winSize: Block size.
-
-
-
-gpu::StereoBM_GPU::operator ()
-----------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoBM_GPU::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. Only  ``CV_8UC1``  type is supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::StereoBM_GPU::checkIfGpuCallReasonable
------------------------------------------------
-Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
-
-.. ocv:function:: bool gpu::StereoBM_GPU::checkIfGpuCallReasonable()
+    :param blockSize: the linear size of the blocks compared by the algorithm. The size should be odd (as the block is centered at the current pixel). A larger block size implies a smoother, though less accurate, disparity map. A smaller block size gives a more detailed disparity map, but there is a higher chance for the algorithm to find a wrong correspondence.
 
 
 
 gpu::StereoBeliefPropagation
 ----------------------------
-.. ocv:class:: gpu::StereoBeliefPropagation
+.. ocv:class:: gpu::StereoBeliefPropagation : public cv::StereoMatcher
 
 Class computing stereo correspondence using the belief propagation algorithm. ::
 
-    class StereoBeliefPropagation
+    class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
     {
     public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_ITERS  = 5  };
-        enum { DEFAULT_LEVELS = 5  };
+        using cv::StereoMatcher::compute;
 
-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels);
+        virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 
-        explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP,
-            int iters  = DEFAULT_ITERS,
-            int levels = DEFAULT_LEVELS,
-            int msg_type = CV_32F);
-        StereoBeliefPropagation(int ndisp, int iters, int levels,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int msg_type = CV_32F);
+        //! version for user specified data term
+        virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;
 
-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-        void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
+        //! number of BP iterations on each level
+        virtual int getNumIters() const = 0;
+        virtual void setNumIters(int iters) = 0;
 
-        int ndisp;
+        //! number of levels
+        virtual int getNumLevels() const = 0;
+        virtual void setNumLevels(int levels) = 0;
 
-        int iters;
-        int levels;
+        //! truncation of data cost
+        virtual double getMaxDataTerm() const = 0;
+        virtual void setMaxDataTerm(double max_data_term) = 0;
 
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
+        //! data weight
+        virtual double getDataWeight() const = 0;
+        virtual void setDataWeight(double data_weight) = 0;
 
-        int msg_type;
+        //! truncation of discontinuity cost
+        virtual double getMaxDiscTerm() const = 0;
+        virtual void setMaxDiscTerm(double max_disc_term) = 0;
 
-        ...
+        //! discontinuity single jump
+        virtual double getDiscSingleJump() const = 0;
+        virtual void setDiscSingleJump(double disc_single_jump) = 0;
+
+        virtual int getMsgType() const = 0;
+        virtual void setMsgType(int msg_type) = 0;
+
+        static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
     };
 
+
 The class implements algorithm described in [Felzenszwalb2006]_ . It can compute own data cost (using a truncated linear model) or use a user-provided data cost.
 
 .. note::
@@ -152,32 +92,6 @@ The class implements algorithm described in [Felzenszwalb2006]_ . It can compute
 
     ``width_step`` is the number of bytes in a line including padding.
 
-
-
-gpu::StereoBeliefPropagation::StereoBeliefPropagation
----------------------------------------------------------
-Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param max_data_term: Threshold for data cost truncation.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Threshold for discontinuity truncation.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
 ``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
 
 .. math::
@@ -190,33 +104,45 @@ Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.
 
 For more details, see [Felzenszwalb2006]_.
 
-By default, :ocv:class:`gpu::StereoBeliefPropagation` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+By default, ``StereoBeliefPropagation`` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
 
 .. math::
 
     10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
 
+.. seealso:: :ocv:class:`StereoMatcher`
+
+
+
+gpu::createStereoBeliefPropagation
+----------------------------------
+Creates a StereoBeliefPropagation object.
+
+.. ocv:function:: Ptr<gpu::StereoBeliefPropagation> gpu::createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
 
 
 gpu::StereoBeliefPropagation::estimateRecommendedParams
------------------------------------------------------------
+-------------------------------------------------------
 Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
 
 .. ocv:function:: void gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
 
 
 
-gpu::StereoBeliefPropagation::operator ()
----------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
+gpu::StereoBeliefPropagation::compute
+-------------------------------------
+Runs the stereo correspondence algorithm that finds the disparity for the specified data cost.
 
-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
+.. ocv:function:: void gpu::StereoBeliefPropagation::compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null())
 
     :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
 
@@ -228,89 +154,26 @@ Enables the stereo correspondence operator that finds the disparity for the spec
 
 gpu::StereoConstantSpaceBP
 --------------------------
-.. ocv:class:: gpu::StereoConstantSpaceBP
+.. ocv:class:: gpu::StereoConstantSpaceBP : public gpu::StereoBeliefPropagation
 
 Class computing stereo correspondence using the constant space belief propagation algorithm. ::
 
-    class StereoConstantSpaceBP
+    class CV_EXPORTS StereoConstantSpaceBP : public gpu::StereoBeliefPropagation
     {
     public:
-        enum { DEFAULT_NDISP    = 128 };
-        enum { DEFAULT_ITERS    = 8   };
-        enum { DEFAULT_LEVELS   = 4   };
-        enum { DEFAULT_NR_PLANE = 4   };
+        //! number of active disparity on the first level
+        virtual int getNrPlane() const = 0;
+        virtual void setNrPlane(int nr_plane) = 0;
 
-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels, int& nr_plane);
+        virtual bool getUseLocalInitDataCost() const = 0;
+        virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;
 
-        explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP,
-            int iters    = DEFAULT_ITERS,
-            int levels   = DEFAULT_LEVELS,
-            int nr_plane = DEFAULT_NR_PLANE,
-            int msg_type = CV_32F);
-        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int min_disp_th = 0,
-            int msg_type = CV_32F);
-
-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-
-        int ndisp;
-
-        int iters;
-        int levels;
-
-        int nr_plane;
-
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-
-        int min_disp_th;
-
-        int msg_type;
-
-        bool use_local_init_data_cost;
-
-        ...
+        static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
     };
 
 
 The class implements algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
 
-
-
-gpu::StereoConstantSpaceBP::StereoConstantSpaceBP
------------------------------------------------------
-Enables the :ocv:class:`gpu::StereoConstantSpaceBP` constructors.
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param nr_plane: Number of disparity levels on the first level.
-
-    :param max_data_term: Truncation of data cost.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Truncation of discontinuity.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param min_disp_th: Minimal disparity threshold.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
 ``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
 
 .. math::
@@ -331,54 +194,65 @@ By default, ``StereoConstantSpaceBP`` uses floating-point arithmetics and the ``
 
 
 
+gpu::createStereoConstantSpaceBP
+--------------------------------
+Creates StereoConstantSpaceBP object.
+
+.. ocv:function:: Ptr<gpu::StereoConstantSpaceBP> gpu::createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param nr_plane: Number of disparity levels on the first level.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+
+
 gpu::StereoConstantSpaceBP::estimateRecommendedParams
----------------------------------------------------------
+-----------------------------------------------------
 Uses a heuristic method to compute parameters (ndisp, iters, levelsand nrplane) for the specified image size (widthand height).
 
 .. ocv:function:: void gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
 
 
 
-gpu::StereoConstantSpaceBP::operator ()
--------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoConstantSpaceBP::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
 gpu::DisparityBilateralFilter
 -----------------------------
-.. ocv:class:: gpu::DisparityBilateralFilter
+.. ocv:class:: gpu::DisparityBilateralFilter : public cv::Algorithm
 
 Class refining a disparity map using joint bilateral filtering. ::
 
-    class CV_EXPORTS DisparityBilateralFilter
+    class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
     {
     public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_RADIUS = 3 };
-        enum { DEFAULT_ITERS  = 1 };
+        //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
+        //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
+        virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 
-        explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP,
-            int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
+        virtual int getNumDisparities() const = 0;
+        virtual void setNumDisparities(int numDisparities) = 0;
 
-        DisparityBilateralFilter(int ndisp, int radius, int iters,
-            float edge_threshold, float max_disc_threshold,
-            float sigma_range);
+        virtual int getRadius() const = 0;
+        virtual void setRadius(int radius) = 0;
 
-        void operator()(const GpuMat& disparity, const GpuMat& image,
-                        GpuMat& dst, Stream& stream = Stream::Null());
+        virtual int getNumIters() const = 0;
+        virtual void setNumIters(int iters) = 0;
 
-        ...
+        //! truncation of data continuity
+        virtual double getEdgeThreshold() const = 0;
+        virtual void setEdgeThreshold(double edge_threshold) = 0;
+
+        //! truncation of disparity continuity
+        virtual double getMaxDiscThreshold() const = 0;
+        virtual void setMaxDiscThreshold(double max_disc_threshold) = 0;
+
+        //! filter range sigma
+        virtual double getSigmaRange() const = 0;
+        virtual void setSigmaRange(double sigma_range) = 0;
     };
 
 
@@ -386,13 +260,11 @@ The class implements [Yang2010]_ algorithm.
 
 
 
-gpu::DisparityBilateralFilter::DisparityBilateralFilter
------------------------------------------------------------
-Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.
+gpu::createDisparityBilateralFilter
+-----------------------------------
+Creates DisparityBilateralFilter object.
 
-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS)
-
-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range)
+.. ocv:function:: Ptr<gpu::DisparityBilateralFilter> gpu::createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1)
 
     :param ndisp: Number of disparities.
 
@@ -400,19 +272,13 @@ Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.
 
     :param iters: Number of iterations.
 
-    :param edge_threshold: Threshold for edges.
-
-    :param max_disc_threshold: Constant to reject outliers.
-
-    :param sigma_range: Filter range.
 
 
-
-gpu::DisparityBilateralFilter::operator ()
-----------------------------------------------
+gpu::DisparityBilateralFilter::apply
+------------------------------------
 Refines a disparity map using joint bilateral filtering.
 
-.. ocv:function:: void gpu::DisparityBilateralFilter::operator ()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::DisparityBilateralFilter::apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null())
 
     :param disparity: Input disparity map.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
 
@@ -424,29 +290,11 @@ Refines a disparity map using joint bilateral filtering.
 
 
 
-gpu::drawColorDisp
-----------------------
-Colors a disparity image.
-
-.. ocv:function:: void gpu::drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null())
-
-    :param src_disp: Source disparity image.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
-
-    :param dst_disp: Output disparity image. It has the same size as  ``src_disp`` . The  type is ``CV_8UC4``  in  ``BGRA``  format (alpha = 255).
-
-    :param ndisp: Number of disparities.
-
-    :param stream: Stream for the asynchronous version.
-
-This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
-
-
-
 gpu::reprojectImageTo3D
----------------------------
+-----------------------
 Reprojects a disparity image to 3D space.
 
-.. ocv:function:: void gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null())
 
     :param disp: Input disparity image.  ``CV_8U``  and  ``CV_16S``  types are supported.
 
@@ -462,6 +310,23 @@ Reprojects a disparity image to 3D space.
 
 
 
-.. [Felzenszwalb2006] Pedro F. Felzenszwalb algorithm [Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006
+gpu::drawColorDisp
+------------------
+Colors a disparity image.
 
+.. ocv:function:: void gpu::drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null())
+
+    :param src_disp: Source disparity image.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
+
+    :param dst_disp: Output disparity image. It has the same size as  ``src_disp`` . The  type is ``CV_8UC4``  in  ``BGRA``  format (alpha = 255).
+
+    :param ndisp: Number of disparities.
+
+    :param stream: Stream for the asynchronous version.
+
+This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
+
+
+
+.. [Felzenszwalb2006] Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006.
 .. [Yang2010] Q. Yang, L. Wang, and N. Ahuja. *A constant-space belief propagation algorithm for stereo matching*. In CVPR, 2010.
diff --git a/modules/gpustereo/include/opencv2/gpustereo.hpp b/modules/gpustereo/include/opencv2/gpustereo.hpp
index ecda512068..250e89b85c 100644
--- a/modules/gpustereo/include/opencv2/gpustereo.hpp
+++ b/modules/gpustereo/include/opencv2/gpustereo.hpp
@@ -48,199 +48,145 @@
 #endif
 
 #include "opencv2/core/gpu.hpp"
+#include "opencv2/calib3d.hpp"
 
 namespace cv { namespace gpu {
 
-class CV_EXPORTS StereoBM_GPU
+/////////////////////////////////////////
+// StereoBM
+
+class CV_EXPORTS StereoBM : public cv::StereoBM
 {
 public:
-    enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
+    using cv::StereoBM::compute;
 
-    enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-    //! the default constructor
-    StereoBM_GPU();
-    //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
-    StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
-    //! Output disparity has CV_8U type.
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    //! Some heuristics that tries to estmate
-    // if current GPU will be faster than CPU in this algorithm.
-    // It queries current active device.
-    static bool checkIfGpuCallReasonable();
-
-    int preset;
-    int ndisp;
-    int winSize;
-
-    // If avergeTexThreshold  == 0 => post procesing is disabled
-    // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
-    // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-    // i.e. input left image is low textured.
-    float avergeTexThreshold;
-
-private:
-    GpuMat minSSD, leBuf, riBuf;
+    virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 };
 
-// "Efficient Belief Propagation for Early Vision"
-// P.Felzenszwalb
-class CV_EXPORTS StereoBeliefPropagation
+CV_EXPORTS Ptr<gpu::StereoBM> createStereoBM(int numDisparities = 64, int blockSize = 19);
+
+/////////////////////////////////////////
+// StereoBeliefPropagation
+
+//! "Efficient Belief Propagation for Early Vision" P.Felzenszwalb
+class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
 {
 public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_ITERS  = 5  };
-    enum { DEFAULT_LEVELS = 5  };
-
-    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
-
-    //! the default constructor
-    explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
-                                     int iters  = DEFAULT_ITERS,
-                                     int levels = DEFAULT_LEVELS,
-                                     int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, truncation of data cost, data weight,
-    //! truncation of discontinuity cost and discontinuity single jump
-    //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)
-    //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)
-    //! please see paper for more details
-    StereoBeliefPropagation(int ndisp, int iters, int levels,
-        float max_data_term, float data_weight,
-        float max_disc_term, float disc_single_jump,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
+    using cv::StereoMatcher::compute;
 
+    virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 
     //! version for user specified data term
-    void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
+    virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;
 
-    int ndisp;
+    //! number of BP iterations on each level
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
 
-    int iters;
-    int levels;
+    //! number of levels
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int levels) = 0;
 
-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
+    //! truncation of data cost
+    virtual double getMaxDataTerm() const = 0;
+    virtual void setMaxDataTerm(double max_data_term) = 0;
 
-    int msg_type;
-private:
-    GpuMat u, d, l, r, u2, d2, l2, r2;
-    std::vector<GpuMat> datas;
-    GpuMat out;
+    //! data weight
+    virtual double getDataWeight() const = 0;
+    virtual void setDataWeight(double data_weight) = 0;
+
+    //! truncation of discontinuity cost
+    virtual double getMaxDiscTerm() const = 0;
+    virtual void setMaxDiscTerm(double max_disc_term) = 0;
+
+    //! discontinuity single jump
+    virtual double getDiscSingleJump() const = 0;
+    virtual void setDiscSingleJump(double disc_single_jump) = 0;
+
+    //! type for messages (CV_16SC1 or CV_32FC1)
+    virtual int getMsgType() const = 0;
+    virtual void setMsgType(int msg_type) = 0;
+
+    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
 };
 
-// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/
-class CV_EXPORTS StereoConstantSpaceBP
+CV_EXPORTS Ptr<gpu::StereoBeliefPropagation>
+    createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F);
+
+/////////////////////////////////////////
+// StereoConstantSpaceBP
+
+//! "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
+//! Qingxiong Yang, Liang Wang, Narendra Ahuja
+//! http://vision.ai.uiuc.edu/~qyang6/
+class CV_EXPORTS StereoConstantSpaceBP : public gpu::StereoBeliefPropagation
 {
 public:
-    enum { DEFAULT_NDISP    = 128 };
-    enum { DEFAULT_ITERS    = 8   };
-    enum { DEFAULT_LEVELS   = 4   };
-    enum { DEFAULT_NR_PLANE = 4   };
+    //! number of active disparities on the first level
+    virtual int getNrPlane() const = 0;
+    virtual void setNrPlane(int nr_plane) = 0;
+
+    virtual bool getUseLocalInitDataCost() const = 0;
+    virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;
 
     static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
-
-    //! the default constructor
-    explicit StereoConstantSpaceBP(int ndisp    = DEFAULT_NDISP,
-                                   int iters    = DEFAULT_ITERS,
-                                   int levels   = DEFAULT_LEVELS,
-                                   int nr_plane = DEFAULT_NR_PLANE,
-                                   int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, number of active disparity on the first level, truncation of data cost, data weight,
-    //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold
-    StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-        float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
-        int min_disp_th = 0,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    int ndisp;
-
-    int iters;
-    int levels;
-
-    int nr_plane;
-
-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
-
-    int min_disp_th;
-
-    int msg_type;
-
-    bool use_local_init_data_cost;
-private:
-    GpuMat messages_buffers;
-
-    GpuMat temp;
-    GpuMat out;
 };
 
-// Disparity map refinement using joint bilateral filtering given a single color image.
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/
-class CV_EXPORTS DisparityBilateralFilter
+CV_EXPORTS Ptr<gpu::StereoConstantSpaceBP>
+    createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F);
+
+/////////////////////////////////////////
+// DisparityBilateralFilter
+
+//! Disparity map refinement using joint bilateral filtering given a single color image.
+//! Qingxiong Yang, Liang Wang, Narendra Ahuja
+//! http://vision.ai.uiuc.edu/~qyang6/
+class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
 {
 public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_RADIUS = 3 };
-    enum { DEFAULT_ITERS  = 1 };
-
-    //! the default constructor
-    explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
-
-    //! the full constructor taking the number of disparities, filter radius,
-    //! number of iterations, truncation of data continuity, truncation of disparity continuity
-    //! and filter range sigma
-    DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range);
-
     //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
     //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
-    void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());
+    virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 
-private:
-    int ndisp;
-    int radius;
-    int iters;
+    virtual int getNumDisparities() const = 0;
+    virtual void setNumDisparities(int numDisparities) = 0;
 
-    float edge_threshold;
-    float max_disc_threshold;
-    float sigma_range;
+    virtual int getRadius() const = 0;
+    virtual void setRadius(int radius) = 0;
 
-    GpuMat table_color;
-    GpuMat table_space;
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
+
+    //! truncation of data continuity
+    virtual double getEdgeThreshold() const = 0;
+    virtual void setEdgeThreshold(double edge_threshold) = 0;
+
+    //! truncation of disparity continuity
+    virtual double getMaxDiscThreshold() const = 0;
+    virtual void setMaxDiscThreshold(double max_disc_threshold) = 0;
+
+    //! filter range sigma
+    virtual double getSigmaRange() const = 0;
+    virtual void setSigmaRange(double sigma_range) = 0;
 };
 
+CV_EXPORTS Ptr<gpu::DisparityBilateralFilter>
+    createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1);
+
+/////////////////////////////////////////
+// Utility
+
 //! Reprojects disparity image to 3D space.
 //! Supports CV_8U and CV_16S types of input disparity.
 //! The output is a 3- or 4-channel floating-point matrix.
 //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
 //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
-CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null());
+CV_EXPORTS void reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null());
 
 //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
 //! Supported types of input disparity: CV_8U, CV_16S.
 //! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
-CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());
+CV_EXPORTS void drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null());
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpustereo/perf/perf_stereo.cpp b/modules/gpustereo/perf/perf_stereo.cpp
index e0438c0ae6..476a591a1c 100644
--- a/modules/gpustereo/perf/perf_stereo.cpp
+++ b/modules/gpustereo/perf/perf_stereo.cpp
@@ -63,18 +63,17 @@ PERF_TEST_P(ImagePair, StereoBM,
     const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(imgRight.empty());
 
-    const int preset = 0;
     const int ndisp = 256;
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
+        cv::Ptr<cv::StereoBM> d_bm = cv::gpu::createStereoBM(ndisp);
 
         const cv::gpu::GpuMat d_imgLeft(imgLeft);
         const cv::gpu::GpuMat d_imgRight(imgRight);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -108,13 +107,13 @@ PERF_TEST_P(ImagePair, StereoBeliefPropagation,
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::StereoBeliefPropagation d_bp(ndisp);
+        cv::Ptr<cv::gpu::StereoBeliefPropagation> d_bp = cv::gpu::createStereoBeliefPropagation(ndisp);
 
         const cv::gpu::GpuMat d_imgLeft(imgLeft);
         const cv::gpu::GpuMat d_imgRight(imgRight);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_bp->compute(d_imgLeft, d_imgRight, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -142,13 +141,13 @@ PERF_TEST_P(ImagePair, StereoConstantSpaceBP,
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
+        cv::Ptr<cv::gpu::StereoConstantSpaceBP> d_csbp = cv::gpu::createStereoConstantSpaceBP(ndisp);
 
         const cv::gpu::GpuMat d_imgLeft(imgLeft);
         const cv::gpu::GpuMat d_imgRight(imgRight);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_csbp->compute(d_imgLeft, d_imgRight, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -174,13 +173,13 @@ PERF_TEST_P(ImagePair, DisparityBilateralFilter,
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::DisparityBilateralFilter d_filter(ndisp);
+        cv::Ptr<cv::gpu::DisparityBilateralFilter> d_filter = cv::gpu::createDisparityBilateralFilter(ndisp);
 
         const cv::gpu::GpuMat d_img(img);
         const cv::gpu::GpuMat d_disp(disp);
         cv::gpu::GpuMat dst;
 
-        TEST_CYCLE() d_filter(d_disp, d_img, dst);
+        TEST_CYCLE() d_filter->apply(d_disp, d_img, dst);
 
         GPU_SANITY_CHECK(dst);
     }
diff --git a/modules/gpustereo/src/disparity_bilateral_filter.cpp b/modules/gpustereo/src/disparity_bilateral_filter.cpp
index d13fcc004f..689a9e76e3 100644
--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
@@ -47,10 +47,7 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::DisparityBilateralFilter> cv::gpu::createDisparityBilateralFilter(int, int, int) { throw_no_cuda(); return Ptr<gpu::DisparityBilateralFilter>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -65,15 +62,46 @@ namespace cv { namespace gpu { namespace cudev
     }
 }}}
 
-using namespace ::cv::gpu::cudev::disp_bilateral_filter;
-
 namespace
 {
-    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
-    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
-    const float DEFAULT_SIGMA_RANGE = 10.0f;
+    class DispBilateralFilterImpl : public gpu::DisparityBilateralFilter
+    {
+    public:
+        DispBilateralFilterImpl(int ndisp, int radius, int iters);
 
-    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
+        void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream);
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
+
+        int getRadius() const { return radius_; }
+        void setRadius(int radius);
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        double getEdgeThreshold() const { return edge_threshold_; }
+        void setEdgeThreshold(double edge_threshold) { edge_threshold_ = (float) edge_threshold; }
+
+        double getMaxDiscThreshold() const { return max_disc_threshold_; }
+        void setMaxDiscThreshold(double max_disc_threshold) { max_disc_threshold_ = (float) max_disc_threshold; }
+
+        double getSigmaRange() const { return sigma_range_; }
+        void setSigmaRange(double sigma_range);
+
+    private:
+        int ndisp_;
+        int radius_;
+        int iters_;
+        float edge_threshold_;
+        float max_disc_threshold_;
+        float sigma_range_;
+
+        GpuMat table_color_;
+        GpuMat table_space_;
+    };
+
+    void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
     {
         Mat cpu_table_color(1, len, CV_32F);
 
@@ -85,7 +113,7 @@ namespace
         table_color.upload(cpu_table_color);
     }
 
-    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
+    void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
     {
         int half = (win_size >> 1);
 
@@ -101,54 +129,78 @@ namespace
         table_space.upload(cpu_table_space);
     }
 
-    template <typename T>
-    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
-                                   GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
+    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
+    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
+    const float DEFAULT_SIGMA_RANGE = 10.0f;
+
+    DispBilateralFilterImpl::DispBilateralFilterImpl(int ndisp, int radius, int iters) :
+        ndisp_(ndisp), radius_(radius), iters_(iters),
+        edge_threshold_(DEFAULT_EDGE_THRESHOLD), max_disc_threshold_(DEFAULT_MAX_DISC_THRESHOLD),
+        sigma_range_(DEFAULT_SIGMA_RANGE)
     {
-        short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
-        short max_disc = short(ndisp * max_disc_threshold + 0.5);
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+        calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
+    }
+
+    void DispBilateralFilterImpl::setRadius(int radius)
+    {
+        radius_ = radius;
+        calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
+    }
+
+    void DispBilateralFilterImpl::setSigmaRange(double sigma_range)
+    {
+        sigma_range_ = (float) sigma_range;
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+    }
+
+    template <typename T>
+    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                        GpuMat& table_color, GpuMat& table_space,
+                                        const GpuMat& disp, const GpuMat& img,
+                                        OutputArray _dst, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::disp_bilateral_filter;
+
+        const short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
+        const short max_disc = short(ndisp * max_disc_threshold + 0.5);
 
         disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
 
-        if (&dst != &disp)
-        {
+        _dst.create(disp.size(), disp.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        if (dst.data != disp.data)
             disp.copyTo(dst, stream);
-        }
 
         disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
     }
 
-    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
-                                                GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
+    void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
+    {
+        typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                                    GpuMat& table_color, GpuMat& table_space,
+                                                    const GpuMat& disp, const GpuMat& img, OutputArray dst, Stream& stream);
+        const bilateral_filter_operator_t operators[] =
+            {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
 
-    const bilateral_filter_operator_t operators[] =
-        {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
+        CV_Assert( 0 < ndisp_ && 0 < radius_ && 0 < iters_ );
+
+        GpuMat disp = _disp.getGpuMat();
+        GpuMat img = _image.getGpuMat();
+
+        CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 );
+        CV_Assert( disp.size() == img.size() );
+
+        operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
+                               table_color_, table_space_, disp, img, dst, stream);
+    }
 }
 
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
-      sigma_range(DEFAULT_SIGMA_RANGE)
+Ptr<gpu::DisparityBilateralFilter> cv::gpu::createDisparityBilateralFilter(int ndisp, int radius, int iters)
 {
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
-                                                     float max_disc_threshold_, float sigma_range_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
-      sigma_range(sigma_range_)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
+    return new DispBilateralFilterImpl(ndisp, radius, iters);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpustereo/src/precomp.hpp b/modules/gpustereo/src/precomp.hpp
index 963cb4d07a..934da9fd23 100644
--- a/modules/gpustereo/src/precomp.hpp
+++ b/modules/gpustereo/src/precomp.hpp
@@ -48,5 +48,6 @@
 #include "opencv2/gpustereo.hpp"
 
 #include "opencv2/core/private.gpu.hpp"
+#include "opencv2/core/utility.hpp"
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/gpustereo/src/stereobm.cpp b/modules/gpustereo/src/stereobm.cpp
index f8e6c20fb5..9b32cf7e92 100644
--- a/modules/gpustereo/src/stereobm.cpp
+++ b/modules/gpustereo/src/stereobm.cpp
@@ -47,11 +47,7 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
 
-cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_no_cuda(); }
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_no_cuda(); }
-
-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_no_cuda(); return false; }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoBM> cv::gpu::createStereoBM(int, int) { throw_no_cuda(); return Ptr<gpu::StereoBM>(); } // stub for builds without CUDA (or with CUDA_DISABLER); the empty Ptr only satisfies the return type
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -67,74 +63,123 @@ namespace cv { namespace gpu { namespace cudev
 
 namespace
 {
-    const float defaultAvgTexThreshold = 3;
-}
+    class StereoBMImpl : public gpu::StereoBM // implementation object handed out by createStereoBM()
+    {
+    public:
+        StereoBMImpl(int numDisparities, int blockSize);
 
-cv::gpu::StereoBM_GPU::StereoBM_GPU()
-    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)
-{
-}
+        void compute(InputArray left, InputArray right, OutputArray disparity); // forwards to the overload below with Stream::Null()
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);
 
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
-    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)
-{
-    const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
-    CV_Assert(0 < ndisp && ndisp <= max_supported_ndisp);
-    CV_Assert(ndisp % 8 == 0);
-    CV_Assert(winSize % 2 == 1);
-}
+        int getMinDisparity() const { return 0; } // this and the other constant getters below pair with setters that discard their argument
+        void setMinDisparity(int /*minDisparity*/) {}
 
-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
-{
-    if (0 == getCudaEnabledDeviceCount())
-        return false;
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; } // stored unchecked; compute() asserts the valid range
 
-    DeviceInfo device_info;
+        int getBlockSize() const { return winSize_; }
+        void setBlockSize(int blockSize) { winSize_ = blockSize; } // stored unchecked; compute() asserts that it is odd
 
-    if (device_info.major() > 1 || device_info.multiProcessorCount() > 16)
-        return true;
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
 
-    return false;
-}
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
 
-namespace
-{
-    void stereo_bm_gpu_operator( GpuMat& minSSD,  GpuMat& leBuf, GpuMat&  riBuf,  int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getPreFilterType() const { return preset_; } // compute() prefilters only when this equals cv::StereoBM::PREFILTER_XSOBEL
+        void setPreFilterType(int preFilterType) { preset_ = preFilterType; }
+
+        int getPreFilterSize() const { return 0; }
+        void setPreFilterSize(int /*preFilterSize*/) {}
+
+        int getPreFilterCap() const { return preFilterCap_; } // passed to prefilter_xsobel() in compute()
+        void setPreFilterCap(int preFilterCap) { preFilterCap_ = preFilterCap; }
+
+        int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); } // member is float; explicit cast avoids a narrowing warning
+        void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); } // values <= 0 disable the texture post-filter in compute()
+
+        int getUniquenessRatio() const { return 0; }
+        void setUniquenessRatio(int /*uniquenessRatio*/) {}
+
+        int getSmallerBlockSize() const { return 0; }
+        void setSmallerBlockSize(int /*blockSize*/){}
+
+        Rect getROI1() const { return Rect(); }
+        void setROI1(Rect /*roi1*/) {}
+
+        Rect getROI2() const { return Rect(); }
+        void setROI2(Rect /*roi2*/) {}
+
+    private:
+        int preset_; // prefilter type, see getPreFilterType()
+        int ndisp_; // number of disparities
+        int winSize_; // block size
+        int preFilterCap_;
+        float avergeTexThreshold_; // texture threshold, exposed as int through the getter/setter
+
+        GpuMat minSSD_, leBuf_, riBuf_; // work buffers owned by the object and sized inside compute()
+    };
+
+    StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize) // stores the arguments as given; the range checks live in compute()
+        : preset_(0), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3) // defaults: prefilter type 0, cap 31, texture threshold 3
+    {
+    }
+
+    void StereoBMImpl::compute(InputArray left, InputArray right, OutputArray disparity) // convenience overload without an explicit stream
+    {
+        compute(left, right, disparity, Stream::Null()); // NOTE(review): Stream::Null() is presumably the default stream - confirm
+    }
+
+    void StereoBMImpl::compute(InputArray _left, InputArray _right, OutputArray _disparity, Stream& _stream) // checks parameters and inputs, then launches the matcher on _stream
     {
         using namespace ::cv::gpu::cudev::stereobm;
 
-        CV_Assert(left.rows == right.rows && left.cols == right.cols);
-        CV_Assert(left.type() == CV_8UC1);
-        CV_Assert(right.type() == CV_8UC1);
+        const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8); // evaluates to 256
+        CV_Assert( 0 < ndisp_ && ndisp_ <= max_supported_ndisp );
+        CV_Assert( ndisp_ % 8 == 0 ); // disparity count must be a multiple of 8
+        CV_Assert( winSize_ % 2 == 1 ); // block size must be odd
 
-        disparity.create(left.size(), CV_8U);
-        minSSD.create(left.size(), CV_32S);
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
 
-        GpuMat le_for_bm =  left;
-        GpuMat ri_for_bm = right;
+        CV_Assert( left.type() == CV_8UC1 ); // single-channel 8-bit input only
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
 
-        if (preset == StereoBM_GPU::PREFILTER_XSOBEL)
+        _disparity.create(left.size(), CV_8UC1); // output is always 8-bit, same size as the input
+        GpuMat disparity = _disparity.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        gpu::ensureSizeIsEnough(left.size(), CV_32SC1, minSSD_);
+
+        PtrStepSzb le_for_bm =  left; // by default the matcher reads the unfiltered inputs
+        PtrStepSzb ri_for_bm = right;
+
+        if (preset_ == cv::StereoBM::PREFILTER_XSOBEL) // optional prefilter: results go to leBuf_/riBuf_, which then replace the inputs
         {
-            leBuf.create( left.size(),  left.type());
-            riBuf.create(right.size(), right.type());
+            gpu::ensureSizeIsEnough(left.size(), left.type(), leBuf_);
+            gpu::ensureSizeIsEnough(right.size(), right.type(), riBuf_);
 
-            prefilter_xsobel( left, leBuf, 31, stream);
-            prefilter_xsobel(right, riBuf, 31, stream);
+            prefilter_xsobel( left, leBuf_, preFilterCap_, stream);
+            prefilter_xsobel(right, riBuf_, preFilterCap_, stream);
 
-            le_for_bm = leBuf;
-            ri_for_bm = riBuf;
+            le_for_bm = leBuf_;
+            ri_for_bm = riBuf_;
         }
 
-        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);
+        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);
 
-        if (avergeTexThreshold)
-            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
+        if (avergeTexThreshold_ > 0) // the texture post-filter runs only for a positive threshold
+            postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
     }
 }
 
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
+Ptr<gpu::StereoBM> cv::gpu::createStereoBM(int numDisparities, int blockSize)
 {
-    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
+    return new StereoBMImpl(numDisparities, blockSize); // factory: arguments are not validated until compute() is called
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpustereo/src/stereobp.cpp b/modules/gpustereo/src/stereobp.cpp
index 5ce56c1d70..ac3bcfe339 100644
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
@@ -49,12 +49,7 @@ using namespace cv::gpu;
 
 void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, int&, int&) { throw_no_cuda(); }
 
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoBeliefPropagation> cv::gpu::createStereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); return Ptr<gpu::StereoBeliefPropagation>(); } // stub for builds without CUDA; the empty Ptr only satisfies the return type
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -78,14 +73,295 @@ namespace cv { namespace gpu { namespace cudev
     }
 }}}
 
-using namespace ::cv::gpu::cudev::stereobp;
-
 namespace
 {
+    class StereoBPImpl : public gpu::StereoBeliefPropagation // implementation object handed out by createStereoBeliefPropagation()
+    {
+    public:
+        StereoBPImpl(int ndisp, int iters, int levels, int msg_type);
+
+        void compute(InputArray left, InputArray right, OutputArray disparity); // forwards to the next overload with Stream::Null()
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream); // builds the data term from an image pair
+        void compute(InputArray data, OutputArray disparity, Stream& stream); // starts from a caller-supplied data term
+
+        int getMinDisparity() const { return 0; } // this and the other 0-returning getters below pair with setters that discard their argument
+        void setMinDisparity(int /*minDisparity*/) {}
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; } // stored unchecked; compute() asserts it is positive
+
+        int getBlockSize() const { return 0; }
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        int getNumLevels() const { return levels_; }
+        void setNumLevels(int levels) { levels_ = levels; }
+
+        double getMaxDataTerm() const { return max_data_term_; } // the four double-typed parameters are stored as float
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; } // stored unchecked; compute() asserts CV_32F or CV_16S
+
+    private:
+        void init(Stream& stream); // allocates the message buffers and calls load_constants(); run by both stream-taking compute() overloads
+        void calcBP(OutputArray disp, Stream& stream); // processes the levels from smallest to full size and writes the disparity
+
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+
+        float scale_; // 1 for CV_32F messages, otherwise 10; assigned at the top of compute()
+        int rows_, cols_; // image size of the current compute() call
+        std::vector<int> cols_all_, rows_all_; // per-level sizes; index 0 holds rows_/cols_
+        GpuMat u_, d_, l_, r_, u2_, d2_, l2_, r2_; // two sets of four message buffers; init() creates the *2_ set at half size
+        std::vector<GpuMat> datas_; // one data buffer per level
+        GpuMat outBuf_; // CV_16SC1 scratch used by calcBP() when another output type is requested
+    };
+
     const float DEFAULT_MAX_DATA_TERM = 10.0f;
     const float DEFAULT_DATA_WEIGHT = 0.07f;
     const float DEFAULT_MAX_DISC_TERM = 1.7f;
     const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
+
+    StereoBPImpl::StereoBPImpl(int ndisp, int iters, int levels, int msg_type) : // arguments are stored unchecked; compute() asserts their ranges
+        ndisp_(ndisp), iters_(iters), levels_(levels),
+        max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
+        max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
+        msg_type_(msg_type) // scale_, rows_ and cols_ are left unset here and assigned by compute()
+    {
+    }
+
+    void StereoBPImpl::compute(InputArray left, InputArray right, OutputArray disparity) // convenience overload without an explicit stream
+    {
+        compute(left, right, disparity, Stream::Null()); // NOTE(review): Stream::Null() is presumably the default stream - confirm
+    }
+
+    void StereoBPImpl::compute(InputArray _left, InputArray _right, OutputArray disparity, Stream& stream) // image-pair entry point: data term from left/right, then calcBP()
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
+        static const comp_data_t comp_data_callers[2][5] = // indexed [msg_type_ == CV_32F][channel count]; null entries are channel counts with no kernel
+        {
+            {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
+            {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
+        };
+
+        scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f; // must be set before init(), which multiplies three constants by it
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() ); // CV_16S only: the product must stay below the short maximum
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 ); // exactly the non-null columns of comp_data_callers
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        rows_ = left.rows;
+        cols_ = left.cols;
+
+        const int divisor = (int) pow(2.f, levels_ - 1.0f); // 2^(levels_ - 1)
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        const int min_image_dim_size = 2;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size ); // both dimensions divided by 2^(levels_ - 1) must exceed 2
+
+        init(stream);
+
+        datas_[0].create(rows_ * ndisp_, cols_, msg_type_); // safe to index: init() resized datas_ to levels_ entries
+
+        comp_data_callers[msg_type_ == CV_32F][left.channels()](left, right, datas_[0], StreamAccessor::getStream(stream));
+
+        calcBP(disparity, stream);
+    }
+
+    void StereoBPImpl::compute(InputArray _data, OutputArray disparity, Stream& stream) // entry point for a caller-supplied data term
+    {
+        scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f; // must be set before init(), which multiplies three constants by it
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() ); // CV_16S only: the product must stay below the short maximum
+
+        GpuMat data = _data.getGpuMat();
+
+        CV_Assert( (data.type() == msg_type_) && (data.rows % ndisp_ == 0) ); // data must have the message type and a row count divisible by ndisp_
+
+        rows_ = data.rows / ndisp_; // image height is recovered from the stacked row count
+        cols_ = data.cols;
+
+        const int divisor = (int) pow(2.f, levels_ - 1.0f); // 2^(levels_ - 1)
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        const int min_image_dim_size = 2;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size ); // both dimensions divided by 2^(levels_ - 1) must exceed 2
+
+        init(stream);
+
+        data.copyTo(datas_[0], stream); // copies into the internal buffer; the removed operator() assigned datas[0] = data instead
+
+        calcBP(disparity, stream);
+    }
+
+    void StereoBPImpl::init(Stream& stream) // expects rows_, cols_ and scale_ to be set by the calling compute()
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        u_.create(rows_ * ndisp_, cols_, msg_type_);
+        d_.create(rows_ * ndisp_, cols_, msg_type_);
+        l_.create(rows_ * ndisp_, cols_, msg_type_);
+        r_.create(rows_ * ndisp_, cols_, msg_type_);
+
+        if (levels_ & 1) // odd level count: calcBP() starts on buffer set 0, so this set is the one zeroed
+        {
+            //can clear less area
+            u_.setTo(0, stream);
+            d_.setTo(0, stream);
+            l_.setTo(0, stream);
+            r_.setTo(0, stream);
+        }
+
+        if (levels_ > 1) // the second buffer set is only created when there is more than one level
+        {
+            int less_rows = (rows_ + 1) / 2; // half size, rounded up
+            int less_cols = (cols_ + 1) / 2;
+
+            u2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            d2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            l2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            r2_.create(less_rows * ndisp_, less_cols, msg_type_);
+
+            if ((levels_ & 1) == 0) // even level count: calcBP() starts on buffer set 1, so this set is zeroed instead
+            {
+                u2_.setTo(0, stream);
+                d2_.setTo(0, stream);
+                l2_.setTo(0, stream);
+                r2_.setTo(0, stream);
+            }
+        }
+
+        load_constants(ndisp_, max_data_term_, scale_ * data_weight_, scale_ * max_disc_term_, scale_ * disc_single_jump_); // max_data_term_ is passed unscaled, the other three are multiplied by scale_
+
+        datas_.resize(levels_);
+
+        cols_all_.resize(levels_);
+        rows_all_.resize(levels_);
+
+        cols_all_[0] = cols_; // level 0 is the full-resolution size; calcBP() fills in the remaining levels
+        rows_all_[0] = rows_;
+    }
+
+    void StereoBPImpl::calcBP(OutputArray disp, Stream& _stream) // expects init() to have run and datas_[0] to be filled
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        static const data_step_down_t data_step_down_callers[2] = // each table below holds {short, float} instantiations, selected by funcIdx
+        {
+            data_step_down_gpu<short>, data_step_down_gpu<float>
+        };
+
+        typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        static const level_up_messages_t level_up_messages_callers[2] =
+        {
+            level_up_messages_gpu<short>, level_up_messages_gpu<float>
+        };
+
+        typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        static const calc_all_iterations_t calc_all_iterations_callers[2] =
+        {
+            calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
+        };
+
+        typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+        static const output_t output_callers[2] =
+        {
+            output_gpu<short>, output_gpu<float>
+        };
+
+        const int funcIdx = msg_type_ == CV_32F; // 0 -> short kernels, 1 -> float kernels
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        for (int i = 1; i < levels_; ++i) // build the smaller data levels, each half the size (rounded up) of the previous one
+        {
+            cols_all_[i] = (cols_all_[i-1] + 1) / 2;
+            rows_all_[i] = (rows_all_[i-1] + 1) / 2;
+
+            datas_[i].create(rows_all_[i] * ndisp_, cols_all_[i], msg_type_);
+
+            data_step_down_callers[funcIdx](cols_all_[i], rows_all_[i], rows_all_[i-1], datas_[i-1], datas_[i], stream);
+        }
+
+        PtrStepSzb mus[] = {u_, u2_}; // index 0: full-size buffer set, index 1: half-size set
+        PtrStepSzb mds[] = {d_, d2_};
+        PtrStepSzb mrs[] = {r_, r2_};
+        PtrStepSzb mls[] = {l_, l2_};
+
+        int mem_idx = (levels_ & 1) ? 0 : 1; // starting set is chosen so that level 0 ends up on set 0
+
+        for (int i = levels_ - 1; i >= 0; --i) // from the smallest level down to full resolution
+        {
+            // for lower level we have already computed messages by setting to zero
+            if (i != levels_ - 1)
+                level_up_messages_callers[funcIdx](mem_idx, cols_all_[i], rows_all_[i], rows_all_[i+1], mus, mds, mls, mrs, stream);
+
+            calc_all_iterations_callers[funcIdx](cols_all_[i], rows_all_[i], iters_, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas_[i], stream);
+
+            mem_idx = (mem_idx + 1) & 1; // alternate between the two buffer sets
+        }
+
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1; // honour a fixed output type, otherwise produce CV_16SC1
+
+        disp.create(rows_, cols_, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        if (dtype != CV_16SC1) // the output kernel takes PtrStepSz<short>, so other types go through outBuf_
+        {
+            outBuf_.create(rows_, cols_, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        output_callers[funcIdx](u_, d_, l_, r_, datas_.front(), out, stream); // reads buffer set 0, which the loop above used for level 0
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream); // convert the short result into the caller's requested type
+    }
+}
+
+Ptr<gpu::StereoBeliefPropagation> cv::gpu::createStereoBeliefPropagation(int ndisp, int iters, int levels, int msg_type)
+{
+    return new StereoBPImpl(ndisp, iters, levels, msg_type); // factory: arguments are not validated until compute() is called
 }
 
 void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
@@ -101,240 +377,4 @@ void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int
     if (levels == 0) levels++;
 }
 
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-namespace
-{
-    class StereoBeliefPropagationImpl
-    {
-    public:
-        StereoBeliefPropagationImpl(StereoBeliefPropagation& rthis_,
-                                    GpuMat& u_, GpuMat& d_, GpuMat& l_, GpuMat& r_,
-                                    GpuMat& u2_, GpuMat& d2_, GpuMat& l2_, GpuMat& r2_,
-                                    std::vector<GpuMat>& datas_, GpuMat& out_)
-            : rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
-              zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
-        {
-            CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
-            CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
-            CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
-        }
-
-        void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-        {
-            typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-            static const comp_data_t comp_data_callers[2][5] =
-            {
-                {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
-                {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
-            };
-
-            CV_Assert(left.size() == right.size() && left.type() == right.type());
-            CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
-
-            rows = left.rows;
-            cols = left.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream));
-
-            calcBP(disp, stream);
-        }
-
-        void operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-        {
-            CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
-
-            rows = data.rows / rthis.ndisp;
-            cols = data.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0] = data;
-
-            calcBP(disp, stream);
-        }
-    private:
-        void init(Stream& stream)
-        {
-            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            if (rthis.levels & 1)
-            {
-                //can clear less area
-                u.setTo(zero, stream);
-                d.setTo(zero, stream);
-                l.setTo(zero, stream);
-                r.setTo(zero, stream);
-            }
-
-            if (rthis.levels > 1)
-            {
-                int less_rows = (rows + 1) / 2;
-                int less_cols = (cols + 1) / 2;
-
-                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-
-                if ((rthis.levels & 1) == 0)
-                {
-                    u2.setTo(zero, stream);
-                    d2.setTo(zero, stream);
-                    l2.setTo(zero, stream);
-                    r2.setTo(zero, stream);
-                }
-            }
-
-            load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
-
-            datas.resize(rthis.levels);
-
-            cols_all.resize(rthis.levels);
-            rows_all.resize(rthis.levels);
-
-            cols_all[0] = cols;
-            rows_all[0] = rows;
-        }
-
-        void calcBP(GpuMat& disp, Stream& stream)
-        {
-            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-            static const data_step_down_t data_step_down_callers[2] =
-            {
-                data_step_down_gpu<short>, data_step_down_gpu<float>
-            };
-
-            typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-            static const level_up_messages_t level_up_messages_callers[2] =
-            {
-                level_up_messages_gpu<short>, level_up_messages_gpu<float>
-            };
-
-            typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-            static const calc_all_iterations_t calc_all_iterations_callers[2] =
-            {
-                calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
-            };
-
-            typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-            static const output_t output_callers[2] =
-            {
-                output_gpu<short>, output_gpu<float>
-            };
-
-            const int funcIdx = rthis.msg_type == CV_32F;
-
-            cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-            for (int i = 1; i < rthis.levels; ++i)
-            {
-                cols_all[i] = (cols_all[i-1] + 1) / 2;
-                rows_all[i] = (rows_all[i-1] + 1) / 2;
-
-                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
-
-                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
-            }
-
-            PtrStepSzb mus[] = {u, u2};
-            PtrStepSzb mds[] = {d, d2};
-            PtrStepSzb mrs[] = {r, r2};
-            PtrStepSzb mls[] = {l, l2};
-
-            int mem_idx = (rthis.levels & 1) ? 0 : 1;
-
-            for (int i = rthis.levels - 1; i >= 0; --i)
-            {
-                // for lower level we have already computed messages by setting to zero
-                if (i != rthis.levels - 1)
-                    level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, cudaStream);
-
-                calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], cudaStream);
-
-                mem_idx = (mem_idx + 1) & 1;
-            }
-
-            if (disp.empty())
-                disp.create(rows, cols, CV_16S);
-
-            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-            out.setTo(zero, stream);
-
-            output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);
-
-            if (disp.type() != CV_16S)
-                out.convertTo(disp, disp.type(), stream);
-        }
-
-        StereoBeliefPropagation& rthis;
-
-        GpuMat& u;
-        GpuMat& d;
-        GpuMat& l;
-        GpuMat& r;
-
-        GpuMat& u2;
-        GpuMat& d2;
-        GpuMat& l2;
-        GpuMat& r2;
-
-        std::vector<GpuMat>& datas;
-        GpuMat& out;
-
-        const Scalar zero;
-        const float scale;
-
-        int rows, cols;
-
-        std::vector<int> cols_all, rows_all;
-    };
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp, stream);
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp, stream);
-}
-
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpustereo/src/stereocsbp.cpp b/modules/gpustereo/src/stereocsbp.cpp
index cedba1eeb9..9afd8d14e2 100644
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
@@ -49,13 +49,9 @@ using namespace cv::gpu;
 
 void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, int&, int&, int&) { throw_no_cuda(); }
 
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, float, float, int, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoConstantSpaceBP> cv::gpu::createStereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); return Ptr<gpu::StereoConstantSpaceBP>(); } // stub for builds without CUDA; the empty Ptr only satisfies the return type
 
 #else /* !defined (HAVE_CUDA) */
-#include "opencv2/core/utility.hpp"
 
 namespace cv { namespace gpu { namespace cudev
 {
@@ -89,14 +85,288 @@ namespace cv { namespace gpu { namespace cudev
     }
 }}}
 
-using namespace ::cv::gpu::cudev::stereocsbp;
-
 namespace
 {
+    class StereoCSBPImpl : public gpu::StereoConstantSpaceBP // holds the CSBP parameters and the GpuMat working buffers used by compute()
+    {
+    public:
+        StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type); // the cost terms start at the DEFAULT_* constants
+
+        void compute(InputArray left, InputArray right, OutputArray disparity); // forwards to the stream overload with Stream::Null()
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream); // left/right pair -> disparity
+        void compute(InputArray data, OutputArray disparity, Stream& stream); // always raises Error::StsNotImplemented
+
+        int getMinDisparity() const { return min_disp_th_; }
+        void setMinDisparity(int minDisparity) { min_disp_th_ = minDisparity; }
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
+        // The next four properties are not used by this class: the getters return 0 and the setters discard their argument.
+        int getBlockSize() const { return 0; }
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        int getNumLevels() const { return levels_; } // compute() may lower this value (it clamps levels_ in place)
+        void setNumLevels(int levels) { levels_ = levels; }
+        // The four cost terms below are exposed as double but stored as float; the setters narrow the value.
+        double getMaxDataTerm() const { return max_data_term_; }
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; }
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; } // not checked here; compute() asserts CV_32F or CV_16S
+
+        int getNrPlane() const { return nr_plane_; }
+        void setNrPlane(int nr_plane) { nr_plane_ = nr_plane; }
+
+        bool getUseLocalInitDataCost() const { return use_local_init_data_cost_; }
+        void setUseLocalInitDataCost(bool use_local_init_data_cost) { use_local_init_data_cost_ = use_local_init_data_cost; }
+
+    private:
+        int min_disp_th_;
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+        int nr_plane_;
+        bool use_local_init_data_cost_;
+        // Working buffers (re)created by compute().
+        GpuMat mbuf_;   // one allocation that compute() slices into data-cost, message and disparity-selection sub-buffers
+        GpuMat temp_;   // scratch buffer passed to load_constants()
+        GpuMat outBuf_; // CV_16SC1 staging output, used when the requested disparity type is not CV_16SC1
+    };
+
     const float DEFAULT_MAX_DATA_TERM = 30.0f;
     const float DEFAULT_DATA_WEIGHT = 1.0f;
     const float DEFAULT_MAX_DISC_TERM = 160.0f;
     const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
+
+    StereoCSBPImpl::StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type) :
+        min_disp_th_(0), ndisp_(ndisp), iters_(iters), levels_(levels),
+        max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
+        max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
+        msg_type_(msg_type), nr_plane_(nr_plane), use_local_init_data_cost_(true)
+    {   // Only stores the arguments; nothing is validated here (the stream overload of compute() asserts on them).
+    }
+
+    void StereoCSBPImpl::compute(InputArray left, InputArray right, OutputArray disparity)
+    {
+        compute(left, right, disparity, Stream::Null()); // same work as the stream overload, issued on Stream::Null()
+    }
+
+    void StereoCSBPImpl::compute(InputArray _left, InputArray _right, OutputArray disp, Stream& _stream)
+    {
+        using namespace cv::gpu::cudev::stereocsbp;
+        // Computes the disparity map for the left/right pair on _stream; output is CV_16SC1 unless disp has a fixed type, in which case the result is converted to it.
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( 1 < ndisp_ && 0 < iters_ && 0 < levels_ && 0 < nr_plane_ && levels_ <= 8 ); // ndisp_ >= 2 keeps the level clamp below from reaching 0
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        ////////////////////////////////////////////////////////////////////////////////////////////
+        // Init
+
+        int rows = left.rows;
+        int cols = left.cols;
+        // Limit the pyramid depth to int(log2(ndisp_)); note that this overwrites the levels_ member.
+        levels_ = std::min(levels_, int(log((double)ndisp_) / log(2.0)));
+
+        // compute sizes
+        AutoBuffer<int> buf(levels_ * 3);
+        int* cols_pyr = buf;
+        int* rows_pyr = cols_pyr + levels_;
+        int* nr_plane_pyr = rows_pyr + levels_;
+
+        cols_pyr[0]     = cols;
+        rows_pyr[0]     = rows;
+        nr_plane_pyr[0] = nr_plane_;
+
+        for (int i = 1; i < levels_; i++)
+        {
+            cols_pyr[i]     = cols_pyr[i-1] / 2;
+            rows_pyr[i]     = rows_pyr[i-1] / 2;
+            nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
+        }
+
+        GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
+
+        //allocate buffers
+        int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
+        buffers_count += 2; //  data_cost has twice more rows than other buffers, that's why +2, not +1;
+        buffers_count += 1; //  data_cost_selected
+        mbuf_.create(rows * nr_plane_ * buffers_count, cols, msg_type_);
+
+        data_cost          = mbuf_.rowRange(0, rows * nr_plane_ * 2);
+        data_cost_selected = mbuf_.rowRange(data_cost.rows, data_cost.rows + rows * nr_plane_);
+
+        for(int k = 0; k < 2; ++k) // in/out
+        {
+            GpuMat sub1 = mbuf_.rowRange(data_cost.rows + data_cost_selected.rows, mbuf_.rows);
+            GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
+
+            GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
+            for(int _r = 0; _r < 5; ++_r)
+            {
+                *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
+                CV_DbgAssert( buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * nr_plane_ );
+            }
+        }
+
+        size_t elem_step = mbuf_.step / mbuf_.elemSize();
+
+        Size temp_size = data_cost.size();
+        if ((size_t)temp_size.area() < elem_step * rows_pyr[levels_ - 1] * ndisp_)
+            temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels_ - 1] * ndisp_);
+
+        temp_.create(temp_size, msg_type_);
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Compute
+
+        load_constants(ndisp_, max_data_term_, data_weight_, max_disc_term_, disc_single_jump_, min_disp_th_, left, right, temp_);
+
+        l[0].setTo(0, _stream);
+        d[0].setTo(0, _stream);
+        r[0].setTo(0, _stream);
+        u[0].setTo(0, _stream);
+
+        l[1].setTo(0, _stream);
+        d[1].setTo(0, _stream);
+        r[1].setTo(0, _stream);
+        u[1].setTo(0, _stream);
+
+        data_cost.setTo(0, _stream);
+        data_cost_selected.setTo(0, _stream);
+        // cur_idx selects which of the two buffer sets (index 0/1) holds the current messages; it flips after each init_message call.
+        int cur_idx = 0;
+
+        if (msg_type_ == CV_32F)
+        {
+            for (int i = levels_ - 1; i >= 0; i--)
+            {
+                if (i == levels_ - 1)
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
+                                 u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                 disp_selected_pyr[new_idx].ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(),
+                                 data_cost_selected.ptr<float>(), data_cost.ptr<float>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                    data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+        else
+        {
+            for (int i = levels_ - 1; i >= 0; i--)
+            {
+                if (i == levels_ - 1)
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
+                                 u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                 disp_selected_pyr[new_idx].ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(),
+                                 data_cost_selected.ptr<short>(), data_cost.ptr<short>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                    data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+        // compute_disp always writes CV_16SC1; any other fixed output type goes through outBuf_ and a final convertTo.
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1;
+
+        disp.create(rows, cols, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        if (dtype != CV_16SC1)
+        {
+            outBuf_.create(rows, cols, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        if (msg_type_ == CV_32F)
+        {
+            compute_disp(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                         data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+        else
+        {
+            compute_disp(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                         data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream);
+    }
+
+    void StereoCSBPImpl::compute(InputArray /*data*/, OutputArray /*disparity*/, Stream& /*stream*/)
+    {
+        CV_Error(Error::StsNotImplemented, "Not implemented"); // the (data, disparity, stream) overload is not supported by this class; all arguments are ignored
+    }
+}
+
+Ptr<gpu::StereoConstantSpaceBP> cv::gpu::createStereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, int msg_type)
+{
+    return new StereoCSBPImpl(ndisp, iters, levels, nr_plane, msg_type); // arguments are forwarded as-is; they are only checked when compute() runs
 }
 
 void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
@@ -114,174 +384,4 @@ void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int he
     nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
 }
 
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      int msg_type_)
-
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
-                                                      int min_disp_th_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-template<class T>
-static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
-        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
-
-    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));
-
-    const Scalar zero = Scalar::all(0);
-
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////
-    // Init
-
-    int rows = left.rows;
-    int cols = left.cols;
-
-    rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
-    int levels = rthis.levels;
-
-    // compute sizes
-    AutoBuffer<int> buf(levels * 3);
-    int* cols_pyr = buf;
-    int* rows_pyr = cols_pyr + levels;
-    int* nr_plane_pyr = rows_pyr + levels;
-
-    cols_pyr[0]     = cols;
-    rows_pyr[0]     = rows;
-    nr_plane_pyr[0] = rthis.nr_plane;
-
-    for (int i = 1; i < levels; i++)
-    {
-        cols_pyr[i]     = cols_pyr[i-1] / 2;
-        rows_pyr[i]     = rows_pyr[i-1] / 2;
-        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
-    }
-
-
-    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
-
-
-    //allocate buffers
-    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
-    buffers_count += 2; //  data_cost has twice more rows than other buffers, what's why +2, not +1;
-    buffers_count += 1; //  data_cost_selected
-    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);
-
-    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
-    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);
-
-    for(int k = 0; k < 2; ++k) // in/out
-    {
-        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
-        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
-
-        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
-        for(int _r = 0; _r < 5; ++_r)
-        {
-            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
-            CV_DbgAssert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
-        }
-    };
-
-    size_t elem_step = mbuf.step / sizeof(T);
-
-    Size temp_size = data_cost.size();
-    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
-        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);
-
-    temp.create(temp_size, DataType<T>::type);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Compute
-
-    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
-
-    l[0].setTo(zero, stream);
-    d[0].setTo(zero, stream);
-    r[0].setTo(zero, stream);
-    u[0].setTo(zero, stream);
-
-    l[1].setTo(zero, stream);
-    d[1].setTo(zero, stream);
-    r[1].setTo(zero, stream);
-    u[1].setTo(zero, stream);
-
-    data_cost.setTo(zero, stream);
-    data_cost_selected.setTo(zero, stream);
-
-    int cur_idx = 0;
-
-    for (int i = levels - 1; i >= 0; i--)
-    {
-        if (i == levels - 1)
-        {
-            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
-                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
-        }
-        else
-        {
-            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
-                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);
-
-            int new_idx = (cur_idx + 1) & 1;
-
-            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
-                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
-                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
-                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);
-
-            cur_idx = new_idx;
-        }
-
-        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
-                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
-    }
-
-    if (disp.empty())
-        disp.create(rows, cols, CV_16S);
-
-    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-    out.setTo(zero, stream);
-
-    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);
-
-    if (disp.type() != CV_16S)
-    {
-        out.convertTo(disp, disp.type(), stream);
-    }
-}
-
-
-typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat& mbuf,
-                                     GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream);
-
-const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, messages_buffers, temp, out, left, right, disp, stream);
-}
-
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpustereo/src/util.cpp b/modules/gpustereo/src/util.cpp
index 9bff6fff21..e58b5a18e0 100644
--- a/modules/gpustereo/src/util.cpp
+++ b/modules/gpustereo/src/util.cpp
@@ -47,8 +47,8 @@ using namespace cv::gpu;
 
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
 
-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::reprojectImageTo3D(InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::drawColorDisp(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
 
 #else
 
@@ -61,7 +61,7 @@ namespace cv { namespace gpu { namespace cudev
     void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
 }}}
 
-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream)
+void cv::gpu::reprojectImageTo3D(InputArray _disp, OutputArray _xyz, InputArray _Q, int dst_cn, Stream& stream)
 {
     using namespace cv::gpu::cudev;
 
@@ -72,11 +72,15 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
         {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
     };
 
-    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S);
-    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous());
-    CV_Assert(dst_cn == 3 || dst_cn == 4);
+    GpuMat disp = _disp.getGpuMat();
+    Mat Q = _Q.getMat();
 
-    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
+    CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+    CV_Assert( Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous() );
+    CV_Assert( dst_cn == 3 || dst_cn == 4 );
+
+    _xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
+    GpuMat xyz = _xyz.getGpuMat();
 
     funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
 }
@@ -93,23 +97,25 @@ namespace cv { namespace gpu { namespace cudev
 namespace
 {
     template <typename T>
-    void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
+    void drawColorDisp_caller(const GpuMat& src, OutputArray _dst, int ndisp, const cudaStream_t& stream)
     {
         using namespace ::cv::gpu::cudev;
 
-        dst.create(src.size(), CV_8UC4);
+        _dst.create(src.size(), CV_8UC4);
+        GpuMat dst = _dst.getGpuMat();
 
         drawColorDisp_gpu((PtrStepSz<T>)src, dst, ndisp, stream);
     }
-
-    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
-
-    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
 }
 
-void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
+void cv::gpu::drawColorDisp(InputArray _src, OutputArray dst, int ndisp, Stream& stream)
 {
-    CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
+    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, OutputArray dst, int ndisp, const cudaStream_t& stream);
+    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
+
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8U || src.type() == CV_16S );
 
     drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
 }
diff --git a/modules/gpustereo/test/test_stereo.cpp b/modules/gpustereo/test/test_stereo.cpp
index 0ead03dc5c..9a3d94627e 100644
--- a/modules/gpustereo/test/test_stereo.cpp
+++ b/modules/gpustereo/test/test_stereo.cpp
@@ -71,10 +71,10 @@ GPU_TEST_P(StereoBM, Regression)
     ASSERT_FALSE(right_image.empty());
     ASSERT_FALSE(disp_gold.empty());
 
-    cv::gpu::StereoBM_GPU bm(0, 128, 19);
+    cv::Ptr<cv::StereoBM> bm = cv::gpu::createStereoBM(128, 19);
     cv::gpu::GpuMat disp;
 
-    bm(loadMat(left_image), loadMat(right_image), disp);
+    bm->compute(loadMat(left_image), loadMat(right_image), disp);
 
     EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
 }
@@ -106,10 +106,15 @@ GPU_TEST_P(StereoBeliefPropagation, Regression)
     ASSERT_FALSE(right_image.empty());
     ASSERT_FALSE(disp_gold.empty());
 
-    cv::gpu::StereoBeliefPropagation bp(64, 8, 2, 25, 0.1f, 15, 1, CV_16S);
+    cv::Ptr<cv::gpu::StereoBeliefPropagation> bp = cv::gpu::createStereoBeliefPropagation(64, 8, 2, CV_16S);
+    bp->setMaxDataTerm(25.0);
+    bp->setDataWeight(0.1);
+    bp->setMaxDiscTerm(15.0);
+    bp->setDiscSingleJump(1.0);
+
     cv::gpu::GpuMat disp;
 
-    bp(loadMat(left_image), loadMat(right_image), disp);
+    bp->compute(loadMat(left_image), loadMat(right_image), disp);
 
     cv::Mat h_disp(disp);
     h_disp.convertTo(h_disp, disp_gold.depth());
@@ -150,10 +155,10 @@ GPU_TEST_P(StereoConstantSpaceBP, Regression)
     ASSERT_FALSE(right_image.empty());
     ASSERT_FALSE(disp_gold.empty());
 
-    cv::gpu::StereoConstantSpaceBP csbp(128, 16, 4, 4);
+    cv::Ptr<cv::gpu::StereoConstantSpaceBP> csbp = cv::gpu::createStereoConstantSpaceBP(128, 16, 4, 4);
     cv::gpu::GpuMat disp;
 
-    csbp(loadMat(left_image), loadMat(right_image), disp);
+    csbp->compute(loadMat(left_image), loadMat(right_image), disp);
 
     cv::Mat h_disp(disp);
     h_disp.convertTo(h_disp, disp_gold.depth());
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index f473171ef6..824466095c 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -185,7 +185,11 @@ if(HAVE_XIMEA)
   if(XIMEA_LIBRARY_DIR)
     link_directories(${XIMEA_LIBRARY_DIR})
   endif()
-  list(APPEND HIGHGUI_LIBRARIES m3api)
+  if(CMAKE_CL_64) # NOTE(review): CMAKE_CL_64 is only set for MSVC-style compilers targeting 64-bit; other 64-bit toolchains take the else() branch - confirm intended
+    list(APPEND HIGHGUI_LIBRARIES m3apiX64)
+  else()
+    list(APPEND HIGHGUI_LIBRARIES m3api)
+  endif()
 endif(HAVE_XIMEA)
 
 if(HAVE_FFMPEG)
diff --git a/modules/highgui/doc/user_interface.rst b/modules/highgui/doc/user_interface.rst
index f84a04c21e..e4276718b1 100644
--- a/modules/highgui/doc/user_interface.rst
+++ b/modules/highgui/doc/user_interface.rst
@@ -9,6 +9,8 @@ Creates a trackbar and attaches it to the specified window.
 
 .. ocv:function:: int createTrackbar( const String& trackbarname, const String& winname, int* value, int count, TrackbarCallback onChange=0, void* userdata=0)
 
+.. ocv:pyfunction:: cv2.createTrackbar(trackbarName, windowName, value, count, onChange) -> None
+
 .. ocv:cfunction:: int cvCreateTrackbar( const char* trackbar_name, const char* window_name, int* value, int count, CvTrackbarCallback on_change=NULL )
 
     :param trackbarname: Name of the created trackbar.
@@ -181,6 +183,8 @@ Sets mouse handler for the specified window
 
 .. ocv:function:: void setMouseCallback( const String& winname, MouseCallback onMouse, void* userdata=0 )
 
+.. ocv:pyfunction:: cv2.setMouseCallback(windowName, onMouse [, param]) -> None
+
 .. ocv:cfunction:: void cvSetMouseCallback( const char* window_name, CvMouseCallback on_mouse, void* param=NULL )
 
     :param winname: Window name
diff --git a/modules/highgui/include/opencv2/highgui/cap_ios.h b/modules/highgui/include/opencv2/highgui/cap_ios.h
index 97d9a08af6..4d270aba9e 100644
--- a/modules/highgui/include/opencv2/highgui/cap_ios.h
+++ b/modules/highgui/include/opencv2/highgui/cap_ios.h
@@ -1,6 +1,4 @@
-/*
- *  cap_ios.h
- *  For iOS video I/O
+/*  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
  *  Copyright 2012. All rights reserved.
  *
@@ -90,6 +88,12 @@
 - (void)createVideoPreviewLayer;
 - (void)updateOrientation;
 
+- (void)lockFocus;
+- (void)unlockFocus;
+- (void)lockExposure;
+- (void)unlockExposure;
+- (void)lockBalance;
+- (void)unlockBalance;
 
 @end
 
@@ -116,6 +120,7 @@
     BOOL grayscaleMode;
 
     BOOL recordVideo;
+    BOOL rotateVideo;
     AVAssetWriterInput* recordAssetWriterInput;
     AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
     AVAssetWriter* recordAssetWriter;
@@ -128,6 +133,7 @@
 @property (nonatomic, assign) BOOL grayscaleMode;
 
 @property (nonatomic, assign) BOOL recordVideo;
+@property (nonatomic, assign) BOOL rotateVideo;
 @property (nonatomic, retain) AVAssetWriterInput* recordAssetWriterInput;
 @property (nonatomic, retain) AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
 @property (nonatomic, retain) AVAssetWriter* recordAssetWriter;
diff --git a/modules/highgui/src/cap_dc1394_v2.cpp b/modules/highgui/src/cap_dc1394_v2.cpp
index 2aa494fac7..ea7e4b2b86 100644
--- a/modules/highgui/src/cap_dc1394_v2.cpp
+++ b/modules/highgui/src/cap_dc1394_v2.cpp
@@ -45,7 +45,16 @@
 
 #include <unistd.h>
 #include <stdint.h>
-#include <sys/select.h>
+#if defined(WIN32) || defined(_WIN32)
+  // Windows has no sys/select.h; select() is declared by winsock2.h instead.
+  // NOTE(review): __SYS_SELECT_H__ is presumably pre-defined so later headers skip <sys/select.h> - confirm.
+  #ifndef __SYS_SELECT_H__
+    #define __SYS_SELECT_H__ 1
+    #include <winsock2.h>
+  #endif
+#else
+  #include <sys/select.h>
+#endif /* WIN32 || _WIN32 */
 #include <dc1394/dc1394.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/modules/highgui/src/cap_dshow.cpp b/modules/highgui/src/cap_dshow.cpp
index 42741e83b4..8e5baaa403 100644
--- a/modules/highgui/src/cap_dshow.cpp
+++ b/modules/highgui/src/cap_dshow.cpp
@@ -3193,8 +3193,10 @@ IplImage* CvCaptureCAM_DShow::retrieveFrame(int)
         frame = cvCreateImage( cvSize(w,h), 8, 3 );
     }
 
-    VI.getPixels( index, (uchar*)frame->imageData, false, true );
-    return frame;
+    if (VI.getPixels( index, (uchar*)frame->imageData, false, true ))
+        return frame;
+    else
+        return NULL;
 }
 
 double CvCaptureCAM_DShow::getProperty( int property_id )
diff --git a/modules/highgui/src/cap_ffmpeg_impl.hpp b/modules/highgui/src/cap_ffmpeg_impl.hpp
index f5d6b48065..99da45f4c0 100644
--- a/modules/highgui/src/cap_ffmpeg_impl.hpp
+++ b/modules/highgui/src/cap_ffmpeg_impl.hpp
@@ -41,6 +41,9 @@
 //M*/
 
 #include "cap_ffmpeg_api.hpp"
+#if !(defined(WIN32) || defined(_WIN32) || defined(WINCE))
+# include <pthread.h>
+#endif
 #include <assert.h>
 #include <algorithm>
 #include <limits>
diff --git a/modules/highgui/src/cap_gstreamer.cpp b/modules/highgui/src/cap_gstreamer.cpp
index b8f4eb83f3..4d4dc711bc 100644
--- a/modules/highgui/src/cap_gstreamer.cpp
+++ b/modules/highgui/src/cap_gstreamer.cpp
@@ -651,17 +651,47 @@ bool CvCapture_GStreamer::open( int type, const char* filename )
 
     if(manualpipeline)
     {
+        GstIterator *it = NULL;
 #if GST_VERSION_MAJOR == 0
-        GstIterator *it = gst_bin_iterate_sinks(GST_BIN(uridecodebin));
+        it = gst_bin_iterate_sinks(GST_BIN(uridecodebin));
         if(gst_iterator_next(it, (gpointer *)&sink) != GST_ITERATOR_OK) {
             CV_ERROR(CV_StsError, "GStreamer: cannot find appsink in manual pipeline\n");
             return false;
         }
 #else
-        sink = gst_bin_get_by_name(GST_BIN(uridecodebin), "opencvsink");
-        if (!sink){
-            sink = gst_bin_get_by_name(GST_BIN(uridecodebin), "appsink0");
+        it = gst_bin_iterate_sinks (GST_BIN(uridecodebin));
+
+        gboolean done = FALSE;
+        GstElement *element = NULL;
+        gchar* name = NULL;
+        GValue value = G_VALUE_INIT;
+
+        while (!done) {
+          switch (gst_iterator_next (it, &value)) {
+            case GST_ITERATOR_OK:
+              element = GST_ELEMENT (g_value_get_object (&value));
+              name = gst_element_get_name(element);
+              if (name){
+                if(strstr(name, "opencvsink") != NULL || strstr(name, "appsink") != NULL) {
+                  sink = GST_ELEMENT ( gst_object_ref (element) );
+                  done = TRUE;
+                }
+                g_free(name);
+              }
+              g_value_unset (&value);
+
+              break;
+            case GST_ITERATOR_RESYNC:
+              gst_iterator_resync (it);
+              break;
+            case GST_ITERATOR_ERROR:
+            case GST_ITERATOR_DONE:
+              done = TRUE;
+              break;
+          }
         }
+        gst_iterator_free (it);
+
 
         if (!sink){
             CV_ERROR(CV_StsError, "GStreamer: cannot find appsink in manual pipeline\n");
@@ -1030,6 +1060,12 @@ void CvVideoWriter_GStreamer::close()
         handleMessage(pipeline);
 
         gst_object_unref (GST_OBJECT (pipeline));
+
+        if (source)
+          gst_object_unref (GST_OBJECT (source));
+
+        if (file)
+          gst_object_unref (GST_OBJECT (file));
     }
 }
 
@@ -1127,9 +1163,7 @@ bool CvVideoWriter_GStreamer::open( const char * filename, int fourcc,
     GstEncodingVideoProfile* videoprofile = NULL;
 #endif
 
-#if GST_VERSION_MAJOR == 0
     GstIterator *it = NULL;
-#endif
 
     // we first try to construct a pipeline from the given string.
     // if that fails, we assume it is an ordinary filename
@@ -1150,10 +1184,38 @@ bool CvVideoWriter_GStreamer::open( const char * filename, int fourcc,
             return false;
         }
 #else
-        source = gst_bin_get_by_name(GST_BIN(encodebin), "opencvsrc");
-        if (!source){
-            source = gst_bin_get_by_name(GST_BIN(encodebin), "appsrc0");
+        it = gst_bin_iterate_sources (GST_BIN(encodebin));
+
+        gboolean done = FALSE;
+        GstElement *element = NULL;
+        gchar* name = NULL;
+        GValue value = G_VALUE_INIT;
+
+        while (!done) {
+          switch (gst_iterator_next (it, &value)) {
+            case GST_ITERATOR_OK:
+              element = GST_ELEMENT (g_value_get_object (&value));
+              name = gst_element_get_name(element);
+              if (name){
+                if(strstr(name, "opencvsrc") != NULL || strstr(name, "appsrc") != NULL) {
+                  source = GST_ELEMENT ( gst_object_ref (element) );
+                  done = TRUE;
+                }
+                g_free(name);
+              }
+              g_value_unset (&value);
+
+              break;
+            case GST_ITERATOR_RESYNC:
+              gst_iterator_resync (it);
+              break;
+            case GST_ITERATOR_ERROR:
+            case GST_ITERATOR_DONE:
+              done = TRUE;
+              break;
+          }
         }
+        gst_iterator_free (it);
 
         if (!source){
             CV_ERROR(CV_StsError, "GStreamer: cannot find appsrc in manual pipeline\n");
diff --git a/modules/highgui/src/cap_ios_abstract_camera.mm b/modules/highgui/src/cap_ios_abstract_camera.mm
index b6a7d944fa..38e1c12e68 100644
--- a/modules/highgui/src/cap_ios_abstract_camera.mm
+++ b/modules/highgui/src/cap_ios_abstract_camera.mm
@@ -2,6 +2,7 @@
  *  cap_ios_abstract_camera.mm
  *  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
+ *  by Alexander Shishkov on 17/07/13
  *  Copyright 2012. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -405,4 +406,89 @@
     }
 }
 
+- (void)lockFocus;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isFocusModeSupported:AVCaptureFocusModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.focusMode = AVCaptureFocusModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked focus configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockFocus;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for autofocus configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void)lockExposure;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isExposureModeSupported:AVCaptureExposureModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.exposureMode = AVCaptureExposureModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked exposure configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockExposure;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isExposureModeSupported:AVCaptureExposureModeContinuousAutoExposure]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.exposureMode = AVCaptureExposureModeContinuousAutoExposure;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for autoexposure configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void)lockBalance;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeLocked]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.whiteBalanceMode = AVCaptureWhiteBalanceModeLocked;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for locked white balance configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
+- (void) unlockBalance;
+{
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance]) {
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            device.whiteBalanceMode = AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance;
+            [device unlockForConfiguration];
+        } else {
+            NSLog(@"unable to lock device for auto white balance configuration %@", [error localizedDescription]);
+        }
+    }
+}
+
 @end
+
diff --git a/modules/highgui/src/cap_ios_video_camera.mm b/modules/highgui/src/cap_ios_video_camera.mm
index 1f9ea14bf8..ac85f79ee5 100644
--- a/modules/highgui/src/cap_ios_video_camera.mm
+++ b/modules/highgui/src/cap_ios_video_camera.mm
@@ -2,6 +2,7 @@
  *  cap_ios_video_camera.mm
  *  For iOS video I/O
  *  by Eduard Feicho on 29/07/12
+ *  by Alexander Shishkov on 17/07/13
  *  Copyright 2012. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,7 +31,6 @@
 
 #import "opencv2/highgui/cap_ios.h"
 #include "precomp.hpp"
-
 #import <AssetsLibrary/AssetsLibrary.h>
 
 
@@ -70,6 +70,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 @synthesize videoDataOutput;
 
 @synthesize recordVideo;
+@synthesize rotateVideo;
 //@synthesize videoFileOutput;
 @synthesize recordAssetWriterInput;
 @synthesize recordPixelBufferAdaptor;
@@ -85,6 +86,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
     if (self) {
         self.useAVCaptureVideoPreviewLayer = NO;
         self.recordVideo = NO;
+        self.rotateVideo = NO;
     }
     return self;
 }
@@ -269,13 +271,8 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 
 }
 
-
-
-
 #pragma mark - Private Interface
 
-
-
 - (void)createVideoDataOutput;
 {
     // Make a video data output
@@ -389,6 +386,38 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
     [self.parentView.layer addSublayer:self.customPreviewLayer];
 }
 
+- (CVPixelBufferRef) pixelBufferFromCGImage: (CGImageRef) image
+{
+    // Renders `image` into a newly created 32ARGB pixel buffer of the same size; returns NULL on failure.
+    CGSize frameSize = CGSizeMake(CGImageGetWidth(image), CGImageGetHeight(image));
+    NSDictionary *options = [NSDictionary dictionaryWithObjectsAndKeys:
+                             [NSNumber numberWithBool:NO], kCVPixelBufferCGImageCompatibilityKey,
+                             [NSNumber numberWithBool:NO], kCVPixelBufferCGBitmapContextCompatibilityKey,
+                             nil];
+    CVPixelBufferRef pxbuffer = NULL;
+    CVReturn status = CVPixelBufferCreate(kCFAllocatorDefault, frameSize.width,
+                                          frameSize.height,  kCVPixelFormatType_32ARGB, (__bridge CFDictionaryRef) options, // plain bridge: no ownership transfer, so nothing leaks
+                                          &pxbuffer);
+    NSParameterAssert(status == kCVReturnSuccess && pxbuffer != NULL);
+    if (status != kCVReturnSuccess || pxbuffer == NULL) return NULL; // the assert above may be compiled out
+    CVPixelBufferLockBaseAddress(pxbuffer, 0);
+    void *pxdata = CVPixelBufferGetBaseAddress(pxbuffer);
+
+    // Draw straight into the buffer's memory; use the buffer's own stride because rows may be padded.
+    CGColorSpaceRef rgbColorSpace = CGColorSpaceCreateDeviceRGB();
+    CGContextRef context = CGBitmapContextCreate(pxdata, frameSize.width,
+                                                 frameSize.height, 8, CVPixelBufferGetBytesPerRow(pxbuffer), rgbColorSpace,
+                                                 kCGImageAlphaPremultipliedFirst);
+
+    CGContextDrawImage(context, CGRectMake(0, 0, CGImageGetWidth(image),
+                                           CGImageGetHeight(image)), image);
+    CGColorSpaceRelease(rgbColorSpace);
+    CGContextRelease(context);
+
+    CVPixelBufferUnlockBaseAddress(pxbuffer, 0);
+
+    return pxbuffer; // Create rule: the caller owns the buffer and must balance it with CVPixelBufferRelease()
+}
 
 #pragma mark - Protocol AVCaptureVideoDataOutputSampleBufferDelegate
 
@@ -522,7 +551,8 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
             }
 
             if (self.recordAssetWriterInput.readyForMoreMediaData) {
-                if (! [self.recordPixelBufferAdaptor appendPixelBuffer:imageBuffer
+                CVImageBufferRef pixelBuffer = [self pixelBufferFromCGImage:dstImage];
+                if (! [self.recordPixelBufferAdaptor appendPixelBuffer:pixelBuffer
                                                   withPresentationTime:lastSampleTime] ) {
                     NSLog(@"Video Writing Error");
                 }
@@ -543,9 +573,12 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 
 - (void)updateOrientation;
 {
-    NSLog(@"rotate..");
-    self.customPreviewLayer.bounds = CGRectMake(0, 0, self.parentView.frame.size.width, self.parentView.frame.size.height);
-    [self layoutPreviewLayer];
+    if (self.rotateVideo == YES)
+    {
+        NSLog(@"rotate..");
+        self.customPreviewLayer.bounds = CGRectMake(0, 0, self.parentView.frame.size.width, self.parentView.frame.size.height);
+        [self layoutPreviewLayer];
+    }
 }
 
 
@@ -583,3 +616,4 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
 }
 
 @end
+
diff --git a/modules/highgui/src/cap_ximea.cpp b/modules/highgui/src/cap_ximea.cpp
index dbb8f58683..5acf2c09d1 100644
--- a/modules/highgui/src/cap_ximea.cpp
+++ b/modules/highgui/src/cap_ximea.cpp
@@ -20,25 +20,24 @@ public:
     virtual IplImage* retrieveFrame(int);
     virtual int getCaptureDomain() { return CV_CAP_XIAPI; } // Return the type of the capture object: CV_CAP_VFW, etc...
 
-protected:
+private:
     void init();
     void errMsg(const char* msg, int errNum);
+    void resetCvImage();
+    int  getBpp();
     IplImage* frame;
 
     HANDLE    hmv;
     DWORD     numDevices;
-    XI_IMG    image;
-    int       width;
-    int       height;
-    int       format;
     int       timeout;
+    XI_IMG    image;
 };
 
 /**********************************************************************************/
 
 CvCapture* cvCreateCameraCapture_XIMEA( int index )
 {
-     CvCaptureCAM_XIMEA* capture = new CvCaptureCAM_XIMEA;
+    CvCaptureCAM_XIMEA* capture = new CvCaptureCAM_XIMEA;
 
     if( capture->open( index ))
         return capture;
@@ -79,18 +78,19 @@ bool CvCaptureCAM_XIMEA::open( int wIndex )
     // always use auto white ballance
     mvret = xiSetParamInt( hmv, XI_PRM_AUTO_WB, 1);
     if(mvret != XI_OK) goto error;
+    
+    // default image format RGB24
+    mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, XI_RGB24);
+    if(mvret != XI_OK) goto error;
 
+    int width; width = 0;   // declared without initializer: the earlier 'goto error' jumps must not bypass an initialization
     mvret = xiGetParamInt( hmv, XI_PRM_WIDTH, &width);
     if(mvret != XI_OK) goto error;
 
+    int height; height = 0; // same reason as for width
     mvret = xiGetParamInt( hmv, XI_PRM_HEIGHT, &height);
     if(mvret != XI_OK) goto error;
 
-    // default image format RGB24
-    format = XI_RGB24;
-    mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, format);
-    if(mvret != XI_OK) goto error;
-
     // allocate frame buffer for RGB24 image
     frame = cvCreateImage(cvSize( width, height), IPL_DEPTH_8U, 3);
 
@@ -103,10 +103,10 @@ bool CvCaptureCAM_XIMEA::open( int wIndex )
         errMsg("StartAcquisition XI_DEVICE failed", mvret);
         goto error;
     }
-
     return true;
 
 error:
+    errMsg("Open XI_DEVICE failed", mvret);
     xiCloseDevice(hmv);
     hmv = NULL;
     return false;
@@ -116,18 +116,19 @@ error:
 
 void CvCaptureCAM_XIMEA::close()
 {
-    if(hmv)
-    {
-        xiStopAcquisition(hmv);
-        xiCloseDevice(hmv);
-        hmv = NULL;
-    }
+    if(frame)
+        cvReleaseImage(&frame);
+    if(!hmv) return; // never opened, open() failed, or already closed: there is no device handle to stop/close
+    xiStopAcquisition(hmv);
+    xiCloseDevice(hmv);
+    hmv = NULL;
 }
 
 /**********************************************************************************/
 
 bool CvCaptureCAM_XIMEA::grabFrame()
 {
+    memset(&image, 0, sizeof(XI_IMG));
     image.size = sizeof(XI_IMG);
     int mvret = xiGetImage( hmv, timeout, &image);
 
@@ -151,31 +152,18 @@ bool CvCaptureCAM_XIMEA::grabFrame()
 IplImage* CvCaptureCAM_XIMEA::retrieveFrame(int)
 {
     // update cvImage after format has changed
-    if( (int)image.width != width || (int)image.height != height || image.frm != (XI_IMG_FORMAT)format)
-    {
-        cvReleaseImage(&frame);
-        switch( image.frm)
-        {
-        case XI_MONO8  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 1); break;
-        case XI_MONO16 : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_16U, 1); break;
-        case XI_RGB24  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 3); break;
-        case XI_RGB32  : frame = cvCreateImage(cvSize( image.width, image.height), IPL_DEPTH_8U, 4); break;
-        default :
-            return frame;
-        }
-        // update global image format
-        format = image.frm;
-        width = image.width;
-        height = image.height;
-    }
-
+    resetCvImage();
+    
     // copy pixel data
     switch( image.frm)
     {
-    case XI_MONO8  : memcpy( frame->imageData, image.bp, image.width*image.height); break;
-    case XI_MONO16 : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(WORD)); break;
-    case XI_RGB24  : memcpy( frame->imageData, image.bp, image.width*image.height*3); break;
-    case XI_RGB32  : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(DWORD)); break;
+    case XI_MONO8       : 
+    case XI_RAW8        : memcpy( frame->imageData, image.bp, image.width*image.height); break;
+    case XI_MONO16      :
+    case XI_RAW16       : memcpy( frame->imageData, image.bp, image.width*image.height*sizeof(WORD)); break;
+    case XI_RGB24       :
+    case XI_RGB_PLANAR  : memcpy( frame->imageData, image.bp, image.width*image.height*3); break;
+    case XI_RGB32       : memcpy( frame->imageData, image.bp, image.width*image.height*4); break;
     default: break;
     }
     return frame;
@@ -183,6 +171,35 @@ IplImage* CvCaptureCAM_XIMEA::retrieveFrame(int)
 
 /**********************************************************************************/
 
+void CvCaptureCAM_XIMEA::resetCvImage()
+{
+    // Make `frame` match the size/depth/channel count of the last grabbed XI_IMG, then clear it.
+    int depth = 0, channels = 0;
+    switch( image.frm)
+    {
+    case XI_MONO8       :
+    case XI_RAW8        : depth = IPL_DEPTH_8U;  channels = 1; break;
+    case XI_MONO16      :
+    case XI_RAW16       : depth = IPL_DEPTH_16U; channels = 1; break;
+    case XI_RGB24       :
+    case XI_RGB_PLANAR  : depth = IPL_DEPTH_8U;  channels = 3; break;
+    case XI_RGB32       : depth = IPL_DEPTH_8U;  channels = 4; break;
+    default : // unsupported pixel format: drop the buffer so retrieveFrame() hands back NULL
+        if(frame) cvReleaseImage(&frame);
+        return;
+    }
+
+    // Compare against the buffer we actually hold (not the camera parameters) and cope with frame == NULL.
+    if( !frame || frame->width != (int)image.width || frame->height != (int)image.height ||
+        frame->depth != depth || frame->nChannels != channels)
+    {
+        if(frame) cvReleaseImage(&frame);
+        frame = cvCreateImage(cvSize( image.width, image.height), depth, channels);
+    }
+    cvZero(frame);
+}
+/**********************************************************************************/
+
 double CvCaptureCAM_XIMEA::getProperty( int property_id )
 {
     if(hmv == NULL)
@@ -238,20 +255,14 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
     switch(property_id)
     {
     // OCV parameters
-    case CV_CAP_PROP_FRAME_WIDTH  : mvret = xiSetParamInt( hmv, XI_PRM_WIDTH, ival);
-        if(mvret == XI_OK) width = ival;
-        break;
-    case CV_CAP_PROP_FRAME_HEIGHT : mvret = xiSetParamInt( hmv, XI_PRM_HEIGHT, ival);
-        if(mvret == XI_OK) height = ival;
-        break;
+    case CV_CAP_PROP_FRAME_WIDTH  : mvret = xiSetParamInt( hmv, XI_PRM_WIDTH, ival); break;
+    case CV_CAP_PROP_FRAME_HEIGHT : mvret = xiSetParamInt( hmv, XI_PRM_HEIGHT, ival); break;
     case CV_CAP_PROP_FPS          : mvret = xiSetParamFloat( hmv, XI_PRM_FRAMERATE, fval); break;
     case CV_CAP_PROP_GAIN         : mvret = xiSetParamFloat( hmv, XI_PRM_GAIN, fval); break;
     case CV_CAP_PROP_EXPOSURE     : mvret = xiSetParamInt( hmv, XI_PRM_EXPOSURE, ival); break;
     // XIMEA camera properties
     case CV_CAP_PROP_XI_DOWNSAMPLING  : mvret = xiSetParamInt( hmv, XI_PRM_DOWNSAMPLING, ival); break;
-    case CV_CAP_PROP_XI_DATA_FORMAT   : mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, ival);
-        if(mvret == XI_OK) format = ival;
-        break;
+    case CV_CAP_PROP_XI_DATA_FORMAT   : mvret = xiSetParamInt( hmv, XI_PRM_IMAGE_DATA_FORMAT, ival); break;
     case CV_CAP_PROP_XI_OFFSET_X      : mvret = xiSetParamInt( hmv, XI_PRM_OFFSET_X, ival); break;
     case CV_CAP_PROP_XI_OFFSET_Y      : mvret = xiSetParamInt( hmv, XI_PRM_OFFSET_Y, ival); break;
     case CV_CAP_PROP_XI_TRG_SOURCE    : mvret = xiSetParamInt( hmv, XI_PRM_TRG_SOURCE, ival); break;
@@ -288,7 +299,7 @@ bool CvCaptureCAM_XIMEA::setProperty( int property_id, double value )
 void CvCaptureCAM_XIMEA::errMsg(const char* msg, int errNum)
 {
 #if defined WIN32 || defined _WIN32
-    char buf[512];
+    char buf[512]="";
     sprintf( buf, "%s : %d\n", msg, errNum);
     OutputDebugString(buf);
 #else
@@ -296,4 +307,22 @@ void CvCaptureCAM_XIMEA::errMsg(const char* msg, int errNum)
 #endif
 }
 
+/**********************************************************************************/
+
+int  CvCaptureCAM_XIMEA::getBpp()
+{
+    switch( image.frm)
+    {
+    case XI_MONO8       :
+    case XI_RAW8        : return 1;
+    case XI_MONO16      : 
+    case XI_RAW16       : return 2;
+    case XI_RGB24       : 
+    case XI_RGB_PLANAR  : return 3;
+    case XI_RGB32       : return 4;
+    default :
+        return 0;
+    }
+}
+
 /**********************************************************************************/
\ No newline at end of file
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 701f5f5781..428ef51ef5 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -256,12 +256,17 @@ namespace
 
 void cv::imshow( const String& winname, InputArray _img )
 {
+    const Size size = _img.size();
 #ifndef HAVE_OPENGL
-    Mat img = _img.getMat();
-    CvMat c_img = img;
-    cvShowImage(winname.c_str(), &c_img);
+    CV_Assert(size.width>0 && size.height>0);
+    {
+        Mat img = _img.getMat();
+        CvMat c_img = img;
+        cvShowImage(winname.c_str(), &c_img);
+    }
 #else
     const double useGl = getWindowProperty(winname, WND_PROP_OPENGL);
+    CV_Assert(size.width>0 && size.height>0);
 
     if (useGl <= 0)
     {
@@ -275,7 +280,6 @@ void cv::imshow( const String& winname, InputArray _img )
 
         if (autoSize > 0)
         {
-            Size size = _img.size();
             resizeWindow(winname, size.width, size.height);
         }
 
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index eda75bb3cd..0a4e8df0f7 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -2473,35 +2473,33 @@ void DefaultViewPort::saveView()
     if (!fileName.isEmpty()) //save the picture
     {
         QString extension = fileName.right(3);
-
-        //   (no need anymore) create the image resized to receive the 'screenshot'
-        //    image2Draw_qt_resized = QImage(viewport()->width(), viewport()->height(),QImage::Format_RGB888);
-
-        QPainter saveimage(&image2Draw_qt_resized);
-        this->render(&saveimage);
+        
+        // Create a new pixmap to render the viewport into
+        QPixmap viewportPixmap(viewport()->size());
+        viewport()->render(&viewportPixmap);
 
         // Save it..
         if (QString::compare(extension, "png", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "PNG");
+            viewportPixmap.save(fileName, "PNG");
             return;
         }
 
         if (QString::compare(extension, "jpg", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "JPG");
+            viewportPixmap.save(fileName, "JPG");
             return;
         }
 
         if (QString::compare(extension, "bmp", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "BMP");
+            viewportPixmap.save(fileName, "BMP");
             return;
         }
 
         if (QString::compare(extension, "jpeg", Qt::CaseInsensitive) == 0)
         {
-            image2Draw_qt_resized.save(fileName, "JPEG");
+            viewportPixmap.save(fileName, "JPEG");
             return;
         }
 
@@ -2651,17 +2649,16 @@ void DefaultViewPort::paintEvent(QPaintEvent* evnt)
     //Now disable matrixWorld for overlay display
     myPainter.setWorldMatrixEnabled(false);
 
+    //overlay pixel values if zoomed in far enough
+    if (param_matrixWorld.m11()*ratioX >= threshold_zoom_img_region &&
+        param_matrixWorld.m11()*ratioY >= threshold_zoom_img_region)
+    {
+        drawImgRegion(&myPainter);
+    }
+
     //in mode zoom/panning
     if (param_matrixWorld.m11() > 1)
     {
-        if (param_matrixWorld.m11() >= threshold_zoom_img_region)
-        {
-            if (centralWidget->param_flags == CV_WINDOW_NORMAL)
-                startDisplayInfo("WARNING: The values displayed are the resized image's values. If you want the original image's values, use CV_WINDOW_AUTOSIZE", 1000);
-
-            drawImgRegion(&myPainter);
-        }
-
         drawViewOverview(&myPainter);
     }
 
@@ -2887,22 +2884,24 @@ void DefaultViewPort::drawStatusBar()
 //accept only CV_8UC1 and CV_8UC8 image for now
 void DefaultViewPort::drawImgRegion(QPainter *painter)
 {
-
     if (nbChannelOriginImage!=CV_8UC1 && nbChannelOriginImage!=CV_8UC3)
         return;
 
-    qreal offsetX = param_matrixWorld.dx()/param_matrixWorld.m11();
+    double pixel_width = param_matrixWorld.m11()*ratioX;
+    double pixel_height = param_matrixWorld.m11()*ratioY;
+
+    qreal offsetX = param_matrixWorld.dx()/pixel_width;
     offsetX = offsetX - floor(offsetX);
-    qreal offsetY = param_matrixWorld.dy()/param_matrixWorld.m11();
+    qreal offsetY = param_matrixWorld.dy()/pixel_height;
     offsetY = offsetY - floor(offsetY);
 
     QSize view = size();
     QVarLengthArray<QLineF, 30> linesX;
-    for (qreal _x = offsetX*param_matrixWorld.m11(); _x < view.width(); _x += param_matrixWorld.m11() )
+    for (qreal _x = offsetX*pixel_width; _x < view.width(); _x += pixel_width )
         linesX.append(QLineF(_x, 0, _x, view.height()));
 
     QVarLengthArray<QLineF, 30> linesY;
-    for (qreal _y = offsetY*param_matrixWorld.m11(); _y < view.height(); _y += param_matrixWorld.m11() )
+    for (qreal _y = offsetY*pixel_height; _y < view.height(); _y += pixel_height )
         linesY.append(QLineF(0, _y, view.width(), _y));
 
 
@@ -2910,27 +2909,25 @@ void DefaultViewPort::drawImgRegion(QPainter *painter)
     int original_font_size = f.pointSize();
     //change font size
     //f.setPointSize(4+(param_matrixWorld.m11()-threshold_zoom_img_region)/5);
-    f.setPixelSize(10+(param_matrixWorld.m11()-threshold_zoom_img_region)/5);
+    f.setPixelSize(10+(pixel_height-threshold_zoom_img_region)/5);
     painter->setFont(f);
-    QString val;
-    QRgb rgbValue;
 
-    QPointF point1;//sorry, I do not know how to name it
-    QPointF point2;//idem
 
-    for (int j=-1;j<height()/param_matrixWorld.m11();j++)//-1 because display the pixels top rows left colums
-        for (int i=-1;i<width()/param_matrixWorld.m11();i++)//-1
+    for (int j=-1;j<height()/pixel_height;j++)//-1 because display the pixels top rows left columns
+        for (int i=-1;i<width()/pixel_width;i++)//-1
         {
-            point1.setX((i+offsetX)*param_matrixWorld.m11());
-            point1.setY((j+offsetY)*param_matrixWorld.m11());
+            // Calculate top left of the pixel's position in the viewport (screen space)
+            QPointF pos_in_view((i+offsetX)*pixel_width, (j+offsetY)*pixel_height);
 
-            matrixWorld_inv.map(point1.x(),point1.y(),&point2.rx(),&point2.ry());
+            // Calculate top left of the pixel's position in the image (image space)
+            QPointF pos_in_image = matrixWorld_inv.map(pos_in_view);// Top left of pixel in view
+            pos_in_image.rx() = pos_in_image.x()/ratioX;
+            pos_in_image.ry() = pos_in_image.y()/ratioY;
+            QPoint point_in_image(pos_in_image.x() + 0.5f,pos_in_image.y() + 0.5f);// Add 0.5 for rounding
 
-            point2.rx()= (long) (point2.x() + 0.5);
-            point2.ry()= (long) (point2.y() + 0.5);
-
-            if (point2.x() >= 0 && point2.y() >= 0)
-                rgbValue = image2Draw_qt_resized.pixel(QPoint(point2.x(),point2.y()));
+            QRgb rgbValue;
+            if (image2Draw_qt.valid(point_in_image))
+                rgbValue = image2Draw_qt.pixel(point_in_image);
             else
                 rgbValue = qRgb(0,0,0);
 
@@ -2943,29 +2940,29 @@ void DefaultViewPort::drawImgRegion(QPainter *painter)
                 painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()/2),
                     Qt::AlignCenter, val);
                 */
+                QString val;
 
                 val = tr("%1").arg(qRed(rgbValue));
                 painter->setPen(QPen(Qt::red, 1));
-                painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y(),pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
                 val = tr("%1").arg(qGreen(rgbValue));
                 painter->setPen(QPen(Qt::green, 1));
-                painter->drawText(QRect(point1.x(),point1.y()+param_matrixWorld.m11()/3,param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y()+pixel_height/3,pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
                 val = tr("%1").arg(qBlue(rgbValue));
                 painter->setPen(QPen(Qt::blue, 1));
-                painter->drawText(QRect(point1.x(),point1.y()+2*param_matrixWorld.m11()/3,param_matrixWorld.m11(),param_matrixWorld.m11()/3),
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y()+2*pixel_height/3,pixel_width,pixel_height/3),
                     Qt::AlignCenter, val);
 
             }
 
             if (nbChannelOriginImage==CV_8UC1)
             {
-
-                val = tr("%1").arg(qRed(rgbValue));
-                painter->drawText(QRect(point1.x(),point1.y(),param_matrixWorld.m11(),param_matrixWorld.m11()),
+                QString val = tr("%1").arg(qRed(rgbValue));
+                painter->drawText(QRect(pos_in_view.x(),pos_in_view.y(),pixel_width,pixel_height),
                     Qt::AlignCenter, val);
             }
         }
diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h
index 089997f514..a96a8c6e69 100644
--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@@ -522,7 +522,6 @@ private:
 
     CvMat* image2Draw_mat;
     QImage image2Draw_qt;
-    QImage image2Draw_qt_resized;
     int nbChannelOriginImage;
 
     //for mouse callback
diff --git a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
index 6f7cba3a9a..136d3e3df4 100644
--- a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
+++ b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
@@ -522,6 +522,24 @@ The function calculates and returns the minimum-area bounding rectangle (possibl
 
 
 
+boxPoints
+-----------
+Finds the four vertices of a rotated rect. Useful to draw the rotated rectangle.
+
+.. ocv:function:: void boxPoints(RotatedRect box, OutputArray points)
+
+.. ocv:pyfunction:: cv2.boxPoints(box[, points]) -> points
+
+.. ocv:cfunction:: void cvBoxPoints( CvBox2D box, CvPoint2D32f pt[4] )
+
+    :param box: The input rotated rectangle. It may be the output of :ocv:func:`minAreaRect`.
+    
+    :param points: The output array of the four vertices of the rectangle.
+    
+The function finds the four vertices of a rotated rectangle. This function is useful to draw the rectangle. In C++, instead of using this function, you can directly use box.points() method. Please visit the `tutorial on bounding rectangle <http://docs.opencv.org/doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.html#bounding-rects-circles>`_ for more information.
+
+
+
 minEnclosingCircle
 ----------------------
 Finds a circle of the minimum area enclosing a 2D point set.
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index fcaf6a58ee..6d61088724 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1318,6 +1318,9 @@ CV_EXPORTS_W double contourArea( InputArray contour, bool oriented = false );
 //! computes the minimal rotated rectangle for a set of points
 CV_EXPORTS_W RotatedRect minAreaRect( InputArray points );
 
+//! computes the four vertices of a rotated rectangle
+CV_EXPORTS_W void boxPoints(RotatedRect box, OutputArray points);
+
 //! computes the minimal enclosing circle for a set of points
 CV_EXPORTS_W void minEnclosingCircle( InputArray points,
                                       CV_OUT Point2f& center, CV_OUT float& radius );
diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp
index df1456c34e..a8932dd736 100644
--- a/modules/imgproc/perf/perf_cvt_color.cpp
+++ b/modules/imgproc/perf/perf_cvt_color.cpp
@@ -258,7 +258,8 @@ PERF_TEST_P(Size_CvtMode, cvtColor8u,
     declare.time(100);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn);
+    int runs = sz.width <= 320 ? 70 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
     SANITY_CHECK(dst, 1);
 }
@@ -334,7 +335,8 @@ PERF_TEST_P(Size_CvtMode3, cvtColorRGB2YUV420p,
     declare.time(100);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn);
+    int runs = (sz.width <= 640) ? 10 : 1;
+    TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn);
 
     SANITY_CHECK(dst, 1);
 }
diff --git a/modules/imgproc/perf/perf_morph.cpp b/modules/imgproc/perf/perf_morph.cpp
index 9aadeaff52..d3dbba38fb 100644
--- a/modules/imgproc/perf/perf_morph.cpp
+++ b/modules/imgproc/perf/perf_morph.cpp
@@ -19,7 +19,8 @@ PERF_TEST_P(Size_MatType, erode, TYPICAL_MATS_MORPH)
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    TEST_CYCLE() erode(src, dst, noArray());
+    int runs = (sz.width <= 320) ? 15 : 1;
+    TEST_CYCLE_MULTIRUN(runs) erode(src, dst, noArray());
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/imgproc/perf/perf_remap.cpp b/modules/imgproc/perf/perf_remap.cpp
index 334c5ff960..92c6007a2b 100644
--- a/modules/imgproc/perf/perf_remap.cpp
+++ b/modules/imgproc/perf/perf_remap.cpp
@@ -63,7 +63,8 @@ PERF_TEST_P( TestRemap, Remap,
 
     declare.in(src, WARMUP_RNG).out(dst).time(20);
 
-    TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
+    int runs = (sz.width <= 640) ? 3 : 1;
+    TEST_CYCLE_MULTIRUN(runs) remap(src, dst, map1, map2, inter_type);
 
     SANITY_CHECK(dst);
 }
diff --git a/modules/imgproc/perf/perf_threshold.cpp b/modules/imgproc/perf/perf_threshold.cpp
index 61255e2283..01fff2e8cc 100644
--- a/modules/imgproc/perf/perf_threshold.cpp
+++ b/modules/imgproc/perf/perf_threshold.cpp
@@ -32,7 +32,7 @@ PERF_TEST_P(Size_MatType_ThreshType, threshold,
 
     declare.in(src, WARMUP_RNG).out(dst);
 
-    int runs = (sz.width <= 640) ? 8 : 1;
+    int runs = (sz.width <= 640) ? 40 : 1;
     TEST_CYCLE_MULTIRUN(runs) threshold(src, dst, thresh, maxval, threshType);
 
     SANITY_CHECK(dst);
diff --git a/modules/imgproc/src/rotcalipers.cpp b/modules/imgproc/src/rotcalipers.cpp
index cc43732c26..98ae6df034 100644
--- a/modules/imgproc/src/rotcalipers.cpp
+++ b/modules/imgproc/src/rotcalipers.cpp
@@ -398,3 +398,10 @@ cvMinAreaRect2( const CvArr* array, CvMemStorage* /*storage*/ )
     return (CvBox2D)rr;
 }
 
+void cv::boxPoints(cv::RotatedRect box, OutputArray _pts)
+{
+    // 4 rows x 2 float columns; the buffer is reinterpreted below as an array of Point2f
+    _pts.create(4, 2, CV_32F);
+    box.points(_pts.getMat().ptr<Point2f>());
+}
+
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 30aa9efe6a..ad10aafc52 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -269,6 +269,10 @@ else(ANDROID)
 endif(ANDROID)
 
 # step 5: build native part
+
+# work around the lack of `__attribute__ ((visibility("default")))` in jni_md.h/JNIEXPORT
+string(REPLACE "-fvisibility=hidden" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
 add_library(${the_module} SHARED ${handwrittren_h_sources} ${handwrittren_cpp_sources} ${generated_cpp_sources}
                                  ${copied_files}
                                 "${JAR_FILE}" "${JAR_FILE}.dephelper")
diff --git a/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java b/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
index e6520a43d7..69b8d0fcbd 100644
--- a/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
+++ b/modules/java/android_test/src/org/opencv/test/calib3d/Calib3dTest.java
@@ -585,4 +585,18 @@ public class Calib3dTest extends OpenCVTestCase {
     public void testValidateDisparityMatMatIntIntInt() {
         fail("Not yet implemented");
     }
+
+    public void testComputeCorrespondEpilines()
+    {
+        MatOfPoint2f leftPoints = new MatOfPoint2f();
+        leftPoints.alloc(1);
+        leftPoints.put(0, 0, 2, 3); // the single input point: x = 2, y = 3
+        Mat fundamentalMat = new Mat(3, 3, CvType.CV_64F);
+        fundamentalMat.put(0, 0, 0, -0.577, 0.288, 0.577, 0, 0.288, -0.288, -0.288, 0);
+        Mat expected = new Mat(1, 1, CvType.CV_32FC3);
+        expected.put(0, 0, -0.70735186, 0.70686162, -0.70588124);
+        Mat epilines = new Mat();
+        Calib3d.computeCorrespondEpilines(leftPoints, 1, fundamentalMat, epilines);
+        assertMatEqual(expected, epilines, EPS);
+    }
 }
diff --git a/modules/java/generator/src/java/android+CameraBridgeViewBase.java b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
index b15ae2bd8f..c0c9f5bde7 100644
--- a/modules/java/generator/src/java/android+CameraBridgeViewBase.java
+++ b/modules/java/generator/src/java/android+CameraBridgeViewBase.java
@@ -80,10 +80,10 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
         mMaxHeight = MAX_UNSPECIFIED;
         styledAttrs.recycle();
     }
-    
+
     /**
      * Sets the camera index
-     * @param camera index
+     * @param cameraIndex new camera index
      */
     public void setCameraIndex(int cameraIndex) {
         this.mCameraIndex = cameraIndex;
diff --git a/modules/java/generator/src/java/core+MatOfByte.java b/modules/java/generator/src/java/core+MatOfByte.java
index 0ebdb66733..b3fe5691ee 100644
--- a/modules/java/generator/src/java/core+MatOfByte.java
+++ b/modules/java/generator/src/java/core+MatOfByte.java
@@ -14,7 +14,7 @@ public class MatOfByte extends Mat {
 
     protected MatOfByte(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfByte extends Mat {
 
     public MatOfByte(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfDouble.java b/modules/java/generator/src/java/core+MatOfDouble.java
index cca5251105..4eb7cbc280 100644
--- a/modules/java/generator/src/java/core+MatOfDouble.java
+++ b/modules/java/generator/src/java/core+MatOfDouble.java
@@ -14,7 +14,7 @@ public class MatOfDouble extends Mat {
 
     protected MatOfDouble(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfDouble extends Mat {
 
     public MatOfDouble(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat.java b/modules/java/generator/src/java/core+MatOfFloat.java
index ce73b6f638..96bbeab9fb 100644
--- a/modules/java/generator/src/java/core+MatOfFloat.java
+++ b/modules/java/generator/src/java/core+MatOfFloat.java
@@ -14,7 +14,7 @@ public class MatOfFloat extends Mat {
 
     protected MatOfFloat(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat extends Mat {
 
     public MatOfFloat(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat4.java b/modules/java/generator/src/java/core+MatOfFloat4.java
index 8a3e51014f..aaa97b7990 100644
--- a/modules/java/generator/src/java/core+MatOfFloat4.java
+++ b/modules/java/generator/src/java/core+MatOfFloat4.java
@@ -14,7 +14,7 @@ public class MatOfFloat4 extends Mat {
 
     protected MatOfFloat4(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat4 extends Mat {
 
     public MatOfFloat4(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfFloat6.java b/modules/java/generator/src/java/core+MatOfFloat6.java
index 1e23101a72..68e6249b6d 100644
--- a/modules/java/generator/src/java/core+MatOfFloat6.java
+++ b/modules/java/generator/src/java/core+MatOfFloat6.java
@@ -14,7 +14,7 @@ public class MatOfFloat6 extends Mat {
 
     protected MatOfFloat6(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfFloat6 extends Mat {
 
     public MatOfFloat6(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfInt.java b/modules/java/generator/src/java/core+MatOfInt.java
index 80c5b3a5c2..33e5124e4f 100644
--- a/modules/java/generator/src/java/core+MatOfInt.java
+++ b/modules/java/generator/src/java/core+MatOfInt.java
@@ -15,7 +15,7 @@ public class MatOfInt extends Mat {
 
     protected MatOfInt(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfInt extends Mat {
 
     public MatOfInt(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfInt4.java b/modules/java/generator/src/java/core+MatOfInt4.java
index 60277103cc..c924233a6c 100644
--- a/modules/java/generator/src/java/core+MatOfInt4.java
+++ b/modules/java/generator/src/java/core+MatOfInt4.java
@@ -15,7 +15,7 @@ public class MatOfInt4 extends Mat {
 
     protected MatOfInt4(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfInt4 extends Mat {
 
     public MatOfInt4(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfKeyPoint.java b/modules/java/generator/src/java/core+MatOfKeyPoint.java
index a30805e18d..48ad3ca65c 100644
--- a/modules/java/generator/src/java/core+MatOfKeyPoint.java
+++ b/modules/java/generator/src/java/core+MatOfKeyPoint.java
@@ -16,7 +16,7 @@ public class MatOfKeyPoint extends Mat {
 
     protected MatOfKeyPoint(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -27,7 +27,7 @@ public class MatOfKeyPoint extends Mat {
 
     public MatOfKeyPoint(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint.java b/modules/java/generator/src/java/core+MatOfPoint.java
index 23eeed0ebb..6d23ed1162 100644
--- a/modules/java/generator/src/java/core+MatOfPoint.java
+++ b/modules/java/generator/src/java/core+MatOfPoint.java
@@ -14,7 +14,7 @@ public class MatOfPoint extends Mat {
 
     protected MatOfPoint(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint extends Mat {
 
     public MatOfPoint(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint2f.java b/modules/java/generator/src/java/core+MatOfPoint2f.java
index ba4be4ac5e..0c6960730b 100644
--- a/modules/java/generator/src/java/core+MatOfPoint2f.java
+++ b/modules/java/generator/src/java/core+MatOfPoint2f.java
@@ -14,7 +14,7 @@ public class MatOfPoint2f extends Mat {
 
     protected MatOfPoint2f(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint2f extends Mat {
 
     public MatOfPoint2f(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint3.java b/modules/java/generator/src/java/core+MatOfPoint3.java
index 16e21301ef..0c8374f250 100644
--- a/modules/java/generator/src/java/core+MatOfPoint3.java
+++ b/modules/java/generator/src/java/core+MatOfPoint3.java
@@ -14,7 +14,7 @@ public class MatOfPoint3 extends Mat {
 
     protected MatOfPoint3(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint3 extends Mat {
 
     public MatOfPoint3(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfPoint3f.java b/modules/java/generator/src/java/core+MatOfPoint3f.java
index 97e2a95702..b0d50d4500 100644
--- a/modules/java/generator/src/java/core+MatOfPoint3f.java
+++ b/modules/java/generator/src/java/core+MatOfPoint3f.java
@@ -14,7 +14,7 @@ public class MatOfPoint3f extends Mat {
 
     protected MatOfPoint3f(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -25,7 +25,7 @@ public class MatOfPoint3f extends Mat {
 
     public MatOfPoint3f(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/java/generator/src/java/core+MatOfRect.java b/modules/java/generator/src/java/core+MatOfRect.java
index 2e58bfe897..3844d9dfbf 100644
--- a/modules/java/generator/src/java/core+MatOfRect.java
+++ b/modules/java/generator/src/java/core+MatOfRect.java
@@ -15,7 +15,7 @@ public class MatOfRect extends Mat {
 
     protected MatOfRect(long addr) {
         super(addr);
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
@@ -26,7 +26,7 @@ public class MatOfRect extends Mat {
 
     public MatOfRect(Mat m) {
         super(m, Range.all());
-        if(checkVector(_channels, _depth) < 0 )
+        if( !empty() && checkVector(_channels, _depth) < 0 )
             throw new IllegalArgumentException("Incomatible Mat");
         //FIXME: do we need release() here?
     }
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index bf85425b9c..7323ab57a7 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -251,7 +251,7 @@ void CvANN_MLP::create( const CvMat* _layer_sizes, int _activ_func,
     buf_sz += (l_dst[0] + l_dst[l_count-1]*2)*2;
 
     CV_CALL( wbuf = cvCreateMat( 1, buf_sz, CV_64F ));
-    CV_CALL( weights = (double**)cvAlloc( (l_count+1)*sizeof(weights[0]) ));
+    CV_CALL( weights = (double**)cvAlloc( (l_count+2)*sizeof(weights[0]) ));
 
     weights[0] = wbuf->data.db;
     weights[1] = weights[0] + l_dst[0]*2;
diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp
index ba65690426..68216f58bb 100644
--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@@ -774,9 +774,6 @@ void SIFT::operator()(InputArray _image, InputArray _mask,
         findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
         KeyPointsFilter::removeDuplicated( keypoints );
 
-        if( !mask.empty() )
-            KeyPointsFilter::runByPixelsMask( keypoints, mask );
-
         if( nfeatures > 0 )
             KeyPointsFilter::retainBest(keypoints, nfeatures);
         //t = (double)getTickCount() - t;
@@ -791,6 +788,9 @@ void SIFT::operator()(InputArray _image, InputArray _mask,
                 kpt.pt *= scale;
                 kpt.size *= scale;
             }
+
+        if( !mask.empty() )
+            KeyPointsFilter::runByPixelsMask( keypoints, mask );
     }
     else
     {
diff --git a/modules/nonfree/src/surf_gpu.cpp b/modules/nonfree/src/surf_gpu.cpp
index ace9bb53ab..35805470b2 100644
--- a/modules/nonfree/src/surf_gpu.cpp
+++ b/modules/nonfree/src/surf_gpu.cpp
@@ -142,13 +142,13 @@ namespace
 
             bindImgTex(img);
 
-            gpu::integralBuffered(img, surf_.sum, surf_.intBuffer);
+            gpu::integral(img, surf_.sum, surf_.intBuffer);
             sumOffset = bindSumTex(surf_.sum);
 
             if (use_mask)
             {
-                min(mask, 1.0, surf_.mask1);
-                gpu::integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
+                gpu::min(mask, 1.0, surf_.mask1);
+                gpu::integral(surf_.mask1, surf_.maskSum, surf_.intBuffer);
                 maskOffset = bindMaskSumTex(surf_.maskSum);
             }
         }
diff --git a/modules/nonfree/test/test_features2d.cpp b/modules/nonfree/test/test_features2d.cpp
index 09997abe51..bff8a387f4 100644
--- a/modules/nonfree/test/test_features2d.cpp
+++ b/modules/nonfree/test/test_features2d.cpp
@@ -1149,3 +1149,76 @@ protected:
 
 TEST(Features2d_SIFTHomographyTest, regression) { CV_DetectPlanarTest test("SIFT", 80); test.safe_run(); }
 TEST(Features2d_SURFHomographyTest, regression) { CV_DetectPlanarTest test("SURF", 80); test.safe_run(); }
+
+// Checks that a detector given a mask reports no keypoint outside the unmasked area.
+class FeatureDetectorUsingMaskTest : public cvtest::BaseTest
+{
+public:
+    FeatureDetectorUsingMaskTest(const Ptr<FeatureDetector>& featureDetector) :
+        featureDetector_(featureDetector)
+    {
+        CV_Assert(!featureDetector_.empty());
+    }
+
+protected:
+
+    void run(int)
+    {
+        const int nStepX = 2;
+        const int nStepY = 2;
+
+        const string imageFilename = string(ts->get_data_path()) + "/features2d/tsukuba.png";
+
+        Mat image = imread(imageFilename);
+        if(image.empty())
+        {
+            ts->printf(cvtest::TS::LOG, "Image %s can not be read.\n", imageFilename.c_str());
+            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            return;
+        }
+
+        Mat mask(image.size(), CV_8U);
+
+        const int stepX = image.size().width / nStepX;
+        const int stepY = image.size().height / nStepY;
+
+        vector<KeyPoint> keyPoints;
+        vector<Point2f> points;
+        for(int i=0; i<nStepX; ++i)
+            for(int j=0; j<nStepY; ++j)
+            {
+                // enable exactly one cell of the nStepX x nStepY grid
+                mask.setTo(0);
+                Rect whiteArea(i * stepX, j * stepY, stepX, stepY);
+                mask(whiteArea).setTo(255);
+
+                featureDetector_->detect(image, keyPoints, mask);
+                KeyPoint::convert(keyPoints, points);
+
+                // every detected point must lie inside the enabled cell
+                for(size_t k=0; k<points.size(); ++k)
+                    if ( !whiteArea.contains(points[k]) )
+                    {
+                        ts->printf(cvtest::TS::LOG, "The feature point is outside of the mask.\n");
+                        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+                        return;
+                    }
+            }
+
+        ts->set_failed_test_info( cvtest::TS::OK );
+    }
+
+    Ptr<FeatureDetector> featureDetector_;
+};
+
+TEST(Features2d_SIFT_using_mask, regression)
+{
+    FeatureDetectorUsingMaskTest test(Algorithm::create<FeatureDetector>("Feature2D.SIFT"));
+    test.safe_run(); // runs the mask check with the SIFT detector
+}
+
+TEST(DISABLED_Features2d_SURF_using_mask, regression) // NOTE(review): reason for DISABLED_ is not stated here - presumably SURF does not pass yet; confirm
+{
+    FeatureDetectorUsingMaskTest test(Algorithm::create<FeatureDetector>("Feature2D.SURF"));
+    test.safe_run(); // runs the mask check with the SURF detector
+}
diff --git a/modules/objdetect/doc/cascade_classification.rst b/modules/objdetect/doc/cascade_classification.rst
index 961cf0aa57..46272d2f07 100644
--- a/modules/objdetect/doc/cascade_classification.rst
+++ b/modules/objdetect/doc/cascade_classification.rst
@@ -189,6 +189,7 @@ CascadeClassifier::detectMultiScale
 Detects objects of different sizes in the input image. The detected objects are returned as a list of rectangles.
 
 .. ocv:function:: void CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& objects, double scaleFactor=1.1, int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size())
+.. ocv:function:: void CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& objects, vector<int>& numDetections, double scaleFactor=1.1, int minNeighbors=3, int flags=0, Size minSize=Size(), Size maxSize=Size())
 
 .. ocv:pyfunction:: cv2.CascadeClassifier.detectMultiScale(image[, scaleFactor[, minNeighbors[, flags[, minSize[, maxSize]]]]]) -> objects
 .. ocv:pyfunction:: cv2.CascadeClassifier.detectMultiScale(image[, scaleFactor[, minNeighbors[, flags[, minSize[, maxSize[, outputRejectLevels]]]]]]) -> objects, rejectLevels, levelWeights
@@ -201,6 +202,8 @@ Detects objects of different sizes in the input image. The detected objects are
 
     :param objects: Vector of rectangles where each rectangle contains the detected object, the rectangles may be partially outside the original image.
 
+    :param numDetections: Vector of detection numbers for the corresponding objects. An object's number of detections is the number of neighboring positively classified rectangles that were joined together to form the object.
+
     :param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
 
     :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index 3ccb057e33..d263b2eb74 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -149,6 +149,14 @@ public:
                                    Size minSize = Size(),
                                    Size maxSize = Size() );
 
+    CV_WRAP virtual void detectMultiScale( const Mat& image,
+                                   CV_OUT std::vector<Rect>& objects,
+                                   CV_OUT std::vector<int>& numDetections,
+                                   double scaleFactor=1.1,
+                                   int minNeighbors=3, int flags=0,
+                                   Size minSize=Size(),
+                                   Size maxSize=Size() ); // numDetections[i]: number of neighboring rectangles joined to form objects[i]
+
     CV_WRAP virtual void detectMultiScale( const Mat& image,
                                    CV_OUT std::vector<Rect>& objects,
                                    CV_OUT std::vector<int>& rejectLevels,
@@ -168,7 +176,12 @@ public:
 protected:
     virtual bool detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
                                     int stripSize, int yStep, double factor, std::vector<Rect>& candidates,
-                                    std::vector<int>& rejectLevels, std::vector<double>& levelWeights, bool outputRejectLevels = false);
+                                    std::vector<int>& rejectLevels, std::vector<double>& levelWeights, bool outputRejectLevels = false );
+
+    virtual void detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,
+                                             std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
+                                             double scaleFactor, Size minObjectSize, Size maxObjectSize,
+                                             bool outputRejectLevels = false ); // fills candidates with raw detections; callers group them afterwards
 
 protected:
     enum { BOOST = 0
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 6d1b287d8a..13422b97e3 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1022,6 +1022,7 @@ public:
 };
 
 struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } };
+struct getNeighbors { int operator ()(const CvAvgComp& e) const { return e.neighbors; } }; // functor: neighbor count of a detection
 
 
 bool CascadeClassifier::detectSingleScale( const Mat& image, int stripCount, Size processingRectSize,
@@ -1086,39 +1087,33 @@ bool CascadeClassifier::setImage(const Mat& image)
     return featureEvaluator->setImage(image, data.origWinSize);
 }
 
-void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
-                                          std::vector<int>& rejectLevels,
-                                          std::vector<double>& levelWeights,
-                                          double scaleFactor, int minNeighbors,
-                                          int flags, Size minObjectSize, Size maxObjectSize,
-                                          bool outputRejectLevels )
+static void detectMultiScaleOldFormat( const Mat& image, Ptr<CvHaarClassifierCascade> oldCascade,
+                                       std::vector<Rect>& objects,
+                                       std::vector<int>& rejectLevels,
+                                       std::vector<double>& levelWeights,
+                                       std::vector<CvAvgComp>& vecAvgComp, // out: raw results (rect + neighbor count)
+                                       double scaleFactor, int minNeighbors,
+                                       int flags, Size minObjectSize, Size maxObjectSize,
+                                       bool outputRejectLevels = false )
 {
-    const double GROUP_EPS = 0.2;
+    MemStorage storage(cvCreateMemStorage(0));
+    CvMat _image = image;
+    CvSeq* _objects = cvHaarDetectObjectsForROC( &_image, oldCascade, storage, rejectLevels, levelWeights, scaleFactor,
+                                                 minNeighbors, flags, minObjectSize, maxObjectSize, outputRejectLevels );
+    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
+    objects.resize(vecAvgComp.size()); // objects receives only the rectangles
+    std::transform(vecAvgComp.begin(), vecAvgComp.end(), objects.begin(), getRect());
+}
 
-    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
+void CascadeClassifier::detectMultiScaleNoGrouping( const Mat& image, std::vector<Rect>& candidates,
+                                                    std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
+                                                    double scaleFactor, Size minObjectSize, Size maxObjectSize,
+                                                    bool outputRejectLevels )
+{
+    candidates.clear();
 
-    if( empty() )
-        return;
-
-    if( isOldFormatCascade() )
-    {
-        MemStorage storage(cvCreateMemStorage(0));
-        CvMat _image = image;
-        CvSeq* _objects = cvHaarDetectObjectsForROC( &_image, oldCascade, storage, rejectLevels, levelWeights, scaleFactor,
-                                              minNeighbors, flags, minObjectSize, maxObjectSize, outputRejectLevels );
-        std::vector<CvAvgComp> vecAvgComp;
-        Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-        objects.resize(vecAvgComp.size());
-        std::transform(vecAvgComp.begin(), vecAvgComp.end(), objects.begin(), getRect());
-        return;
-    }
-
-    objects.clear();
-
-    if (!maskGenerator.empty()) {
+    if (!maskGenerator.empty())
         maskGenerator->initializeMask(image);
-    }
-
 
     if( maxObjectSize.height == 0 || maxObjectSize.width == 0 )
         maxObjectSize = image.size();
@@ -1132,7 +1127,6 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
     }
 
     Mat imageBuffer(image.rows + 1, image.cols + 1, CV_8U);
-    std::vector<Rect> candidates;
 
     for( double factor = 1; ; factor *= scaleFactor )
     {
@@ -1173,18 +1167,39 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
             rejectLevels, levelWeights, outputRejectLevels ) )
             break;
     }
+}
 
+void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
+                                          std::vector<int>& rejectLevels,
+                                          std::vector<double>& levelWeights,
+                                          double scaleFactor, int minNeighbors,
+                                          int flags, Size minObjectSize, Size maxObjectSize,
+                                          bool outputRejectLevels )
+{
+    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
 
-    objects.resize(candidates.size());
-    std::copy(candidates.begin(), candidates.end(), objects.begin());
+    if( empty() )
+        return;
 
-    if( outputRejectLevels )
+    if( isOldFormatCascade() )
     {
-        groupRectangles( objects, rejectLevels, levelWeights, minNeighbors, GROUP_EPS );
+        std::vector<CvAvgComp> fakeVecAvgComp;
+        detectMultiScaleOldFormat( image, oldCascade, objects, rejectLevels, levelWeights, fakeVecAvgComp, scaleFactor,
+                                   minNeighbors, flags, minObjectSize, maxObjectSize, outputRejectLevels );
     }
     else
     {
-        groupRectangles( objects, minNeighbors, GROUP_EPS );
+        detectMultiScaleNoGrouping( image, objects, rejectLevels, levelWeights, scaleFactor, minObjectSize, maxObjectSize,
+                                    outputRejectLevels );
+        const double GROUP_EPS = 0.2;
+        if( outputRejectLevels )
+        {
+            groupRectangles( objects, rejectLevels, levelWeights, minNeighbors, GROUP_EPS );
+        }
+        else
+        {
+            groupRectangles( objects, minNeighbors, GROUP_EPS );
+        }
     }
 }
 
@@ -1195,7 +1210,35 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
     std::vector<int> fakeLevels;
     std::vector<double> fakeWeights;
     detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor,
-        minNeighbors, flags, minObjectSize, maxObjectSize, false );
+        minNeighbors, flags, minObjectSize, maxObjectSize );
+}
+
+void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
+                                          std::vector<int>& numDetections, double scaleFactor,
+                                          int minNeighbors, int flags, Size minObjectSize,
+                                          Size maxObjectSize )
+{
+    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
+
+    if( empty() )
+        return;
+
+    std::vector<int> fakeLevels;
+    std::vector<double> fakeWeights;
+    if( isOldFormatCascade() )
+    {
+        std::vector<CvAvgComp> vecAvgComp;
+        detectMultiScaleOldFormat( image, oldCascade, objects, fakeLevels, fakeWeights, vecAvgComp, scaleFactor,
+                                   minNeighbors, flags, minObjectSize, maxObjectSize );
+        numDetections.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), numDetections.begin(), getNeighbors());
+    }
+    else
+    {
+        detectMultiScaleNoGrouping( image, objects, fakeLevels, fakeWeights, scaleFactor, minObjectSize, maxObjectSize );
+        const double GROUP_EPS = 0.2;
+        groupRectangles( objects, numDetections, minNeighbors, GROUP_EPS );
+    }
 }
 
 bool CascadeClassifier::Data::read(const FileNode &root)
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
index 05a969d4d2..a59aae1a5e 100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -117,9 +117,6 @@ namespace cv
         //the devnum is the index of the selected device in DeviceName vector of INfo
         CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
 
-        //optional function, if you want save opencl binary kernel to the file, set its path
-        CV_EXPORTS  void setBinpath(const char *path);
-
         //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
         //returns cl_context *
         CV_EXPORTS void* getoclContext();
@@ -133,6 +130,9 @@ namespace cv
         //getDevice also need to be called before this function
         CV_EXPORTS void setDeviceEx(Info &oclinfo, void *ctx, void *qu, int devnum = 0);
 
+        //returns true when global OpenCL context is initialized
+        CV_EXPORTS bool initialized();
+
         //////////////////////////////// OpenCL context ////////////////////////
         //This is a global singleton class used to represent a OpenCL context.
         class CV_EXPORTS Context
@@ -140,7 +140,7 @@ namespace cv
         protected:
             Context();
             friend class std::auto_ptr<Context>;
-
+            friend bool initialized();
         private:
             static std::auto_ptr<Context> clCxt;
             static int val;
@@ -178,6 +178,29 @@ namespace cv
                                                         bool finish = true, bool measureKernelTime = false,
                                                         bool cleanUp = true);
 
+        //! Enable or disable caching of OpenCL program binaries on the local disk.
+        // After a program (*.cl files in the opencl/ folder) is built at runtime, the
+        // compiled OpenCL program can be cached automatically as a "path/*.clb"
+        // binary file, which is reused the next time the OpenCV executable is started.
+        //
+        // The caching mode is controlled by the following enum values.
+        // Notes:
+        //   1. The feature is enabled by default when OpenCV is built in release mode.
+        //   2. The CACHE_DEBUG / CACHE_RELEASE flags are only effective with the MSVC compiler;
+        //      with GNU compilers, the function always treats the build as release mode (enabled by default).
+        enum
+        {
+            CACHE_NONE    = 0,        // do not cache OpenCL binary
+            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode (only work with MSVC)
+            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode (only work with MSVC)
+            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // always cache opencl binary
+            CACHE_UPDATE  = 0x1 << 2  // if the binary cache file with the same name is already on the disk, it will be updated.
+        };
+        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");
+
+        //! Set the path where the binary cache is saved
+        CV_EXPORTS void setBinpath(const char *path);
+
         class CV_EXPORTS oclMatExpr;
         //////////////////////////////// oclMat ////////////////////////////////
         class CV_EXPORTS oclMat
@@ -222,6 +245,11 @@ namespace cv
             operator Mat() const;
             void download(cv::Mat &m) const;
 
+            //! convert to _InputArray
+            operator _InputArray();
+
+            //! convert to _OutputArray
+            operator _OutputArray();
 
             //! returns a new oclMatrix header for the specified row
             oclMat row(int y) const;
@@ -363,6 +391,9 @@ namespace cv
             int wholecols;
         };
 
+        // convert InputArray/OutputArray to oclMat references
+        CV_EXPORTS oclMat& getOclMatRef(InputArray src);
+        CV_EXPORTS oclMat& getOclMatRef(OutputArray src);
 
         ///////////////////// mat split and merge /////////////////////////////////
         //! Compose a multi-channel array from several single-channel arrays
@@ -482,6 +513,25 @@ namespace cv
         CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
         //! only 8UC1 and 256 bins is supported now
         CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
+        
+        //! only 8UC1 is supported now
+        class CV_EXPORTS CLAHE
+        {
+        public:
+            virtual void apply(const oclMat &src, oclMat &dst) = 0;
+
+            virtual void setClipLimit(double clipLimit) = 0;
+            virtual double getClipLimit() const = 0;
+
+            virtual void setTilesGridSize(Size tileGridSize) = 0;
+            virtual Size getTilesGridSize() const = 0;
+
+            virtual void collectGarbage() = 0;
+
+            virtual ~CLAHE() {}
+        };
+        CV_EXPORTS Ptr<cv::ocl::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+        
         //! bilateralFilter
         // supports 8UC1 8UC4
         CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpave, int borderType=BORDER_DEFAULT);
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 4c9980a4b6..93f7461246 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -113,7 +113,7 @@ namespace cv
                                   size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
         void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, String kernelName, size_t globalThreads[3],
                                   size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                  int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+                                  int depth, const char *build_options, FLUSH_MODE finish_mode = DISABLE);
         // bind oclMat to OpenCL image textures
         // note:
         //   1. there is no memory management. User need to explicitly release the resource
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index 9455cbbf17..8b0a406462 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -52,6 +52,8 @@ int main(int argc, const char *argv[])
         cerr << "no device found\n";
         return -1;
     }
+    // use CACHE_UPDATE so the binary cache is overwritten every time the test starts
+    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
 
     int devidx = 0;
 
diff --git a/modules/ocl/test/test_columnsum.cpp b/modules/ocl/perf/perf_calib3d.cpp
similarity index 64%
rename from modules/ocl/test/test_columnsum.cpp
rename to modules/ocl/perf/perf_calib3d.cpp
index 231f0657b0..428f00ea94 100644
--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/perf/perf_calib3d.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Chunpeng Zhang chunpeng@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -31,7 +31,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -45,50 +45,57 @@
 //M*/
 
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-PARAM_TEST_CASE(ColumnSum, cv::Size)
+///////////// StereoMatchBM ////////////////////////
+PERFTEST(StereoMatchBM)
 {
-    cv::Size size;
-    cv::Mat src;
+	Mat left_image = imread(abspath("aloeL.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat right_image = imread(abspath("aloeR.jpg"), cv::IMREAD_GRAYSCALE);
+	Mat disp,dst;
+	ocl::oclMat d_left, d_right,d_disp;
+	int n_disp= 128;
+	int winSize =19;
 
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-    }
-};
+	SUBTEST << left_image.cols << 'x' << left_image.rows << "; aloeL.jpg ;"<< right_image.cols << 'x' << right_image.rows << "; aloeR.jpg ";
 
-TEST_P(ColumnSum, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-    cv::ocl::oclMat d_src(src);
+	Ptr<StereoBM> bm = createStereoBM(n_disp, winSize);
+	bm->compute(left_image, right_image, dst);
 
-    cv::ocl::columnSum(d_src, d_dst);
+	CPU_ON;
+	bm->compute(left_image, right_image, dst);
+	CPU_OFF;
 
-    cv::Mat dst(d_dst);
+	d_left.upload(left_image);
+	d_right.upload(right_image);
 
-    for (int j = 0; j < src.cols; ++j)
-    {
-        float gold = src.at<float>(0, j);
-        float res = dst.at<float>(0, j);
-        ASSERT_NEAR(res, gold, 1e-5);
-    }
+	ocl::StereoBM_OCL d_bm(0, n_disp, winSize);
 
-    for (int i = 1; i < src.rows; ++i)
-    {
-        for (int j = 0; j < src.cols; ++j)
-        {
-            float gold = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            float res = dst.at<float>(i, j);
-            ASSERT_NEAR(res, gold, 1e-5);
-        }
-    }
+	WARMUP_ON;
+	d_bm(d_left, d_right, d_disp);
+	WARMUP_OFF;
+
+    cv::Mat ocl_mat;
+    d_disp.download(ocl_mat);
+    ocl_mat.convertTo(ocl_mat, dst.type());
+
+	GPU_ON;
+	d_bm(d_left, d_right, d_disp);
+	GPU_OFF;
+
+	GPU_FULL_ON;
+	d_left.upload(left_image);
+	d_right.upload(right_image);
+	d_bm(d_left, d_right, d_disp);
+	d_disp.download(disp);
+	GPU_FULL_OFF;
+    
+    TestSystem::instance().setAccurate(-1, 0.);
 }
 
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
 
 
-#endif
+
+
+
+
+
+	
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index a05301b34c..e988ce09d6 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -284,6 +284,7 @@ PERFTEST(GaussianBlur)
     Mat src, dst, ocl_dst;
     int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
     std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
+    const int ksize = 7;	
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
@@ -291,29 +292,28 @@ PERFTEST(GaussianBlur)
         {
             SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            gen(src, size, size, all_type[j], 5, 16);
+            gen(src, size, size, all_type[j], 0, 256);
 
-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);
 
             CPU_ON;
-            GaussianBlur(src, dst, Size(9, 9), 0);
+            GaussianBlur(src, dst, Size(ksize, ksize), 0);
             CPU_OFF;
 
             ocl::oclMat d_src(src);
-            ocl::oclMat d_dst(src.size(), src.type());
-            ocl::oclMat d_buf;
+            ocl::oclMat d_dst;
 
             WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
             WARMUP_OFF;
 
             GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
             GPU_OFF;
 
             GPU_FULL_ON;
             d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
             d_dst.download(ocl_dst);
             GPU_FULL_OFF;
 
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index 05093811fe..7daa61396c 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -46,11 +46,6 @@
 #include "precomp.hpp"
 
 ///////////// HOG////////////////////////
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
-{
-    return ((abs(r1.x - r2.x) < threshold) && (abs(r1.y - r2.y) < threshold) &&
-        (abs(r1.width - r2.width) < threshold) && (abs(r1.height - r2.height) < threshold));
-}
 
 PERFTEST(HOG)
 {
@@ -61,13 +56,12 @@ PERFTEST(HOG)
         throw runtime_error("can't open road.png");
     }
 
-
     cv::HOGDescriptor hog;
     hog.setSVMDetector(hog.getDefaultPeopleDetector());
     std::vector<cv::Rect> found_locations;
     std::vector<cv::Rect> d_found_locations;
 
-    SUBTEST << 768 << 'x' << 576 << "; road.png";
+    SUBTEST << src.cols << 'x' << src.rows << "; road.png";
 
     hog.detectMultiScale(src, found_locations);
 
@@ -84,70 +78,10 @@ PERFTEST(HOG)
     ocl_hog.detectMultiScale(d_src, d_found_locations);
     WARMUP_OFF;
     
-    // Ground-truth rectangular people window
-    cv::Rect win1_64x128(231, 190, 72, 144);
-    cv::Rect win2_64x128(621, 156, 97, 194);
-    cv::Rect win1_48x96(238, 198, 63, 126);
-    cv::Rect win2_48x96(619, 161, 92, 185);
-    cv::Rect win3_48x96(488, 136, 56, 112);
-
-    // Compare whether ground-truth windows are detected and compare the number of windows detected.
-    std::vector<int> d_comp(4);
-    std::vector<int> comp(4);
-    for(int i = 0; i < (int)d_comp.size(); i++)
-    {
-        d_comp[i] = 0;
-        comp[i] = 0;
-    }
-
-    int threshold = 10;
-    int val = 32;
-    d_comp[0] = (int)d_found_locations.size();
-    comp[0] = (int)found_locations.size();
-
-    cv::Size winSize = hog.winSize;
-
-    if (winSize == cv::Size(48, 96))
-    {
-        for(int i = 0; i < (int)d_found_locations.size(); i++)
-        {
-            if (match_rect(d_found_locations[i], win1_48x96, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found_locations[i], win2_48x96, threshold))
-                d_comp[2] = val;
-            if (match_rect(d_found_locations[i], win3_48x96, threshold))
-                d_comp[3] = val;
-        }
-        for(int i = 0; i < (int)found_locations.size(); i++)
-        {
-            if (match_rect(found_locations[i], win1_48x96, threshold))
-                comp[1] = val;
-            if (match_rect(found_locations[i], win2_48x96, threshold))
-                comp[2] = val;
-            if (match_rect(found_locations[i], win3_48x96, threshold))
-                comp[3] = val;
-        }
-    }
-    else if (winSize == cv::Size(64, 128))
-    {
-        for(int i = 0; i < (int)d_found_locations.size(); i++)
-        {
-            if (match_rect(d_found_locations[i], win1_64x128, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found_locations[i], win2_64x128, threshold))
-                d_comp[2] = val;
-        }
-        for(int i = 0; i < (int)found_locations.size(); i++)
-        {
-            if (match_rect(found_locations[i], win1_64x128, threshold))
-                comp[1] = val;
-            if (match_rect(found_locations[i], win2_64x128, threshold))
-                comp[2] = val;
-        }
-    }
-
-    cv::Mat gpu_rst(d_comp), cpu_rst(comp);
-    TestSystem::instance().ExpectedMatNear(gpu_rst, cpu_rst, 3);
+    if(d_found_locations.size() == found_locations.size())
+        TestSystem::instance().setAccurate(1, 0);
+    else
+        TestSystem::instance().setAccurate(0, abs((int)found_locations.size() - (int)d_found_locations.size()));
 
     GPU_ON;
     ocl_hog.detectMultiScale(d_src, found_locations);
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index e7c05cc0b7..77960d0abc 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -743,12 +743,12 @@ PERFTEST(meanShiftFiltering)
         WARMUP_OFF;
 
         GPU_ON;
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
         GPU_OFF;
 
         GPU_FULL_ON;
         d_src.upload(src);
-        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
         d_dst.download(ocl_dst);
         GPU_FULL_OFF;
 
@@ -922,3 +922,92 @@ PERFTEST(remap)
 
     }
 }
+///////////// CLAHE ////////////////////////
+PERFTEST(CLAHE)
+{
+    Mat src, dst, ocl_dst;
+    cv::ocl::oclMat d_src, d_dst;
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
+
+    double clipLimit = 40.0;
+
+    cv::Ptr<cv::CLAHE>      clahe   = cv::createCLAHE(clipLimit);
+    cv::Ptr<cv::ocl::CLAHE> d_clahe = cv::ocl::createCLAHE(clipLimit);
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            CPU_ON;
+            clahe->apply(src, dst);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_clahe->apply(d_src, d_dst);
+            WARMUP_OFF;
+
+            ocl_dst = d_dst;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
+
+            GPU_ON;
+            d_clahe->apply(d_src, d_dst);
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_clahe->apply(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+    }
+}
+
+///////////// columnSum////////////////////////
+PERFTEST(columnSum)
+{
+    Mat src, dst, ocl_dst;
+    ocl::oclMat d_src, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
+
+        gen(src, size, size, CV_32FC1, 0, 256);
+
+        CPU_ON;
+        dst.create(src.size(), src.type());
+        for (int j = 0; j < src.cols; j++)
+            dst.at<float>(0, j) = src.at<float>(0, j);
+
+        for (int i = 1; i < src.rows; ++i)
+            for (int j = 0; j < src.cols; ++j)
+                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(ocl_dst);
+        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
+    }
+}
diff --git a/modules/ocl/perf/perf_columnsum.cpp b/modules/ocl/perf/perf_moments.cpp
similarity index 68%
rename from modules/ocl/perf/perf_columnsum.cpp
rename to modules/ocl/perf/perf_moments.cpp
index ff7ebcd1de..7fa3948dec 100644
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_moments.cpp
@@ -44,45 +44,49 @@
 //
 //M*/
 #include "precomp.hpp"
-
-///////////// columnSum////////////////////////
-PERFTEST(columnSum)
+///////////// Moments ////////////////////////
+PERFTEST(Moments)
 {
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
+    Mat src;
+    bool binaryImage = 0;
+
+    int all_type[] = {CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_16SC1", "CV_32FC1", "CV_64FC1"};
 
     for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        gen(src, size, size, CV_32FC1, 0, 256);
+            gen(src, size, size, all_type[j], 0, 256);
 
-        CPU_ON;
-        dst.create(src.size(), src.type());
-        for (int j = 0; j < src.cols; j++)
-            dst.at<float>(0, j) = src.at<float>(0, j);
+            cv::Moments CvMom = moments(src, binaryImage);
 
-        for (int i = 1; i < src.rows; ++i)
-            for (int j = 0; j < src.cols; ++j)
-                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
-        CPU_OFF;
+            CPU_ON;
+            moments(src, binaryImage);
+            CPU_OFF;
 
-        d_src.upload(src);
+            cv::Moments oclMom;
+            WARMUP_ON;
+            oclMom = ocl::ocl_moments(src, binaryImage);
+            WARMUP_OFF;
 
-        WARMUP_ON;
-        ocl::columnSum(d_src, d_dst);
-        WARMUP_OFF;
+            Mat gpu_dst, cpu_dst;
+            HuMoments(CvMom, cpu_dst);
+            HuMoments(oclMom, gpu_dst);
 
-        GPU_ON;
-        ocl::columnSum(d_src, d_dst);
-        GPU_OFF;
+            GPU_ON;
+            ocl::ocl_moments(src, binaryImage);
+            GPU_OFF;
 
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::columnSum(d_src, d_dst);
-        d_dst.download(ocl_dst);
-        GPU_FULL_OFF;
+            GPU_FULL_ON;
+            ocl::ocl_moments(src, binaryImage);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(gpu_dst, cpu_dst, .5);
+
+        }
 
-        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
     }
-}
\ No newline at end of file
+}
diff --git a/modules/ocl/perf/precomp.cpp b/modules/ocl/perf/precomp.cpp
index 65e2d51816..9601cda58d 100644
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -331,20 +331,6 @@ void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time,
     cout << setiosflags(ios_base::left);
     stringstream stream;
 
-#if 0
-    if(is_accurate == 1)
-            stream << "Pass";
-    else if(is_accurate_ == 0)
-            stream << "Fail";
-    else if(is_accurate == -1)
-        stream << " ";
-    else
-    {
-        std::cout<<"is_accurate errer: "<<is_accurate<<"\n";
-        exit(-1);
-    }
-#endif
-
     std::stringstream &cur_subtest_description = getCurSubtestDescription();
 
 #if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index b2ebb23ce7..ff8f09157b 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//     Wenju He, wenju@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -48,13 +48,107 @@
 using namespace cv;
 using namespace cv::ocl;
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
 #define CELLS_PER_BLOCK_Y 2
 #define NTHREADS 256
 
+static oclMat gauss_w_lut;
+static bool hog_device_cpu;
+/* precomputed gaussian and interp_weight lookup tables for the case where sigma is 4.0f */
+static const float gaussian_interp_lut[] =
+{
+    /* gaussian lut */
+    0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
+    0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
+    0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
+    0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
+    0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
+    0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
+    0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
+    0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
+    0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
+    0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
+    0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
+    0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
+    0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
+    0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
+    0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
+    0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
+    0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
+    0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
+    0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
+    0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
+    0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
+    0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
+    0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
+    0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
+    0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
+    0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
+    0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
+    0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
+    0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
+    0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
+    0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
+    0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
+    0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
+    0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
+    0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
+    0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
+    0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
+    0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
+    0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
+    /* interp_weight lut */
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
+    0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
+    0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
+    0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
+    0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
+    0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
+    0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
+    0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
+    0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
+    0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
+    0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
+    0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
+    0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
+    0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
+    0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
+    0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
+    0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
+    0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
+    0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
+    0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
+    0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
+    0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
+    0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
+    0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
+    0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
+    0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
+    0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
+    0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
+    0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
+    0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
+    0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
+    0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
+    0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
+    0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
+    0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
+};
+
 namespace cv
 {
     namespace ocl
@@ -78,38 +172,43 @@ namespace cv
                 int cnblocks_win_x;
                 int cnblocks_win_y;
                 int cblock_hist_size;
-                int cblock_hist_size_2up;
                 int cdescr_size;
                 int cdescr_width;
+                int cdescr_height;
 
                 void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                                       int nblocks_win_x, int nblocks_win_y);
 
                 void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, const cv::ocl::oclMat &grad,
-                                   const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists);
+                                   int height, int width, float sigma, const cv::ocl::oclMat &grad,
+                                   const cv::ocl::oclMat &qangle,
+                                   const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
 
                 void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                                     int height, int width, cv::ocl::oclMat &block_hists, float threshold);
+                                     int height, int width, cv::ocl::oclMat &block_hists,
+                                     float threshold);
 
                 void classify_hists(int win_height, int win_width, int block_stride_y,
-                                    int block_stride_x, int win_stride_y, int win_stride_x, int height,
-                                    int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
+                                    int block_stride_x, int win_stride_y, int win_stride_x,
+                                    int height, int width, const cv::ocl::oclMat &block_hists,
+                                    const cv::ocl::oclMat &coefs, float free_coef,
                                     float threshold, cv::ocl::oclMat &labels);
 
-                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y,
+                                            int block_stride_x, int win_stride_y, int win_stride_x,
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
-                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-                                            int win_stride_y, int win_stride_x, int height, int width, const cv::ocl::oclMat &block_hists,
+                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y,
+                                            int block_stride_x, int win_stride_y, int win_stride_x,
+                                            int height, int width, const cv::ocl::oclMat &block_hists,
                                             cv::ocl::oclMat &descriptors);
 
                 void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
+                                            float angle_scale, cv::ocl::oclMat &grad,
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
                 void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma);
-
-                void resize( const oclMat &src, oclMat &dst, const Size sz);
+                                            float angle_scale, cv::ocl::oclMat &grad,
+                                            cv::ocl::oclMat &qangle, bool correct_gamma);
             }
         }
     }
@@ -117,8 +216,14 @@ namespace cv
 
 using namespace ::cv::ocl::device;
 
-cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
-                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
+static inline int divUp(int total, int grain) // ceiling of total / grain for positive operands
+{
+    return (total + grain - 1) / grain; // adding grain - 1 makes the truncating division round up
+}
+
+cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
+                                      Size cell_size_, int nbins_, double win_sigma_,
+                                      double threshold_L2hys_, bool gamma_correction_, int nlevels_)
     : win_size(win_size_),
       block_size(block_size_),
       block_stride(block_stride_),
@@ -132,19 +237,27 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
     CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
               (win_size.height - block_size.height) % block_stride.height == 0);
 
-    CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
+    CV_Assert(block_size.width % cell_size.width == 0 &&
+        block_size.height % cell_size.height == 0);
 
     CV_Assert(block_stride == cell_size);
 
     CV_Assert(cell_size == Size(8, 8));
 
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    Size cells_per_block(block_size.width / cell_size.width,
+        block_size.height / cell_size.height);
     CV_Assert(cells_per_block == Size(2, 2));
 
     cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
+    hog::set_up_constants(nbins, block_stride.width, block_stride.height,
+        blocks_per_win.width, blocks_per_win.height);
 
     effect_size = Size(0, 0);
+
+    // Record whether queryDeviceInfo reports a CPU device; normalize_hists()
+    // and classify_hists() read this flag to decide whether their kernels are
+    // launched with the "-D CPU" build option.
+    hog_device_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
 }
 
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -154,7 +267,8 @@ size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
 
 size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
 {
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+    const Size cells_per_block(block_size.width / cell_size.width,    // cells per block along x
+                               block_size.height / cell_size.height); // cells per block along y
     return (size_t)(nbins * cells_per_block.area());
 }
 
@@ -167,7 +281,8 @@ bool cv::ocl::HOGDescriptor::checkDetectorSize() const
 {
     size_t detector_size = detector.rows * detector.cols;
     size_t descriptor_size = getDescriptorSize();
-    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
+    return detector_size == 0 || detector_size == descriptor_size ||
+        detector_size == descriptor_size + 1;
 }
 
 void cv::ocl::HOGDescriptor::setSVMDetector(const std::vector<float> &_detector)
@@ -207,10 +322,16 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
 
     const size_t block_hist_size = getBlockHistogramSize();
     const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-    block_hists.create(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F);
+    block_hists.create(1,
+        static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
 
     Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
     labels.create(1, wins_per_img.area(), CV_8U);
+
+    std::vector<float> v_lut = std::vector<float>(gaussian_interp_lut, gaussian_interp_lut +
+        sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
+    Mat m_lut(v_lut);
+    gauss_w_lut.upload(m_lut.reshape(1,1));
 }
 
 void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
@@ -221,10 +342,12 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
     switch (img.type())
     {
     case CV_8UC1:
-        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img,
+            angleScale, grad, qangle, gamma_correction);
         break;
     case CV_8UC4:
-        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img, angleScale, grad, qangle, gamma_correction);
+        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img,
+            angleScale, grad, qangle, gamma_correction);
         break;
     }
 }
@@ -232,19 +355,21 @@ void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oc
 
 void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
 {
-    computeGradient(img, grad, qangle);
+    computeGradient(img, this->grad, this->qangle);
 
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                       grad, qangle, (float)getWinSigma(), block_hists);
+    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
+        effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
 
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height, effect_size.width,
-                         block_hists, (float)threshold_L2hys);
+    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
+        effect_size.width, block_hists, (float)threshold_L2hys);
 }
 
 
-void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride, oclMat &descriptors, int descr_format)
+void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
+                                            oclMat &descriptors, int descr_format)
 {
-    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+    CV_Assert(win_stride.width % block_stride.width == 0 &&
+        win_stride.height % block_stride.height == 0);
 
     init_buffer(img, win_stride);
 
@@ -254,17 +379,20 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
     Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
     Size wins_per_img   = numPartsWithin(effect_size, win_size, win_stride);
 
-    descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+    descriptors.create(wins_per_img.area(),
+        static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
 
     switch (descr_format)
     {
     case DESCR_FORMAT_ROW_BY_ROW:
-        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_rows(win_size.height, win_size.width,
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width,
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     case DESCR_FORMAT_COL_BY_COL:
-        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                                    win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists, descriptors);
+        hog::extract_descrs_by_cols(win_size.height, win_size.width,
+            block_stride.height, block_stride.width, win_stride.height, win_stride.width,
+            effect_size.height, effect_size.width, block_hists, descriptors);
         break;
     default:
         CV_Error(Error::StsBadArg, "Unknown descriptor format");
@@ -272,7 +400,8 @@ void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
 }
 
 
-void cv::ocl::HOGDescriptor::detect(const oclMat &img, std::vector<Point> &hits, double hit_threshold, Size win_stride, Size padding)
+void cv::ocl::HOGDescriptor::detect(const oclMat &img, std::vector<Point> &hits,
+                                    double hit_threshold, Size win_stride, Size padding)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(padding == Size(0, 0));
@@ -284,14 +413,16 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, std::vector<Point> &hits,
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 &&
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
 
     computeBlockHistograms(img);
 
-    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
-                        win_stride.height, win_stride.width, effect_size.height, effect_size.width, block_hists,
-                        detector, (float)free_coef, (float)hit_threshold, labels);
+    hog::classify_hists(win_size.height, win_size.width, block_stride.height,
+        block_stride.width, win_stride.height, win_stride.width,
+        effect_size.height, effect_size.width, block_hists, detector,
+        (float)free_coef, (float)hit_threshold, labels);
 
     labels.download(labels_host);
     unsigned char *vec = labels_host.ptr();
@@ -307,8 +438,9 @@ void cv::ocl::HOGDescriptor::detect(const oclMat &img, std::vector<Point> &hits,
 
 
 
-void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations, double hit_threshold,
-        Size win_stride, Size padding, double scale0, int group_threshold)
+void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations,
+                                              double hit_threshold, Size win_stride, Size padding,
+                                              double scale0, int group_threshold)
 {
     CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
     CV_Assert(scale0 > 1);
@@ -334,7 +466,8 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, std::vector<Rec
     if (win_stride == Size())
         win_stride = block_stride;
     else
-        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
+        CV_Assert(win_stride.width % block_stride.width == 0 &&
+            win_stride.height % block_stride.height == 0);
     init_buffer(img, win_stride);
     image_scale.create(img.size(), img.type());
 
@@ -348,16 +481,17 @@ void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, std::vector<Rec
         }
         else
         {
-            hog::resize( img, image_scale, effect_size);
+            resize(img, image_scale, effect_size);
             detect(image_scale, locations, hit_threshold, win_stride, padding);
         }
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+        Size scaled_win_size(cvRound(win_size.width * scale),
+            cvRound(win_size.height * scale));
         for (size_t j = 0; j < locations.size(); j++)
             all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
     }
 
     found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
+    groupRectangles(found_locations, group_threshold, 0.2); // 0.2: magic number copied from CPU version
 }
 
 int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
@@ -365,9 +499,11 @@ int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
     return (size - part_size + stride) / stride;
 }
 
-cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size,
+                                                cv::Size stride)
 {
-    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
+    return Size(numPartsWithin(size.width, part_size.width, stride.width),
+        numPartsWithin(size.height, part_size.height, stride.height));
 }
 
 std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
@@ -1548,8 +1684,9 @@ static int power_2up(unsigned int n)
     return -1; // Input is too big
 }
 
-void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-        int nblocks_win_x, int nblocks_win_y)
+void cv::ocl::device::hog::set_up_constants(int nbins,
+                                            int block_stride_x, int block_stride_y,
+                                            int nblocks_win_x, int nblocks_win_y)
 {
     cnbins = nbins;
     cblock_stride_x = block_stride_x;
@@ -1560,29 +1697,31 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int block_stride_x, int b
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
     cblock_hist_size = block_hist_size;
 
-    int block_hist_size_2up = power_2up(block_hist_size);
-    cblock_hist_size_2up = block_hist_size_2up;
-
     int descr_width = nblocks_win_x * block_hist_size;
     cdescr_width = descr_width;
+    cdescr_height = nblocks_win_y;
 
     int descr_size = descr_width * nblocks_win_y;
     cdescr_size = descr_size;
 }
 
-void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, const cv::ocl::oclMat &grad,
-        const cv::ocl::oclMat &qangle, float sigma, cv::ocl::oclMat &block_hists)
+void cv::ocl::device::hog::compute_hists(int nbins,
+                                         int block_stride_x, int block_stride_y,
+                                         int height, int width, float sigma,
+                                         const cv::ocl::oclMat &grad,
+                                         const cv::ocl::oclMat &qangle,
+                                         const cv::ocl::oclMat &gauss_w_lut,
+                                         cv::ocl::oclMat &block_hists)
 {
     Context *clCxt = Context::getContext();
-    String kernelName = "compute_hists_kernel";
     std::vector< std::pair<size_t, const void *> > args;
+    String kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" : // LUT kernel only when sigma is exactly 4; it is passed gauss_w_lut
+        "compute_hists_kernel"; // any other sigma: generic kernel, passed the precomputed scale instead
 
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
-
-    size_t globalThreads[3] = { img_block_width * 32, img_block_height * 2, 1 };
-    size_t localThreads[3] = { 32, 2, 1 };
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
+        / block_stride_y;
 
     int grad_quadstep = grad.step >> 2;
     int qangle_step = qangle.step;
@@ -1590,6 +1729,11 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     // Precompute gaussian spatial window parameter
     float scale = 1.f / (2.f * sigma * sigma);
 
+    int blocks_in_group = 4; // number of image blocks assigned to one work-group
+    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; // presumably 24 x 2 work-items per image block - confirm against the kernel
+    size_t globalThreads[3] = { // NOTE(review): smem below is sized without blocks_in_group - confirm it is large enough for 4 blocks per group
+        divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 }; // work-group count rounded up over all image blocks
+
     int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
     int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
     int smem = hists_size + final_hists_size;
@@ -1604,19 +1748,26 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&qangle_step));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&grad.data));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
+    if (kernelName.compare("compute_hists_lut_kernel") == 0)
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
+    else
+        args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( std::make_pair( smem, (void *)NULL));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-        int height, int width, cv::ocl::oclMat &block_hists, float threshold)
+void cv::ocl::device::hog::normalize_hists(int nbins,
+                                           int block_stride_x, int block_stride_y,
+                                           int height, int width,
+                                           cv::ocl::oclMat &block_hists,
+                                           float threshold)
 {
     Context *clCxt = Context::getContext();
-    String kernelName = "normalize_hists_kernel";
     std::vector< std::pair<size_t, const void *> > args;
+    String kernelName;
 
     int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
     int nthreads = power_2up(block_hist_size);
@@ -1626,40 +1777,90 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl
     size_t globalThreads[3] = { img_block_width * nthreads, img_block_height, 1 };
     size_t localThreads[3] = { nthreads, 1, 1  };
 
-    if ((nthreads < 32) || (nthreads > 512) )
-        cv::error(Error::StsBadArg, "normalize_hists: histogram's size is too small or too big", "cv::ocl::device::hog::normalize_hists", __FILE__, __LINE__);
+    if ( nbins == 9 )
+    {
+        /* optimized for the case of 9 bins: dimension 0 alone spans every image block */
+        kernelName = "normalize_hists_36_kernel";
+        int blocks_in_group = NTHREADS / block_hist_size;
+        nthreads = blocks_in_group * block_hist_size;
+        globalThreads[0] = nthreads * divUp(img_block_width * img_block_height, blocks_in_group);
+        globalThreads[1] = 1; /* was left at img_block_height, repeating the launch once per block row */
+        localThreads[0] = nthreads;
+    }
+    else
+    {
+        kernelName = "normalize_hists_kernel";
+        nthreads = power_2up(block_hist_size);
+        globalThreads[0] = img_block_width * nthreads;
+        globalThreads[1] = img_block_height;
+        localThreads[0] = nthreads;
+
+        if ((nthreads < 32) || (nthreads > 512) )
+            cv::error(Error::StsBadArg, "normalize_hists: histogram's size is too small or too big",
+                "normalize_hists", __FILE__, __LINE__);
+
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&nthreads));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_hist_size));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
+    }
 
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nthreads));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( std::make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( std::make_pair( nthreads * sizeof(float), (void *)NULL));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    if(hog_device_cpu)
+        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
-        int block_stride_x, int win_stride_y, int win_stride_x, int height,
-        int width, const cv::ocl::oclMat &block_hists, const cv::ocl::oclMat &coefs, float free_coef,
-        float threshold, cv::ocl::oclMat &labels)
+void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
+                                          int block_stride_y, int block_stride_x,
+                                          int win_stride_y, int win_stride_x,
+                                          int height, int width,
+                                          const cv::ocl::oclMat &block_hists,
+                                          const cv::ocl::oclMat &coefs,
+                                          float free_coef, float threshold,
+                                          cv::ocl::oclMat &labels)
 {
     Context *clCxt = Context::getContext();
-    String kernelName = "classify_hists_kernel";
     std::vector< std::pair<size_t, const void *> > args;
 
+    // Kernel selection by descriptor width (cdescr_width):
+    //   180 or 252 -> dedicated kernel, passed the descriptor width and height
+    //   otherwise  -> generic kernel, passed the descriptor size and width
+    // nthreads becomes the work-group size along x further down; only the
+    // 180-wide variant uses 180, every other case uses 256.
+    const bool is_w180 = (cdescr_width == 180);
+    const bool is_w252 = (cdescr_width == 252);
+    int nthreads = is_w180 ? 180 : 256;
+    String kernelName;
+    if (is_w180 || is_w252)
+    {
+        kernelName = is_w180 ? "classify_hists_180_kernel" : "classify_hists_252_kernel";
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_height));
+    }
+    else
+    {
+        kernelName = "classify_hists_kernel";
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_size));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
+    }
+
     int win_block_stride_x = win_stride_x / block_stride_x;
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
-
-    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
 
+    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+    size_t localThreads[3] = { nthreads, 1, 1 };
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_win_width));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
@@ -1670,12 +1871,20 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
     args.push_back( std::make_pair( sizeof(cl_float), (void *)&threshold));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&labels.data));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    if(hog_device_cpu)
+        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+                             localThreads, args, -1, -1, "-D CPU");
+    else
+        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+                             localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x,
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists,
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     String kernelName = "extract_descrs_by_rows_kernel";
@@ -1685,7 +1894,8 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1701,12 +1911,16 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
-        int win_stride_y, int win_stride_x, int height, int width,
-        const cv::ocl::oclMat &block_hists, cv::ocl::oclMat &descriptors)
+void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
+                                                  int block_stride_y, int block_stride_x,
+                                                  int win_stride_y, int win_stride_x,
+                                                  int height, int width,
+                                                  const cv::ocl::oclMat &block_hists,
+                                                  cv::ocl::oclMat &descriptors)
 {
     Context *clCxt = Context::getContext();
     String kernelName = "extract_descrs_by_cols_kernel";
@@ -1716,7 +1930,8 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     int win_block_stride_y = win_stride_y / block_stride_y;
     int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
     int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
     int descriptors_quadstep = descriptors.step >> 2;
 
     size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
@@ -1733,16 +1948,16 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+        localThreads, args, -1, -1);
 }
 
-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
-void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale,
+                                                  cv::ocl::oclMat &grad,
+                                                  cv::ocl::oclMat &qangle,
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     String kernelName = "compute_gradients_8UC1_kernel";
@@ -1767,11 +1982,16 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
     args.push_back( std::make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+        localThreads, args, -1, -1);
 }
 
-void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-        float angle_scale, cv::ocl::oclMat &grad, cv::ocl::oclMat &qangle, bool correct_gamma)
+void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
+                                                  const cv::ocl::oclMat &img,
+                                                  float angle_scale,
+                                                  cv::ocl::oclMat &grad,
+                                                  cv::ocl::oclMat &qangle,
+                                                  bool correct_gamma)
 {
     Context *clCxt = Context::getContext();
     String kernelName = "compute_gradients_8UC4_kernel";
@@ -1797,37 +2017,6 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
     args.push_back( std::make_pair( sizeof(cl_char), (void *)&correctGamma));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnbins));
 
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
-{
-    CV_Assert( (src.channels() == dst.channels()) );
-    Context *clCxt = Context::getContext();
-
-    String kernelName = (src.type() == CV_8UC1) ? "resize_8UC1_kernel" : "resize_8UC4_kernel";
-    size_t blkSizeX = 16, blkSizeY = 16;
-    size_t glbSizeX = sz.width % blkSizeX == 0 ? sz.width : (sz.width / blkSizeX + 1) * blkSizeX;
-    size_t glbSizeY = sz.height % blkSizeY == 0 ? sz.height : (sz.height / blkSizeY + 1) * blkSizeY;
-    size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-    size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
-    float ifx = (float)src.cols / sz.width;
-    float ify = (float)src.rows / sz.height;
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&sz.width));
-    args.push_back( std::make_pair(sizeof(cl_int), (void *)&sz.height));
-    args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifx));
-    args.push_back( std::make_pair(sizeof(cl_float), (void *)&ify));
-
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
+        localThreads, args, -1, -1);
 }
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index 4b8fe58b89..3366cf7261 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -26,6 +26,7 @@
 //    Wu Zailong, bullet@yeah.net
 //    Wenju He, wenju@multicorewareinc.com
 //    Peng Xiao, pengxiao@outlook.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -81,6 +82,7 @@ namespace cv
         extern const char *imgproc_calcMinEigenVal;
         extern const char *imgproc_convolve;
         extern const char *imgproc_mulAndScaleSpectrums;
+        extern const char *imgproc_clahe;
         ////////////////////////////////////OpenCL call wrappers////////////////////////////
 
         template <typename T> struct index_and_sizeof;
@@ -1505,6 +1507,189 @@ namespace cv
             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
             LUT(mat_src, lut, mat_dst);
         }
+
+        ////////////////////////////////////////////////////////////////////////
+        // CLAHE
+        namespace clahe
+        {
+            // NOTE: despite the name, this rounds `total` UP to the next multiple of `grain`.
+            inline int divUp(int total, int grain) {
+                return ((total + grain - 1) / grain) * grain;
+            }
+
+            // Launches the "calcLut" kernel with one 32x8 work-group per tile (tilesX x tilesY groups).
+            // CPU devices build with " -D CPU"; other devices build with " -D WAVE_SIZE=<wavefront size>".
+            static void calcLut(const oclMat &src, oclMat &dst,
+                const int tilesX, const int tilesY, const cv::Size tileSize,
+                const int clipLimit, const float lutScale)
+            {
+                cl_int2 tile_size;
+                tile_size.s[0] = tileSize.width;
+                tile_size.s[1] = tileSize.height;
+
+                std::vector<std::pair<size_t , const void *> > args;
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
+                args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
+                args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
+
+                String kernelName = "calcLut";
+                size_t localThreads[3]  = { 32, 8, 1 };
+                size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
+                if (queryDeviceInfo<IS_CPU_DEVICE, bool>())
+                {
+                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
+                    return;
+                }
+                cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
+                int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+                openCLSafeCall(clReleaseKernel(kernel));
+                // " -D WAVE_SIZE=" is 14 chars and "%d" can print up to 11, so 20 bytes could overflow;
+                // a non-static buffer also keeps concurrent callers from sharing one option string.
+                char opt[32] = {0};
+                sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+                openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
+            }
+
+            // Runs the "transform" kernel over a grid covering src (32x8 work-groups, sizes rounded up by divUp).
+            static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
+                const int tilesX, const int tilesY, const cv::Size tileSize)
+            {
+                typedef std::pair<size_t , const void *> KernelArg;
+                cl_int2 tileDims;
+                tileDims.s[0] = tileSize.width;
+                tileDims.s[1] = tileSize.height;
+
+                std::vector<KernelArg> kernelArgs;
+                kernelArgs.push_back( KernelArg( sizeof(cl_mem), (void *)&src.data ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_mem), (void *)&dst.data ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_mem), (void *)&lut.data ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&src.step ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&dst.step ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&lut.step ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&src.cols ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&src.rows ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int2), (void *)&tileDims ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&tilesX ));
+                kernelArgs.push_back( KernelArg( sizeof(cl_int), (void *)&tilesY ));
+                String kernelName = "transform";
+                size_t block[3] = { 32, 8, 1 };
+                size_t grid[3]  = { divUp(src.cols, block[0]), divUp(src.rows, block[1]), 1 };
+                openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, grid, block, kernelArgs, -1, -1);
+            }
+        }
+
+        namespace
+        {
+            // OpenCL-backed CLAHE; keeps the clip limit, the tile grid and two reusable oclMat buffers.
+            class CLAHE_Impl : public cv::ocl::CLAHE
+            {
+            public:
+                CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+                // Processing entry point (asserts CV_8UC1 input) and buffer release.
+                void apply(const oclMat &src, oclMat &dst);
+                void collectGarbage();
+
+                // Parameter accessors.
+                double getClipLimit() const;
+                void setClipLimit(double clipLimit);
+                cv::Size getTilesGridSize() const;
+                void setTilesGridSize(cv::Size tileGridSize);
+
+                cv::AlgorithmInfo* info() const;
+
+            private:
+                double clipLimit_;   // clip limit as supplied by the caller; scaled per tile in apply()
+                int tilesX_;         // tile grid width
+                int tilesY_;         // tile grid height
+                oclMat srcExt_;      // padded copy of the source, used when it does not split evenly into tiles
+                oclMat lut_;         // per-tile lookup tables, one 256-entry row per tile
+            };
+
+            // Stores the clip limit and tile grid; the srcExt_ and lut_ buffers start default-constructed.
+            CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY)
+                : clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+            {}
+
+            // Applies CLAHE to an 8-bit single-channel image: builds one LUT per tile, then has the
+            // transform kernel blend the four nearest tile LUTs for every pixel of src into dst.
+            void CLAHE_Impl::apply(const oclMat &src, oclMat &dst)
+            {
+                CV_Assert( src.type() == CV_8UC1 );
+                // A non-positive grid would divide by zero in the tile-size arithmetic below.
+                CV_Assert( tilesX_ > 0 && tilesY_ > 0 );
+
+                dst.create( src.size(), src.type() );
+                const int histSize = 256;
+                ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
+
+                cv::Size tileSize;
+                oclMat srcForLut;
+                if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+                {
+                    tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+                    srcForLut = src;
+                }
+                else
+                {
+                    // Pad right/bottom by reflection so the extended image splits into equal tiles.
+                    cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
+                    tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+                    srcForLut = srcExt_;
+                }
+
+                const int tileSizeTotal = tileSize.area();
+                const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+
+                // Convert the relative clip limit into an absolute per-bin count (at least 1).
+                int clipLimit = 0;
+                if (clipLimit_ > 0.0)
+                {
+                    clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+                    clipLimit = std::max(clipLimit, 1);
+                }
+                clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
+                clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
+            }
+
+            // Stores the new clip limit; apply() reads it on its next call.
+            void CLAHE_Impl::setClipLimit(double clipLimit) {
+                this->clipLimit_ = clipLimit;
+            }
+
+            // Returns the clip limit exactly as it was last stored.
+            double CLAHE_Impl::getClipLimit() const {
+                return this->clipLimit_;
+            }
+
+            // Splits the requested grid size into the stored column (width) and row (height) counts.
+            void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize) {
+                tilesY_ = tileGridSize.height;
+                tilesX_ = tileGridSize.width;
+            }
+
+            // Rebuilds the grid size from the stored column and row counts.
+            cv::Size CLAHE_Impl::getTilesGridSize() const {
+                return cv::Size(this->tilesX_, this->tilesY_);
+            }
+
+            // Releases both internal oclMat buffers (LUT storage and the padded source copy).
+            void CLAHE_Impl::collectGarbage() {
+                lut_.release();
+                srcExt_.release();
+            }
+        }
+
+        // Factory: builds a CLAHE_Impl from the clip limit and the tile grid's width/height.
+        cv::Ptr<cv::ocl::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize) {
+            return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+        }
+
         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
         static void
         oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 53b1c2a77d..5d02423a21 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -121,8 +121,9 @@ namespace cv
             cacheSize = 0;
         }
 
-
-       struct Info::Impl
+        // not to be exported to dynamic lib
+        void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl);
+        struct Info::Impl
         {
             cl_platform_id oclplatform;
             std::vector<cl_device_id> devices;
@@ -140,22 +141,12 @@ namespace cv
             char extra_options[512];
             int  double_support;
             int unified_memory; //1 means integrated GPU, otherwise this value is 0
+            bool enable_disk_cache; 
+            bool update_disk_cache;
             String binpath;
             int refcounter;
 
-            Impl()
-            {
-                refcounter = 1;
-                oclplatform = 0;
-                oclcontext = 0;
-                clCmdQueue = 0;
-                devnum = -1;
-                maxComputeUnits = 0;
-                maxWorkGroupSize = 0;
-                memset(extra_options, 0, 512);
-                double_support = 0;
-                unified_memory = 0;
-            }
+            Impl();
 
             void setDevice(void *ctx, void *q, int devnum);
 
@@ -180,6 +171,25 @@ namespace cv
             void releaseResources();
         };
 
+        // Default state: zeroed OpenCL handles, no device selected (devnum -1), disk-cache flags
+        // cleared and binpath "./"; setBinaryDiskCacheImpl then applies the CACHE_RELEASE mode.
+        Info::Impl::Impl() :
+            oclplatform(0),
+            oclcontext(0),
+            clCmdQueue(0),
+            devnum(-1),
+            maxWorkGroupSize(0), maxDimensions(0), maxComputeUnits(0),
+            double_support(0),
+            unified_memory(0),
+            enable_disk_cache(false), update_disk_cache(false),
+            binpath("./"),
+            refcounter(1)
+        {
+            // extra_options is a raw char array, so it is zero-filled explicitly.
+            memset(extra_options, 0, sizeof(extra_options));
+            setBinaryDiskCacheImpl(CACHE_RELEASE, String("./"), this);
+        }
+
         void Info::Impl::releaseResources()
         {
             devnum = -1;
@@ -498,6 +508,24 @@ namespace cv
             return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
         }
 
+        // Applies a disk-cache mode to impl: the update flag follows CACHE_UPDATE, the enable flag
+        // follows CACHE_DEBUG when _DEBUG is defined (else CACHE_RELEASE); a non-empty path replaces binpath.
+        void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl)
+        {
+#ifdef _DEBUG
+            const int enableMask = CACHE_DEBUG;
+#else
+            const int enableMask = CACHE_RELEASE;
+#endif
+            impl->update_disk_cache = (mode & CACHE_UPDATE) == CACHE_UPDATE;
+            impl->enable_disk_cache = (mode & enableMask) == enableMask;
+            if(!path.empty() && impl->enable_disk_cache)
+                impl->binpath = path;
+        }
+        // Public entry point: applies mode/path to the impl of the context returned by Context::getContext().
+        void setBinaryDiskCache(int mode, cv::String path) {
+            setBinaryDiskCacheImpl(mode, path, Context::getContext()->impl);
+        }
 
         void setBinpath(const char *path)
         {
@@ -577,8 +605,8 @@ namespace cv
                     filename = clCxt->impl->binpath  + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
                 }
 
-                FILE *fp = fopen(filename.c_str(), "rb");
-                if(fp == NULL || clCxt->impl->binpath.size() == 0)    //we should generate a binary file for the first time.
+                FILE *fp = clCxt->impl->enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
+                if(fp == NULL || clCxt->impl->update_disk_cache)
                 {
                     if(fp != NULL)
                         fclose(fp);
@@ -587,7 +615,7 @@ namespace cv
                                   clCxt->impl->oclcontext, 1, source, NULL, &status);
                     openCLVerifyCall(status);
                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
-                    if(status == CL_SUCCESS && clCxt->impl->binpath.size())
+                    if(status == CL_SUCCESS && clCxt->impl->enable_disk_cache)
                         savetofile(clCxt, program, filename.c_str());
                 }
                 else
@@ -921,6 +949,14 @@ namespace cv
         int Context::val = 0;
         static Mutex cs;
         static volatile int context_tear_down = 0;
+
+        // True once Context::val is non-zero and the shared context has a command queue and a CL context.
+        bool initialized()
+        {
+            if (*((volatile int*)&Context::val) == 0) return false;
+            return Context::clCxt->impl->clCmdQueue != NULL && Context::clCxt->impl->oclcontext != NULL;
+        }
+
         Context* Context::getContext()
         {
             if(*((volatile int*)&val) != 1)
@@ -934,8 +970,6 @@ namespace cv
                     clCxt.reset(new Context);
                     std::vector<Info> oclinfo;
                     CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
-                    oclinfo[0].impl->setDevice(0, 0, 0);
-                    clCxt.get()->impl = oclinfo[0].impl->copy();
 
                     *((volatile int*)&val) = 1;
                 }
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index 4f23789b33..e6af56d577 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -73,6 +73,7 @@ namespace cv
     }
 }
 
+
 ////////////////////////////////////////////////////////////////////////
 // convert_C3C4
 static void convert_C3C4(const cl_mem &src, oclMat &dst)
@@ -215,6 +216,34 @@ void cv::ocl::oclMat::upload(const Mat &m)
     offset = ofs.y * step + ofs.x * elemSize();
 }
 
+// Wraps this oclMat in an _InputArray: flags is set to OCL_MAT and obj points back at this object.
+cv::ocl::oclMat::operator cv::_InputArray() {
+    _InputArray wrapped;
+    wrapped.obj   = reinterpret_cast<void *>(this);
+    wrapped.flags = cv::_InputArray::OCL_MAT;
+    return wrapped;
+}
+
+// Wraps this oclMat in an _OutputArray: flags is set to OCL_MAT and obj points back at this object.
+cv::ocl::oclMat::operator cv::_OutputArray() {
+    _OutputArray wrapped;
+    wrapped.obj   = reinterpret_cast<void *>(this);
+    wrapped.flags = cv::_InputArray::OCL_MAT;
+    return wrapped;
+}
+
+// Returns the oclMat behind src; the kind is compared for equality because OCL_MAT is a kind code, not a bit flag.
+cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src) {
+    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
+    return *reinterpret_cast<oclMat*>(src.obj);
+}
+
+// Returns the oclMat behind src; the kind is compared for equality because OCL_MAT is a kind code, not a bit flag.
+cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src) {
+    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
+    return *reinterpret_cast<oclMat*>(src.obj);
+}
+
 void cv::ocl::oclMat::download(cv::Mat &m) const
 {
     CV_DbgAssert(!this->empty());
@@ -382,7 +411,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
     if( rtype < 0 )
         rtype = type();
     else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels());
+        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 
     //int scn = channels();
     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index 27f8d26ecf..d4fd47b722 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -80,7 +80,7 @@ namespace cv
         // provide additional methods for the user to interact with the command queue after a task is fired
         static void openCLExecuteKernel_2(Context *clCxt , const char **source, String kernelName, size_t globalThreads[3],
                                    size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                   int depth, char *build_options, FLUSH_MODE finish_mode)
+                                   int depth, const char *build_options, FLUSH_MODE finish_mode)
         {
             //construct kernel name
             //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
@@ -133,7 +133,7 @@ namespace cv
         }
         void openCLExecuteKernel2(Context *clCxt , const char **source, String kernelName,
                                   size_t globalThreads[3], size_t localThreads[3],
-                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, char *build_options, FLUSH_MODE finish_mode)
+                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options, FLUSH_MODE finish_mode)
 
         {
             openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/ocl/src/opencl/imgproc_clahe.cl
new file mode 100644
index 0000000000..0d010f7a5b
--- /dev/null
+++ b/modules/ocl/src/opencl/imgproc_clahe.cl
@@ -0,0 +1,275 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+
+// Inclusive prefix sum over 256 per-thread values staged in smem; thread 0 scans serially
+// between two local barriers, then every thread returns the entry at its own index.
+int calc_lut(__local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid == 0)
+    {
+        int running = smem[0];
+        for (int bin = 1; bin < 256; ++bin)
+            smem[bin] = (running += smem[bin]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return smem[tid];
+}
+
+#ifdef CPU
+void reduce(volatile __local int* smem, int val, int tid)
+{   // CPU variant: sums the values of slots 0..255 of smem; the total is written to smem[256].
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Each step folds the upper half of the active range onto the lower half, with a barrier after it.
+    if (tid < 128)
+    { 
+        smem[tid] = val += smem[tid + 128];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+    { 
+        smem[tid] = val += smem[tid + 64];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+    {
+        smem[tid] += smem[tid + 16];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+    {
+        smem[tid] += smem[tid + 8];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 4)
+    {
+        smem[tid] += smem[tid + 4];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 2)
+    {
+        smem[tid] += smem[tid + 2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Final step: the sum of slots 0 and 1 goes to slot 256 instead of slot 0.
+    if (tid < 1)
+    {
+        smem[256] = smem[tid] + smem[tid + 1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+#else
+void reduce(__local volatile int* smem, int val, int tid)
+{   // Non-CPU variant: sums the values of slots 0..255 of smem; the total ends up in smem[0].
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // The 128- and 64-wide steps are always followed by a barrier.
+    if (tid < 128)
+    { 
+        smem[tid] = val += smem[tid + 128];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+    { 
+        smem[tid] = val += smem[tid + 64];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Below 64, a barrier and a narrower tid guard are compiled in only when WAVE_SIZE is smaller than the step.
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    } barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8) {
+#endif
+        smem[tid] += smem[tid + 8];
+        smem[tid] += smem[tid + 4];  // NOTE(review): the 8/4/2/1 steps have no barriers between them;
+        smem[tid] += smem[tid + 2];  // presumably this relies on the remaining work-items running in
+        smem[tid] += smem[tid + 1];  // lock-step - confirm for WAVE_SIZE < 8 (the file's default is 1).
+    }
+}
+#endif
+
+// Builds the 256-entry lookup table of one tile per work-group (group id = tile x/y):
+// local histogram -> optional clipping with even redistribution -> prefix sum -> scale to 0..255.
+// Indexing assumes 256 work-items per group (tid doubles as the histogram bin).
+__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
+                      const int srcStep, const int dstStep,
+                      const int2 tileSize, const int tilesX,
+                      const int clipLimit, const float lutScale)
+{
+    __local int smem[512];
+    // OpenCL C requires __local variables to be declared at kernel function scope.
+    __local int totalClipped;
+
+    const int tx = get_group_id(0);
+    const int ty = get_group_id(1);
+    const unsigned int tid = get_local_id(1) * get_local_size(0)
+                             + get_local_id(0);
+
+    smem[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
+    {
+        __global const uchar* srcPtr = src + mad24( ty * tileSize.y + i,
+                                                    srcStep, tx * tileSize.x );
+        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
+        {
+            const int data = srcPtr[j];
+            atomic_inc(&smem[data]);
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tHistVal = smem[tid];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (clipLimit > 0)
+    {
+        // clip histogram bar
+        int clipped = 0;
+        if (tHistVal > clipLimit)
+        {
+            clipped = tHistVal - clipLimit;
+            tHistVal = clipLimit;
+        }
+
+        // find number of overall clipped samples
+        reduce(smem, clipped, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef CPU
+        clipped = smem[256];
+#else
+        clipped = smem[0];
+#endif
+
+        // broadcast evaluated value
+        if (tid == 0)
+            totalClipped = clipped;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // redistribute clipped samples evenly
+        int redistBatch = totalClipped / 256;
+        tHistVal += redistBatch;
+
+        int residual = totalClipped - redistBatch * 256;
+        if (tid < residual)
+            ++tHistVal;
+    }
+
+    // prefix-sum the histogram and scale the result into this tile's row of the LUT
+    const int lutVal = calc_lut(smem, tHistVal, tid);
+    uint ires = (uint)convert_int_rte(lutScale * lutVal);
+    lut[(ty * tilesX + tx) * dstStep + tid] =
+        convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
+
+// Per-pixel CLAHE mapping: looks the source value up in the LUT rows of the four neighbouring
+// tiles and blends them bilinearly by the pixel's position relative to the tile centres.
+__kernel void transform(__global __const uchar * src,
+                        __global uchar * dst,
+                        __global uchar * lut,
+                        const int srcStep, const int dstStep, const int lutStep,
+                        const int cols, const int rows,
+                        const int2 tileSize,
+                        const int tilesX, const int tilesY)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x >= cols || y >= rows)
+        return;
+
+    // Fractional tile coordinates; weights are taken before the indices are clamped to the grid.
+    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
+    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
+    const int rowLo = convert_int_rtn(tyf);
+    const int colLo = convert_int_rtn(txf);
+    const float ya = tyf - rowLo;
+    const float xa = txf - colLo;
+
+    const int ty1 = max(rowLo, 0);
+    const int ty2 = min(rowLo + 1, tilesY - 1);
+    const int tx1 = max(colLo, 0);
+    const int tx2 = min(colLo + 1, tilesX - 1);
+
+    const int srcVal = src[mad24(y, srcStep, x)];
+
+    float res = 0;
+    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (1.0f - ya));
+    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (1.0f - ya));
+    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal)] * ((1.0f - xa) * (ya));
+    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal)] * ((xa) * (ya));
+
+    uint ires = (uint)convert_int_rte(res);
+    dst[mad24(y, dstStep, x)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl
index 64ae3ea980..509cf13ade 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@@ -43,7 +43,6 @@
 //
 //M*/
 
-
 #define CELL_WIDTH 8
 #define CELL_HEIGHT 8
 #define CELLS_PER_BLOCK_X 2
@@ -51,6 +50,100 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 
+//----------------------------------------------------------------------------
+// Histogram computation, lookup-table variant
+// 12 threads for a cell, 12x4 threads per block
+// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
+__kernel void compute_hists_lut_kernel(
+    const int cblock_stride_x, const int cblock_stride_y, // multiplied by the block's x/y index to form offset_x / offset_y
+    const int cnbins, const int cblock_hist_size, const int img_block_width, // bins per cell; floats stored per block; blocks per row
+    const int blocks_in_group, const int blocks_total, // blocks per work-group; blocks with gid >= blocks_total store nothing
+    const int grad_quadstep, const int qangle_step, // strides used to advance grad_ptr / qangle_ptr by one row
+    __global const float* grad, __global const uchar* qangle, // two values per position: votes (grad) and their bins (qangle)
+    __global const float* gauss_w_lut, // gaussian weight at [idx], interpolation weight at [256 + idx]
+    __global float* block_hists, __local float* smem) // output histograms; local scratch shared by the group's blocks
+{
+    const int lx = get_local_id(0);
+    const int lp = lx / 24; /* local group id: 24 work-items along x per block */
+    const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
+    const int gidY = gid / img_block_width;
+    const int gidX = gid - gidY * img_block_width;
+
+    const int lidX = lx - lp * 24;
+    const int lidY = get_local_id(1);
+
+    const int cell_x = lidX / 12;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX - cell_x * 12;
+    // Local scratch of this block: per-work-item partial bins, followed by the reduced histogram.
+    __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 
+        CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
+    __local float* final_hist = hists + cnbins * 
+        (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
+
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+    // Blocks past blocks_total read from the start of the buffers; their result is not stored (see the final store).
+    __global const float* grad_ptr = (gid < blocks_total) ? 
+        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
+    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
+    // This work-item's private slot; consecutive bins of one slot are 48 floats apart.
+    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 
+        cell_thread_x;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+        hist[bin_id * 48] = 0.f; // clear every bin of this work-item's slot
+
+    const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+    const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+    const int dist_y_begin = -4 - 4 * lidY;
+    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) // 12 rows per work-item
+    {
+        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+        grad_ptr += grad_quadstep; // step to the next row
+        qangle_ptr += qangle_step;
+
+        int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+
+        int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8); // row-major index with 16 entries per row
+        float gaussian = gauss_w_lut[idx];
+        idx = (dist_y + 8) * 16 + (dist_x + 8);
+        float interp_weight = gauss_w_lut[256+idx]; // second table starts 256 floats in
+
+        hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+        hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // all partial bins are written before the reduction reads them
+    // Per bin: fold the 12 per-work-item partial sums of a cell (12 -> 6 -> 3 -> 1).
+    volatile __local float* hist_ = hist;
+    for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+    {
+        if (cell_thread_x < 6)
+            hist_[0] += hist_[6];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (cell_thread_x < 3)
+            hist_[0] += hist_[3];
+#ifdef CPU
+        barrier(CLK_LOCAL_MEM_FENCE); // only when CPU is defined; otherwise hist_[1] and hist_[2] are read below without a barrier
+#endif
+        if (cell_thread_x == 0)
+            final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 
+                hist_[0] + hist_[1] + hist_[2];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // final_hist is complete before it is copied out
+
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
+    if ((tid < cblock_hist_size) && (gid < blocks_total)) // one float per work-item, valid blocks only
+    {
+        __global float* block_hist = block_hists + 
+            (gidY * img_block_width + gidX) * cblock_hist_size;
+        block_hist[tid] = final_hist[tid];
+    }
+}
+
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@@ -125,16 +218,14 @@ __kernel void compute_hists_kernel(
         barrier(CLK_LOCAL_MEM_FENCE);
         if (cell_thread_x < 3)
             hist_[0] += hist_[3];
-#ifdef WAVE_SIZE_1
+#ifdef CPU
         barrier(CLK_LOCAL_MEM_FENCE);
 #endif
         if (cell_thread_x == 0)
             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
                 hist_[0] + hist_[1] + hist_[2];
     }
-#ifdef WAVE_SIZE_1
     barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
     if ((tid < cblock_hist_size) && (gid < blocks_total))
@@ -145,6 +236,57 @@ __kernel void compute_hists_kernel(
     }
 }
 
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//  optimized for the case of 9 bins (36 floats per block histogram)
+//  One work-item per histogram element; block_hists is updated in place.
+//
+//  hog_sum36: sums the 36 floats of one segment of local memory and
+//  returns the total to every work-item of that segment. It contains
+//  barriers, so all work-items of the group have to call it together.
+float hog_sum36(__local float* seg, const int lane)
+{
+    float acc = seg[lane];
+    if (lane < 18)  /* 36 -> 18 partial sums */
+        seg[lane] = acc = acc + seg[lane + 18];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lane < 9)   /* 18 -> 9 */
+        seg[lane] = acc = acc + seg[lane + 9];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lane < 4)   /* slots 0..7 -> 0..3, slot 8 stays as is */
+        seg[lane] = acc + seg[lane + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    /* every lane adds the four partial sums and the untouched slot 8 */
+    return seg[0] + seg[1] + seg[2] + seg[3] + seg[8];
+}
+
+__kernel void normalize_hists_36_kernel(__global float* block_hists, 
+                                        const float threshold, __local float *squares)
+{
+    const int lid = get_local_id(0);
+    const int gidx = get_global_id(0);
+    const int seg_base = (lid / 36) * 36;  /* block-hist offset in the work-group */
+    const int lane = lid - seg_base;       /* histogram bin id, (0 - 35) */
+    __local float* seg = squares + seg_base;
+
+    /* pass 1: scale by the L2 norm of the raw values, then clip */
+    float value = block_hists[gidx];
+    squares[lid] = value * value;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float norm2 = hog_sum36(seg, lane);
+
+    value = value / (sqrt(norm2) + 3.6f);
+    value = min(value, threshold);
+
+    /* pass 2: renormalize the clipped values */
+    barrier(CLK_LOCAL_MEM_FENCE);  /* pass-1 sums are read before squares is overwritten */
+    squares[lid] = value * value;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    norm2 = hog_sum36(seg, lane);
+
+    block_hists[gidx] = value / (sqrt(norm2) + 1e-3f);
+}
+
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //
@@ -153,76 +295,50 @@ float reduce_smem(volatile __local float* smem, int size)
     unsigned int tid = get_local_id(0);
     float sum = smem[tid];
 
-    if (size >= 512)
-    {
-        if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 256)
-    {
-        if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (size >= 128)
-    {
-        if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#ifdef CPU
+    if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }	
+    if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }		
+    if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; 
+        barrier(CLK_LOCAL_MEM_FENCE); }
+#else
     if (tid < 32)
     {
         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-#endif
         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
-#endif
         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
-#endif
         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
     }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    sum = smem[0];
+#endif
 
     return sum;
 }
 
-__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
-                                     __global float* block_hists, const float threshold, __local float *squares)
+__kernel void normalize_hists_kernel(
+    const int nthreads, const int block_hist_size, const int img_block_width,
+    __global float* block_hists, const float threshold, __local float *squares)
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
+    __global float* hist = block_hists + (gidY * img_block_width + gidX) * 
+        block_hist_size + tid;
 
     float elem = 0.f;
     if (tid < block_hist_size)
@@ -249,25 +365,98 @@ __kernel void normalize_hists_kernel(const int nthreads, const int block_hist_si
 
 //---------------------------------------------------------------------
 //  Linear SVM based classification
-//
-__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
-                                    const int img_win_width, const int img_block_width,
-                                    const int win_block_stride_x, const int win_block_stride_y,
-                                    __global const float * block_hists, __global const float* coefs,
-                                    float free_coef, float threshold, __global uchar* labels)
+//  48x96 window, 9 bins and default parameters
+//  180 threads, each thread corresponds to a bin in a row
+__kernel void classify_hists_180_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size, // descriptor row length and row count; floats per block histogram
+    const int img_win_width, const int img_block_width, // row pitch of labels (in windows) and of block_hists (in blocks)
+    const int win_block_stride_x, const int win_block_stride_y, // multiplied by the group id to locate the window's first block
+    __global const float * block_hists, __global const float* coefs, // block histograms and the classifier coefficients
+    float free_coef, float threshold, __global uchar* labels) // added to the dot product; decision threshold; one output byte per work-group
 {
     const int tid = get_local_id(0);
     const int gidX = get_group_id(0);
     const int gidY = get_group_id(1);
 
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     float product = 0.f;
-    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    // Work-item tid accumulates element tid of every descriptor row.
+    for (int i = 0; i < cdescr_height; i++)
     {
-        int offset_y = i / cdescr_width;
-        int offset_x = i - offset_y * cdescr_width;
-        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+        product += coefs[i * cdescr_width + tid] * 
+            hist[i * img_block_width * cblock_hist_size + tid];
+    }
+
+    __local float products[180]; // indexed by tid; assumes a work-group of 180 work-items (per the header comment)
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 90) products[tid] = product = product + products[tid + 90]; // 180 -> 90 partial sums
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 45) products[tid] = product = product + products[tid + 45]; // 90 -> 45
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    volatile __local float* smem = products;
+#ifdef CPU
+    if (tid < 13) smem[tid] = product = product + smem[tid + 32]; // 45 -> 32: fold slots 32..44 into 0..12
+	barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) smem[tid] = product = product + smem[tid + 16]; // 32 -> 16
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8]; // 16 -> 8
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4]; // 8 -> 4
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2]; // 4 -> 2
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 13)
+    { // no barriers from here on in this path. NOTE(review): presumably relies on these work-items running in lockstep; confirm for the target devices
+        smem[tid] = product = product + smem[tid + 32]; // 45 -> 32
+    }
+    if (tid < 16)
+    {
+        smem[tid] = product = product + smem[tid + 16]; // 32 -> 16
+        smem[tid] = product = product + smem[tid + 8]; // only the values in the low slots matter from here on
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2]; // slots 0 and 1 now hold the last two partial sums
+    }
+#endif
+
+    if (tid == 0){ // last 2 -> 1 step, then write this work-group's label
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); // 1 when the score reaches threshold, else 0
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  64x128 window, 9 bins and default parameters
+//  256 threads, 252 of them are used
+__kernel void classify_hists_252_kernel(
+    const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
+    const int img_win_width, const int img_block_width,
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float * block_hists, __global const float* coefs,
+    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    if (tid < cdescr_width)
+    {
+        for (int i = 0; i < cdescr_height; i++)
+            product += coefs[i * cdescr_width + tid] * 
+                hist[i * img_block_width * cblock_hist_size + tid];
     }
 
     __local float products[NTHREADS];
@@ -282,67 +471,120 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
     if (tid < 64) products[tid] = product = product + products[tid + 64];
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    volatile __local float* smem = products;
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2];
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
     if (tid < 32)
-    {
+    {      
         smem[tid] = product = product + smem[tid + 32];
-#if defined(WAVE_SIZE_16) || defined(WAVE_SIZE_1)
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 16)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 16];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 8];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 4)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 4];
-#ifdef WAVE_SIZE_1
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 2)
-    {
-#endif
         smem[tid] = product = product + smem[tid + 2];
-#ifdef WAVE_SIZE_1
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 1)
-    {
 #endif
-        smem[tid] = product = product + smem[tid + 1];
+    if (tid == 0){
+		product = product + smem[tid + 1];
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//  256 threads; each work-group writes one entry of labels
+__kernel void classify_hists_kernel(
+    const int cdescr_size, const int cdescr_width, const int cblock_hist_size, // descriptor length; descriptor row length; floats per block histogram
+    const int img_win_width, const int img_block_width, // row pitch of labels (in windows) and of block_hists (in blocks)
+    const int win_block_stride_x, const int win_block_stride_y, // multiplied by the group id to locate the window's first block
+    __global const float * block_hists, __global const float* coefs, // block histograms and the classifier coefficients
+    float free_coef, float threshold, __global uchar* labels) // added to the dot product; decision threshold; output labels
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS) // strided walk: work-item tid takes elements tid, tid + NTHREADS, ...
+    {
+        int offset_y = i / cdescr_width; // descriptor row
+        int offset_x = i - offset_y * cdescr_width; // position inside that row
+        product += coefs[i] * 
+            hist[offset_y * img_block_width * cblock_hist_size + offset_x];
     }
 
-    if (tid == 0)
+    __local float products[NTHREADS]; // one partial dot product per work-item, indexed by tid
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) products[tid] = product = product + products[tid + 128]; // 256 -> 128 partial sums
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) products[tid] = product = product + products[tid + 64]; // 128 -> 64
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+	volatile __local float* smem = products;
+#ifdef CPU
+	if(tid<32) smem[tid] = product = product + smem[tid + 32]; // 64 -> 32
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<16) smem[tid] = product = product + smem[tid + 16]; // 32 -> 16
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<8) smem[tid] = product = product + smem[tid + 8]; // 16 -> 8
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<4) smem[tid] = product = product + smem[tid + 4]; // 8 -> 4
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(tid<2) smem[tid] = product = product + smem[tid + 2]; // 4 -> 2
+	barrier(CLK_LOCAL_MEM_FENCE);
+#else
+    if (tid < 32)
+    { // no barriers in this path. NOTE(review): presumably relies on these 32 work-items running in lockstep; confirm for the target devices
+        smem[tid] = product = product + smem[tid + 32];
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2]; // slots 0 and 1 now hold the last two partial sums
+    }
+#endif
+    if (tid == 0){ // last 2 -> 1 step, then write this work-group's label
+		smem[tid] = product = product + smem[tid + 1];
         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+	}
 }
 
 //----------------------------------------------------------------------------
 // Extract descriptors
 
-__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
-        const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
-        __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_rows_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, 
+    const int cdescr_size, const int cdescr_width, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y,
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -353,19 +595,23 @@ __kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const in
     }
 }
 
-__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
-        const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
-        const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+__kernel void extract_descrs_by_cols_kernel(
+    const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
+    const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, 
+    const int win_block_stride_x, const int win_block_stride_y, 
+    __global const float* block_hists, __global float* descriptors)
 {
     int tid = get_local_id(0);
     int gidX = get_group_id(0);
     int gidY = get_group_id(1);
 
     // Get left top corner of the window in src
-    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+    __global const float* hist = block_hists +  (gidY * win_block_stride_y * 
+        img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
 
     // Get left top corner of the window in dst
-    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+    __global float* descriptor = descriptors + 
+        (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
 
     // Copy elements from src to dst
     for (int i = tid; i < cdescr_size; i += NTHREADS)
@@ -376,16 +622,19 @@ __kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const in
         int y = block_idx / cnblocks_win_x;
         int x = block_idx - y * cnblocks_win_x;
 
-        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = 
+            hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
     }
 }
 
 //----------------------------------------------------------------------------
 // Gradients computation
 
-__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        const __global uchar4 * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC4_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -426,8 +675,10 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     barrier(CLK_LOCAL_MEM_FENCE);
     if (x < width)
     {
-        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
-        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], 
+            sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], 
+            sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
 
         float3 dx;
         if (correct_gamma == 1)
@@ -482,9 +733,11 @@ __kernel void compute_gradients_8UC4_kernel(const int height, const int width, c
     }
 }
 
-__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
-        __global const uchar * img, __global float * grad, __global uchar * qangle,
-        const float angle_scale, const char correct_gamma, const int cnbins)
+__kernel void compute_gradients_8UC1_kernel(
+    const int height, const int width, 
+    const int img_step, const int grad_quadstep, const int qangle_step,
+    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
     const int tid = get_local_id(0);
@@ -539,43 +792,4 @@ __kernel void compute_gradients_8UC1_kernel(const int height, const int width, c
         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
     }
-}
-
-//----------------------------------------------------------------------------
-// Resize
-
-__kernel void resize_8UC4_kernel(__global uchar4 * dst, __global const uchar4 * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
-    int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
-}
-
-__kernel void resize_8UC1_kernel(__global uchar * dst, __global const uchar * src,
-                                 int dst_offset, int src_offset, int dst_step, int src_step,
-                                 int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    int sx = (int)floor(dx*ifx+0.5f);
-    int sy = (int)floor(dy*ify+0.5f);
-    sx = min(sx, src_cols-1);
-    sy = min(sy, src_rows-1);
-    int dpos = dst_offset + dy * dst_step + dx;
-    int spos = src_offset + sy * src_step + sx;
-
-    if(dx<dst_cols && dy<dst_rows)
-        dst[dpos] = src[spos];
 }
\ No newline at end of file
diff --git a/modules/ocl/test/test_haar.cpp b/modules/ocl/test/test_haar.cpp
deleted file mode 100644
index fa6dd68073..0000000000
--- a/modules/ocl/test/test_haar.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Sen Liu, swjutls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/objdetect.hpp"
-#include "precomp.hpp"
-
-#if 0 //def HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern string workdir;
-
-namespace
-{
-IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
-CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
-CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
-struct getRect
-{
-    Rect operator ()(const CvAvgComp &e) const
-    {
-        return e.rect;
-    }
-};
-}
-
-PARAM_TEST_CASE(Haar, double, int, CascadeName)
-{
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-
-    double scale;
-    int flags;
-    std::string cascadeName;
-
-    virtual void SetUp()
-    {
-        scale = GET_PARAM(0);
-        flags = GET_PARAM(1);
-        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(2));
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-    }
-};
-
-////////////////////////////////faceDetect/////////////////////////////////////////////////
-TEST_P(Haar, FaceDetect)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, COLOR_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    CvSeq *_objects;
-    image.upload(smallImg);
-    _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, flags, Size(30, 30), Size(0, 0) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-    
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-TEST_P(Haar, FaceDetectUseBuf)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << "Couldn't read " << imgName << std::endl;
-        return ;
-    }
-
-    vector<Rect> faces, oclfaces;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    cv::ocl::oclMat image;
-    image.upload(smallImg);
-
-    cv::ocl::OclCascadeClassifierBuf cascadebuf;
-    if( !cascadebuf.load( cascadeName ) )
-    {
-        cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << endl;
-        return;
-    }
-    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-
-    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
-                                 flags,
-                                 Size(30, 30), Size(0, 0) );
-    EXPECT_EQ(faces.size(), oclfaces.size());
-
-    // intentionally run ocl facedetect again and check if it still works after the first run
-    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
-        flags,
-        Size(30, 30));
-    cascadebuf.release();
-    EXPECT_EQ(faces.size(), oclfaces.size());
-}
-
-INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
-    Combine(Values(1.0),
-            Values(CV_HAAR_SCALE_IMAGE, 0), Values(cascade_frontalface_alt, cascade_frontalface_alt2)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
index 7c8b5c829f..2fb29318fa 100644
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@@ -23,6 +23,7 @@
 //    Rock Li, Rock.Li@amd.com
 //    Wu Zailong, bullet@yeah.net
 //    Xu Pang, pangxu010@163.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -1393,6 +1394,46 @@ TEST_P(calcHist, Mat)
         EXPECT_MAT_NEAR(dst_hist, cpu_hist, 0.0);
     }
 }
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// CLAHE
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(ClipLimit, double) // test-parameter type for the CLAHE clip limit; the fixture reads it back into a double
+}
+
+PARAM_TEST_CASE(CLAHE, cv::Size, ClipLimit)
+{
+    cv::Size size;          // dimensions of the random source image
+    double clipLimit;       // clip limit handed to both CLAHE implementations
+
+    cv::Mat src;            // host-side input
+    cv::Mat dst_gold;       // CPU reference output
+
+    cv::ocl::oclMat g_src;  // device copy of src
+    cv::ocl::oclMat g_dst;  // OpenCL output
+
+    // Draws a random 8-bit single-channel image and uploads it to the device.
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+        clipLimit = GET_PARAM(1);
+        // The RNG-less overload forwards TS::ptr()->get_rng() and the same 'false' flag.
+        src = randomMat(size, CV_8UC1, 0, 256);
+        g_src.upload(src);
+    }
+};
+
+TEST_P(CLAHE, Accuracy)
+{
+    // CPU reference result.
+    cv::Ptr<cv::CLAHE> clahe_gold = cv::createCLAHE(clipLimit);
+    clahe_gold->apply(src, dst_gold);
+
+    // OpenCL result, read back to the host; a difference of up to 1.0 is tolerated.
+    cv::Ptr<cv::ocl::CLAHE> clahe = cv::ocl::createCLAHE(clipLimit);
+    clahe->apply(g_src, g_dst);
+    EXPECT_MAT_NEAR(dst_gold, cv::Mat(g_dst), 1.0);
+}
 
 ///////////////////////////Convolve//////////////////////////////////
 PARAM_TEST_CASE(ConvolveTestBase, MatType, bool)
@@ -1532,6 +1573,47 @@ TEST_P(Convolve, Mat)
     }
 }
 
+//////////////////////////////// ColumnSum //////////////////////////////////////
+PARAM_TEST_CASE(ColumnSum, cv::Size)
+{
+    cv::Size size; // size of the random input matrix, taken from the test parameter
+    cv::Mat src;   // NOTE(review): never assigned here; the Accuracy test declares its own local 'src' that shadows it
+
+    virtual void SetUp()
+    {
+        size = GET_PARAM(0);
+    }
+};
+
+TEST_P(ColumnSum, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1);
+    cv::ocl::oclMat d_src(src);
+    cv::ocl::oclMat d_dst;
+
+    cv::ocl::columnSum(d_src, d_dst);
+
+    cv::Mat dst(d_dst);
+
+    // Walk the image top to bottom and turn src into its own running column
+    // sum in place: once row r has been visited, src(r, c) holds the sum of
+    // the original rows 0..r of column c, which is the reference value for
+    // dst(r, c). Row 0 is its own sum, so it is compared unchanged.
+    for (int row = 0; row < src.rows; ++row)
+    {
+        for (int col = 0; col < src.cols; ++col)
+        {
+            float &expected = src.at<float>(row, col);
+            if (row > 0)
+                expected += src.at<float>(row - 1, col);
+
+            const float actual = dst.at<float>(row, col);
+            ASSERT_NEAR(actual, expected, 1e-5);
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////////////////////////
+
 INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine(
                             ONE_TYPE(CV_8UC1),
                             NULL_TYPE,
@@ -1643,7 +1725,10 @@ INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine(
                             ONE_TYPE(CV_32SC1) //no use
                         ));
 
-//INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
-//                            Values(CV_32FC1, CV_32FC1),
-//                            Values(false))); // Values(false) is the reserved parameter
+INSTANTIATE_TEST_CASE_P(ImgProc, CLAHE, Combine(
+                        Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300)), // source image sizes
+                        Values(0.0, 40.0))); // clip limits
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES); // sizes come from DIFFERENT_SIZES, which is not defined in this hunk
+
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_hog.cpp b/modules/ocl/test/test_objdetect.cpp
similarity index 50%
rename from modules/ocl/test/test_hog.cpp
rename to modules/ocl/test/test_objdetect.cpp
index e968d04440..ad35270c92 100644
--- a/modules/ocl/test/test_hog.cpp
+++ b/modules/ocl/test/test_objdetect.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Wenju He, wenju@multicorewareinc.com
+//		Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,51 +45,61 @@
 
 #include "precomp.hpp"
 
+#include "opencv2/objdetect.hpp"
+#include "opencv2/objdetect/objdetect_c.h"
+
 using namespace std;
+using namespace cv;
+using namespace testing;
+
 #ifdef HAVE_OPENCL
 
 extern string workdir;
-PARAM_TEST_CASE(HOG, cv::Size, int)
+
+///////////////////// HOG /////////////////////////////
+PARAM_TEST_CASE(HOG, Size, int)
 {
-    cv::Size winSize;
+    Size winSize; // detection window size (test parameter 0)
     int type;
+    Mat img_rgb; // colour test image shared by the HOG tests, loaded once per test in SetUp()
     virtual void SetUp()
     {
         winSize = GET_PARAM(0);
         type = GET_PARAM(1);
+        img_rgb = readImage(workdir + "../gpu/road.png");
+        // A missing image is a fatal fixture error: gtest skips the test body
+        // after a fatal failure in SetUp(), so the tests never hand an empty
+        // Mat to cvtColor.
+        ASSERT_FALSE(img_rgb.empty()) << "Couldn't read road.png";
     }
 };
 
 TEST_P(HOG, GetDescriptors)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
     // Convert image
-    cv::Mat img;
+    Mat img;
     switch (type)
     {
     case CV_8UC1:
-        cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
+        cvtColor(img_rgb, img, COLOR_BGR2GRAY);
         break;
     case CV_8UC4:
     default:
-        cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
+        cvtColor(img_rgb, img, COLOR_BGR2BGRA);
         break;
     }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);
 
     // HOGs
-    cv::ocl::HOGDescriptor ocl_hog;
+    ocl::HOGDescriptor ocl_hog;
     ocl_hog.gamma_correction = true;
-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
     hog.gammaCorrection = true;
 
     // Compute descriptor
-    cv::ocl::oclMat d_descriptors;
+    ocl::oclMat d_descriptors;
     ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
-    cv::Mat down_descriptors;
+    Mat down_descriptors;
     d_descriptors.download(down_descriptors);
     down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);
 
@@ -105,45 +115,34 @@ TEST_P(HOG, GetDescriptors)
         hog.compute(img_rgb, descriptors, ocl_hog.win_size);
         break;
     }
-    cv::Mat cpu_descriptors(descriptors);
+    Mat cpu_descriptors(descriptors);
 
     EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
 }
 
-
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
-{
-    return ((abs(r1.x - r2.x) < threshold) && (abs(r1.y - r2.y) < threshold) &&
-            (abs(r1.width - r2.width) < threshold) && (abs(r1.height - r2.height) < threshold));
-}
-
 TEST_P(HOG, Detect)
 {
-    // Load image
-    cv::Mat img_rgb = readImage(workdir + "lena.jpg");
-    ASSERT_FALSE(img_rgb.empty());
-
     // Convert image
-    cv::Mat img;
+    Mat img;
     switch (type)
     {
     case CV_8UC1:
-        cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
+        cvtColor(img_rgb, img, COLOR_BGR2GRAY);
         break;
     case CV_8UC4:
     default:
-        cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
+        cvtColor(img_rgb, img, COLOR_BGR2BGRA);
         break;
     }
-    cv::ocl::oclMat d_img(img);
+    ocl::oclMat d_img(img);
 
     // HOGs
-    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
-        winSize = cv::Size(64, 128);
-    cv::ocl::HOGDescriptor ocl_hog(winSize);
+    if ((winSize != Size(48, 96)) && (winSize != Size(64, 128)))
+        winSize = Size(64, 128);
+    ocl::HOGDescriptor ocl_hog(winSize);
     ocl_hog.gamma_correction = true;
 
-    cv::HOGDescriptor hog;
+    HOGDescriptor hog;
     hog.winSize = winSize;
     hog.gammaCorrection = true;
 
@@ -165,88 +164,119 @@ TEST_P(HOG, Detect)
     }
 
     // OpenCL detection
-    std::vector<cv::Rect> d_found;
-    ocl_hog.detectMultiScale(d_img, d_found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+    std::vector<Rect> d_found;
+    ocl_hog.detectMultiScale(d_img, d_found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
 
     // CPU detection
-    std::vector<cv::Rect> found;
+    std::vector<Rect> found;
     switch (type)
     {
     case CV_8UC1:
-        hog.detectMultiScale(img, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
         break;
     case CV_8UC4:
     default:
-        hog.detectMultiScale(img_rgb, found, 0, cv::Size(8, 8), cv::Size(0, 0), 1.05, 2);
+        hog.detectMultiScale(img_rgb, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
         break;
     }
 
-    // Ground-truth rectangular people window
-    cv::Rect win1_64x128(231, 190, 72, 144);
-    cv::Rect win2_64x128(621, 156, 97, 194);
-    cv::Rect win1_48x96(238, 198, 63, 126);
-    cv::Rect win2_48x96(619, 161, 92, 185);
-    cv::Rect win3_48x96(488, 136, 56, 112);
-
-    // Compare whether ground-truth windows are detected and compare the number of windows detected.
-    std::vector<int> d_comp(4);
-    std::vector<int> comp(4);
-    for(int i = 0; i < (int)d_comp.size(); i++)
-    {
-        d_comp[i] = 0;
-        comp[i] = 0;
-    }
-
-    int threshold = 10;
-    int val = 32;
-    d_comp[0] = (int)d_found.size();
-    comp[0] = (int)found.size();
-    if (winSize == cv::Size(48, 96))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_48x96, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_48x96, threshold))
-                d_comp[2] = val;
-            if (match_rect(d_found[i], win3_48x96, threshold))
-                d_comp[3] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_48x96, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_48x96, threshold))
-                comp[2] = val;
-            if (match_rect(found[i], win3_48x96, threshold))
-                comp[3] = val;
-        }
-    }
-    else if (winSize == cv::Size(64, 128))
-    {
-        for(int i = 0; i < (int)d_found.size(); i++)
-        {
-            if (match_rect(d_found[i], win1_64x128, threshold))
-                d_comp[1] = val;
-            if (match_rect(d_found[i], win2_64x128, threshold))
-                d_comp[2] = val;
-        }
-        for(int i = 0; i < (int)found.size(); i++)
-        {
-            if (match_rect(found[i], win1_64x128, threshold))
-                comp[1] = val;
-            if (match_rect(found[i], win2_64x128, threshold))
-                comp[2] = val;
-        }
-    }
-
-    EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3);
+    EXPECT_LT(checkRectSimilarity(img.size(), found, d_found), 1.0);
 }
 
 
 INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
-                            testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+                            testing::Values(Size(64, 128), Size(48, 96)),
                             testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
 
+#if 0
+///////////////////////////// Haar //////////////////////////////
+IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
+CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
+CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
+struct getRect
+{
+    Rect operator ()(const CvAvgComp &e) const // functor for std::transform: keeps only the rectangle of a detection
+    {
+        return e.rect;
+    }
+};
+
+PARAM_TEST_CASE(Haar, int, CascadeName)
+{
+    ocl::OclCascadeClassifier cascade, nestedCascade; // nestedCascade is not used by SetUp() or the two Haar tests
+    CascadeClassifier cpucascade, cpunestedCascade;   // CPU reference classifier; cpunestedCascade is likewise unused
+
+    int flags;                    // detection flags forwarded to every detector call (test parameter 0)
+    std::string cascadeName;      // full path of the cascade XML file
+    vector<Rect> faces, oclfaces; // CPU and OpenCL detections
+    Mat img;                      // equalized grayscale input
+    ocl::oclMat d_img;            // device copy of img
+
+    virtual void SetUp()
+    {
+        flags = GET_PARAM(0);
+        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(1));
+        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
+        {
+            std::cout << "ERROR: Could not load classifier cascade" << std::endl;
+            return; // NOTE(review): only logs; no gtest failure is raised, so the test body still runs
+        }
+        img = readImage(workdir + "lena.jpg", IMREAD_GRAYSCALE);
+        if(img.empty())
+        {
+            std::cout << "Couldn't read lena.jpg" << std::endl;
+            return ; // NOTE(review): same as above, the test continues with an empty image
+        }
+        equalizeHist(img, img);
+        d_img.upload(img);
+    }
+};
+
+TEST_P(Haar, FaceDetect)
+{
+    MemStorage storage(cvCreateMemStorage(0)); // storage handed to oclHaarDetectObjects for the returned sequence
+    CvSeq *_objects;
+    _objects = cascade.oclHaarDetectObjects(d_img, storage, 1.1, 3,
+                                            flags, Size(30, 30), Size(0, 0));
+    vector<CvAvgComp> vecAvgComp;
+    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
+    oclfaces.resize(vecAvgComp.size());
+    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect()); // CvAvgComp -> Rect
+
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0)); // CPU reference with the same scale, neighbours, flags and sizes
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0); // CPU detections first, OpenCL detections second
+}
+
+TEST_P(Haar, FaceDetectUseBuf)
+{
+    ocl::OclCascadeClassifierBuf cascadebuf;
+    if(!cascadebuf.load(cascadeName))
+    {
+        std::cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << std::endl;
+        return; // NOTE(review): returns without raising a gtest failure, so the test is reported as passed
+    }
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0));
+    cpucascade.detectMultiScale(img, faces,  1.1, 3,
+                                flags,
+                                Size(30, 30), Size(0, 0)); // CPU reference
+
+    // intentionally run ocl facedetect again and check if it still works after the first run
+    cascadebuf.detectMultiScale(d_img, oclfaces,  1.1, 3,
+                                flags,
+                                Size(30, 30)); // only the result of this second run is compared below
+    cascadebuf.release();
+
+    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, Haar,
+    Combine(Values(CV_HAAR_SCALE_IMAGE, 0),
+            Values(cascade_frontalface_alt/*, cascade_frontalface_alt2*/)));
+#endif
 
 #endif //HAVE_OPENCL
diff --git a/modules/ocl/test/test_pyrdown.cpp b/modules/ocl/test/test_pyramids.cpp
similarity index 75%
rename from modules/ocl/test/test_pyrdown.cpp
rename to modules/ocl/test/test_pyramids.cpp
index 6d00fb5e45..1bd188dea6 100644
--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyramids.cpp
@@ -15,7 +15,6 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Dachuan Zhao, dachuan@multicorewareinc.com
 //    Yao Wang yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -56,11 +55,12 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-PARAM_TEST_CASE(PyrDown, MatType, int)
+PARAM_TEST_CASE(PyrBase, MatType, int)
 {
     int type;
     int channels;
-
+    Mat dst_cpu;
+    oclMat gdst;
     virtual void SetUp()
     {
         type = GET_PARAM(0);
@@ -69,19 +69,19 @@ PARAM_TEST_CASE(PyrDown, MatType, int)
 
 };
 
+/////////////////////// PyrDown //////////////////////////
+struct PyrDown : PyrBase {};
 
 TEST_P(PyrDown, Mat)
 {
     for(int j = 0; j < LOOP_TIMES; j++)
     {
-        cv::Size size(MWIDTH, MHEIGHT);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Mat src = randomMat(rng, size, CV_MAKETYPE(type, channels), 0, 100, false);
-
-        cv::ocl::oclMat gsrc(src), gdst;
-        cv::Mat dst_cpu;
-        cv::pyrDown(src, dst_cpu);
-        cv::ocl::pyrDown(gsrc, gdst);
+        Size size(MWIDTH, MHEIGHT);
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+        
+        pyrDown(src, dst_cpu);
+        pyrDown(gsrc, gdst);
 
         EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f);
     }
@@ -90,5 +90,27 @@ TEST_P(PyrDown, Mat)
 INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
                             Values(CV_8U, CV_32F), Values(1, 3, 4)));
 
+/////////////////////// PyrUp //////////////////////////
 
+struct PyrUp : PyrBase {}; // reuses the PyrBase fixture under its own test-case name
+
+TEST_P(PyrUp, Accuracy)
+{
+    // CV_32F results must agree to 1e-4; any other depth may differ by up to 1.
+    const double eps = (type == CV_32F) ? 1e-4f : 1.0;
+    const Size size(MWIDTH, MHEIGHT);
+    for(int iter = 0; iter < LOOP_TIMES; iter++)
+    {
+        // Fresh random input each iteration, upsampled on the CPU and on the device.
+        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        oclMat gsrc(src);
+        pyrUp(src, dst_cpu);
+        pyrUp(gsrc, gdst);
+        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), eps);
+    }
+}
+
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
+                            Values(CV_8U, CV_32F), Values(1, 3, 4))); // depths x channel counts, same grid as PyrDown
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp
index 9873a88553..d7f6732069 100644
--- a/modules/ocl/test/utility.cpp
+++ b/modules/ocl/test/utility.cpp
@@ -100,12 +100,6 @@ Mat randomMat(Size size, int type, double minVal, double maxVal)
     return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
 }
 
-
-
-
-
-
-
 /*
 void showDiff(InputArray gold_, InputArray actual_, double eps)
 {
@@ -137,58 +131,7 @@ void showDiff(InputArray gold_, InputArray actual_, double eps)
 }
 */
 
-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}
 
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/
 
 vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
 {
@@ -264,3 +207,48 @@ void PrintTo(const Inverse &inverse, std::ostream *os)
         (*os) << "direct";
 }
 
+// Measures how well the rectangles in ob2 cover those in ob1 on a canvas of size sz.
+// Different rectangle counts: returns the absolute count difference. Both empty: 0.
+// Otherwise returns 1 - (pixels covered by both sets)/(pixels covered by ob1), so 0
+// means ob2 covers all of ob1 and 1 means no overlap. The rectangles are used as
+// ROIs of an sz-sized Mat and are therefore expected to lie inside it.
+double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+{
+    const size_t sz1 = ob1.size();
+    const size_t sz2 = ob2.size();
+    if(sz1 != sz2)
+        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    if(sz1 == 0)
+        return 0;
+
+    // Rasterize the first rectangle set into a 0/1 mask.
+    cv::Mat cpu_result(sz, CV_8UC1);
+    cpu_result.setTo(0);
+    for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+    {
+        cv::Mat cpu_result_roi(cpu_result, *r);
+        cpu_result_roi.setTo(1);
+    }
+    const int cpu_area = cv::countNonZero(cpu_result > 0);
+
+    // Rasterize the second rectangle set the same way.
+    cv::Mat gpu_result(sz, CV_8UC1);
+    gpu_result.setTo(0);
+    for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
+    {
+        cv::Mat gpu_result_roi(gpu_result, *r2);
+        gpu_result_roi.setTo(1);
+    }
+
+    // The product of the two masks is non-zero exactly where both sets cover a pixel.
+    cv::Mat result_;
+    multiply(cpu_result, gpu_result, result_);
+    const int result = cv::countNonZero(result_ > 0);
+
+    // ob1 covers no pixels (only empty rectangles): report 0.
+    if(cpu_area == 0)
+        return 0.0;
+    // Uncovered share of ob1: 0 when ob2 covers all of it, 1 when the sets are disjoint.
+    return 1.0 - (double)result/(double)cpu_area;
+}
+
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp
index 9eb48a0ef6..36e9b9a547 100644
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -57,13 +57,12 @@ cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal =
 
 void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
 
-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
+// Compares two sets of rectangles (callers pass CPU detections as ob1 and OpenCL detections as ob2) on an sz-sized canvas.
+// If the two vectors differ in length, returns the absolute difference of their lengths.
+// Otherwise, when the sets overlap, returns 1 - (pixels covered by both sets)/(pixels covered by ob1).
+// The smaller the value, the better the match.
+double checkRectSimilarity(cv::Size sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);
 
-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
 
 //! read image from testdata folder.
 cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
diff --git a/modules/python/CMakeLists.txt b/modules/python/CMakeLists.txt
index 119c8e1bd0..0b4c59d636 100644
--- a/modules/python/CMakeLists.txt
+++ b/modules/python/CMakeLists.txt
@@ -67,7 +67,7 @@ else()
 endif()
 target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS})
 
-execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import distutils.sysconfig; print distutils.sysconfig.get_config_var('SO')"
+execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import distutils.sysconfig; print(distutils.sysconfig.get_config_var('SO'))" # print(...) form is accepted by both Python 2 and Python 3
                 RESULT_VARIABLE PYTHON_CVPY_PROCESS
                 OUTPUT_VARIABLE CVPY_SUFFIX
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index c834b1f322..e68da59cf8 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -23,6 +23,8 @@
 #  include "opencv2/nonfree.hpp"
 #endif
 
+#include "pycompat.hpp"
+
 using cv::flann::IndexParams;
 using cv::flann::SearchParams;
 
@@ -1176,7 +1178,11 @@ static int convert_to_char(PyObject *o, char *dst, const char *name = "no_name")
   }
 }
 
+#if PY_MAJOR_VERSION >= 3 // PyInit_cv2() returns PyObject*, so a failed type registration must return NULL
+#define MKTYPE2(NAME) pyopencv_##NAME##_specials(); if (!to_ok(&pyopencv_##NAME##_Type)) return NULL;
+#else // initcv2() returns void, so a bare return is used
 #define MKTYPE2(NAME) pyopencv_##NAME##_specials(); if (!to_ok(&pyopencv_##NAME##_Type)) return
+#endif
 
 #ifdef __GNUC__
 #  pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -1190,7 +1196,7 @@ static PyMethodDef methods[] = {
 
 #include "pyopencv_generated_func_tab.h"
   {"createTrackbar", pycvCreateTrackbar, METH_VARARGS, "createTrackbar(trackbarName, windowName, value, count, onChange) -> None"},
-  {"setMouseCallback", (PyCFunction)pycvSetMouseCallback, METH_KEYWORDS, "setMouseCallback(windowName, onMouse [, param]) -> None"},
+  {"setMouseCallback", (PyCFunction)pycvSetMouseCallback, METH_VARARGS | METH_KEYWORDS, "setMouseCallback(windowName, onMouse [, param]) -> None"},
   {NULL, NULL},
 };
 
@@ -1205,15 +1211,35 @@ static int to_ok(PyTypeObject *to)
   return (PyType_Ready(to) == 0);
 }
 
+
+#if PY_MAJOR_VERSION >= 3
+extern "C" CV_EXPORTS PyObject* PyInit_cv2();
+static struct PyModuleDef cv2_moduledef =
+{
+    PyModuleDef_HEAD_INIT,
+    MODULESTR, // m_name: module name
+    "Python wrapper for OpenCV.", // m_doc: module docstring
+    -1,     /* size of per-interpreter state of the module,
+               or -1 if the module keeps state in global variables. */
+    methods // m_methods: the PyMethodDef table defined above
+};
+
+PyObject* PyInit_cv2()
+#else
 extern "C" CV_EXPORTS void initcv2();
 
 void initcv2()
+#endif
 {
   import_array();
 
 #include "pyopencv_generated_type_reg.h"
 
+#if PY_MAJOR_VERSION >= 3
+  PyObject* m = PyModule_Create(&cv2_moduledef);
+#else
   PyObject* m = Py_InitModule(MODULESTR, methods);
+#endif
   PyObject* d = PyModule_GetDict(m);
 
   PyDict_SetItemString(d, "__version__", PyString_FromString(CV_VERSION));
@@ -1262,5 +1288,7 @@ void initcv2()
   PUBLISH(CV_64FC4);
 
 #include "pyopencv_generated_const_reg.h"
-
+#if PY_MAJOR_VERSION >= 3
+    return m;
+#endif
 }
diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index 0fed1838b0..816a386c02 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python
 
-import hdr_parser, sys, re, os, cStringIO
+from __future__ import print_function
+import hdr_parser, sys, re, os
 from string import Template
 
+if sys.version_info[0] >= 3:
+    from io import StringIO
+else:
+    from cStringIO import StringIO
+
 ignored_arg_types = ["RNG*"]
 
 gen_template_check_self = Template("""    if(!PyObject_TypeCheck(self, &pyopencv_${name}_Type))
@@ -33,6 +39,13 @@ gen_template_func_body = Template("""$code_decl
     }
 """)
 
+py_major_version = sys.version_info[0]  # major version of the interpreter running this generator
+if py_major_version >= 3:
+    head_init_str = "PyVarObject_HEAD_INIT(&PyType_Type, 0)"  # spliced into the PyTypeObject initializers via '% head_init_str'
+else:
+    head_init_str = """PyObject_HEAD_INIT(&PyType_Type)
+0,"""
+
 gen_template_simple_type_decl = Template("""
 struct pyopencv_${name}_t
 {
@@ -42,8 +55,7 @@ struct pyopencv_${name}_t
 
 static PyTypeObject pyopencv_${name}_Type =
 {
-    PyObject_HEAD_INIT(&PyType_Type)
-    0,
+    %s
     MODULESTR".$wname",
     sizeof(pyopencv_${name}_t),
 };
@@ -66,13 +78,13 @@ template<> bool pyopencv_to(PyObject* src, ${cname}& dst, const char* name)
         return true;
     if(!PyObject_TypeCheck(src, &pyopencv_${name}_Type))
     {
-        failmsg("Expected ${cname} for argument '%s'", name);
+        failmsg("Expected ${cname} for argument '%%s'", name);
         return false;
     }
     dst = ((pyopencv_${name}_t*)src)->v;
     return true;
 }
-""")
+""" % head_init_str)
 
 
 gen_template_type_decl = Template("""
@@ -84,8 +96,7 @@ struct pyopencv_${name}_t
 
 static PyTypeObject pyopencv_${name}_Type =
 {
-    PyObject_HEAD_INIT(&PyType_Type)
-    0,
+    %s
     MODULESTR".$wname",
     sizeof(pyopencv_${name}_t),
 };
@@ -110,14 +121,14 @@ template<> bool pyopencv_to(PyObject* src, Ptr<${cname}>& dst, const char* name)
         return true;
     if(!PyObject_TypeCheck(src, &pyopencv_${name}_Type))
     {
-        failmsg("Expected ${cname} for argument '%s'", name);
+        failmsg("Expected ${cname} for argument '%%s'", name);
         return false;
     }
     dst = ((pyopencv_${name}_t*)src)->v;
     return true;
 }
 
-""")
+""" % head_init_str)
 
 gen_template_map_type_cvt = Template("""
 template<> bool pyopencv_to(PyObject* src, ${cname}& dst, const char* name);
@@ -245,9 +256,9 @@ class ClassInfo(object):
         if decl:
             self.bases = decl[1].split()[1:]
             if len(self.bases) > 1:
-                print "Note: Class %s has more than 1 base class (not supported by Python C extensions)" % (self.name,)
-                print "      Bases: ", " ".join(self.bases)
-                print "      Only the first base class will be used"
+                print("Note: Class %s has more than 1 base class (not supported by Python C extensions)" % (self.name,))
+                print("      Bases: ", " ".join(self.bases))
+                print("      Only the first base class will be used")
                 self.bases = [self.bases[0].strip(",")]
                 #return sys.exit(-1)
             if self.bases and self.bases[0].startswith("cv::"):
@@ -280,8 +291,8 @@ class ClassInfo(object):
         if self.ismap:
             return self.gen_map_code(all_classes)
 
-        getset_code = cStringIO.StringIO()
-        getset_inits = cStringIO.StringIO()
+        getset_code = StringIO()
+        getset_inits = StringIO()
 
         sorted_props = [(p.name, p) for p in self.props]
         sorted_props.sort()
@@ -304,10 +315,10 @@ class ClassInfo(object):
                     getset_code.write(gen_template_set_prop.substitute(name=self.name, member=pname, membertype=p.tp, access=access_op))
                 getset_inits.write(gen_template_rw_prop_init.substitute(name=self.name, member=pname))
 
-        methods_code = cStringIO.StringIO()
-        methods_inits = cStringIO.StringIO()
+        methods_code = StringIO()
+        methods_inits = StringIO()
 
-        sorted_methods = self.methods.items()
+        sorted_methods = list(self.methods.items())
         sorted_methods.sort()
 
         for mname, m in sorted_methods:
@@ -315,7 +326,7 @@ class ClassInfo(object):
             methods_inits.write(m.get_tab_entry())
 
         baseptr = "NULL"
-        if self.bases and all_classes.has_key(self.bases[0]):
+        if self.bases and self.bases[0] in all_classes:
             baseptr = "&pyopencv_" + all_classes[self.bases[0]].name + "_Type"
 
         code = gen_template_type_impl.substitute(name=self.name, wname=self.wname, cname=self.cname,
@@ -532,7 +543,7 @@ class FuncInfo(object):
             p2 = s.rfind(")")
             docstring_list = [s[:p1+1] + "[" + s[p1+1:p2] + "]" + s[p2:]]
 
-        return Template('    {"$py_funcname", (PyCFunction)$wrap_funcname, METH_KEYWORDS, "$py_docstring"},\n'
+        return Template('    {"$py_funcname", (PyCFunction)$wrap_funcname, METH_VARARGS | METH_KEYWORDS, "$py_docstring"},\n'
                         ).substitute(py_funcname = self.variants[0].wname, wrap_funcname=self.get_wrapper_name(),
                                      py_docstring = "  or  ".join(docstring_list))
 
@@ -609,7 +620,7 @@ class FuncInfo(object):
                         defval0 = "0"
                         tp1 = tp.replace("*", "_ptr")
                 if tp1.endswith("*"):
-                    print "Error: type with star: a.tp=%s, tp=%s, tp1=%s" % (a.tp, tp, tp1)
+                    print("Error: type with star: a.tp=%s, tp=%s, tp1=%s" % (a.tp, tp, tp1))
                     sys.exit(-1)
 
                 amapping = simple_argtype_mapping.get(tp, (tp, "O", defval0))
@@ -715,11 +726,11 @@ class PythonWrapperGenerator(object):
         self.classes = {}
         self.funcs = {}
         self.consts = {}
-        self.code_types = cStringIO.StringIO()
-        self.code_funcs = cStringIO.StringIO()
-        self.code_func_tab = cStringIO.StringIO()
-        self.code_type_reg = cStringIO.StringIO()
-        self.code_const_reg = cStringIO.StringIO()
+        self.code_types = StringIO()
+        self.code_funcs = StringIO()
+        self.code_func_tab = StringIO()
+        self.code_type_reg = StringIO()
+        self.code_const_reg = StringIO()
         self.class_idx = 0
 
     def add_class(self, stype, name, decl):
@@ -727,9 +738,9 @@ class PythonWrapperGenerator(object):
         classinfo.decl_idx = self.class_idx
         self.class_idx += 1
 
-        if self.classes.has_key(classinfo.name):
-            print "Generator error: class %s (cname=%s) already exists" \
-                % (classinfo.name, classinfo.cname)
+        if classinfo.name in self.classes:
+            print("Generator error: class %s (cname=%s) already exists" \
+                % (classinfo.name, classinfo.cname))
             sys.exit(-1)
         self.classes[classinfo.name] = classinfo
         if classinfo.bases and not classinfo.isalgorithm:
@@ -738,9 +749,9 @@ class PythonWrapperGenerator(object):
     def add_const(self, name, decl):
         constinfo = ConstInfo(name, decl[1])
 
-        if self.consts.has_key(constinfo.name):
-            print "Generator error: constant %s (cname=%s) already exists" \
-                % (constinfo.name, constinfo.cname)
+        if constinfo.name in self.consts:
+            print("Generator error: constant %s (cname=%s) already exists" \
+                % (constinfo.name, constinfo.cname))
             sys.exit(-1)
         self.consts[constinfo.name] = constinfo
 
@@ -779,7 +790,7 @@ class PythonWrapperGenerator(object):
         else:
             classinfo = self.classes.get(classname, ClassInfo(""))
             if not classinfo.name:
-                print "Generator error: the class for method %s is missing" % (name,)
+                print("Generator error: the class for method %s is missing" % (name,))
                 sys.exit(-1)
             func_map = classinfo.methods
 
@@ -819,7 +830,7 @@ class PythonWrapperGenerator(object):
                     self.add_func(decl)
 
         # step 2: generate code for the classes and their methods
-        classlist = self.classes.items()
+        classlist = list(self.classes.items())
         classlist.sort()
         for name, classinfo in classlist:
             if classinfo.ismap:
@@ -844,7 +855,7 @@ class PythonWrapperGenerator(object):
                 self.code_type_reg.write("MKTYPE2(%s);\n" % (classinfo.name,) )
 
         # step 3: generate the code for all the global functions
-        funclist = self.funcs.items()
+        funclist = list(self.funcs.items())
         funclist.sort()
         for name, func in funclist:
             code = func.gen_code(self.classes)
@@ -852,7 +863,7 @@ class PythonWrapperGenerator(object):
             self.code_func_tab.write(func.get_tab_entry())
 
         # step 4: generate the code for constants
-        constlist = self.consts.items()
+        constlist = list(self.consts.items())
         constlist.sort()
         for name, constinfo in constlist:
             self.gen_const_reg(constinfo)
diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py
index b13fe8cf8d..b6f21c31eb 100755
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import os, sys, re, string
 
 # the list only for debugging. The real list, used in the real OpenCV build, is specified in CMakeLists.txt
@@ -43,13 +44,13 @@ class CppHeaderParser(object):
     def get_macro_arg(self, arg_str, npos):
         npos2 = npos3 = arg_str.find("(", npos)
         if npos2 < 0:
-            print "Error: no arguments for the macro at %d" % (self.lineno,)
+            print("Error: no arguments for the macro at %d" % (self.lineno,))
             sys.exit(-1)
         balance = 1
         while 1:
             t, npos3 = self.find_next_token(arg_str, ['(', ')'], npos3+1)
             if npos3 < 0:
-                print "Error: no matching ')' in the macro call at %d" % (self.lineno,)
+                print("Error: no matching ')' in the macro call at %d" % (self.lineno,))
                 sys.exit(-1)
             if t == '(':
                 balance += 1
@@ -143,13 +144,13 @@ class CppHeaderParser(object):
                 angle_stack.append(0)
             elif w == "," or w == '>':
                 if not angle_stack:
-                    print "Error at %d: argument contains ',' or '>' not within template arguments" % (self.lineno,)
+                    print("Error at %d: argument contains ',' or '>' not within template arguments" % (self.lineno,))
                     sys.exit(-1)
                 if w == ",":
                     arg_type += "_and_"
                 elif w == ">":
                     if angle_stack[0] == 0:
-                        print "Error at %s:%d: template has no arguments" % (self.hname, self.lineno)
+                        print("Error at %s:%d: template has no arguments" % (self.hname, self.lineno))
                         sys.exit(-1)
                     if angle_stack[0] > 1:
                         arg_type += "_end_"
@@ -173,7 +174,7 @@ class CppHeaderParser(object):
             p1 = arg_name.find("[")
             p2 = arg_name.find("]",p1+1)
             if p2 < 0:
-                print "Error at %d: no closing ]" % (self.lineno,)
+                print("Error at %d: no closing ]" % (self.lineno,))
                 sys.exit(-1)
             counter_str = arg_name[p1+1:p2].strip()
             if counter_str == "":
@@ -358,7 +359,7 @@ class CppHeaderParser(object):
         if bool(re.match(r".*\)\s*const(\s*=\s*0)?", decl_str)):
             decl[2].append("/C")
         if "virtual" in decl_str:
-            print decl_str
+            print(decl_str)
         return decl
 
     def parse_func_decl(self, decl_str):
@@ -412,12 +413,12 @@ class CppHeaderParser(object):
         if decl_str.startswith("CVAPI"):
             rtype_end = decl_str.find(")", args_begin+1)
             if rtype_end < 0:
-                print "Error at %d. no terminating ) in CVAPI() macro: %s" % (self.lineno, decl_str)
+                print("Error at %d. no terminating ) in CVAPI() macro: %s" % (self.lineno, decl_str))
                 sys.exit(-1)
             decl_str = decl_str[args_begin+1:rtype_end] + " " + decl_str[rtype_end+1:]
             args_begin = decl_str.find("(")
         if args_begin < 0:
-            print "Error at %d: no args in '%s'" % (self.lineno, decl_str)
+            print("Error at %d: no args in '%s'" % (self.lineno, decl_str))
             sys.exit(-1)
 
         decl_start = decl_str[:args_begin].strip()
@@ -425,7 +426,7 @@ class CppHeaderParser(object):
         if decl_start.endswith("operator"):
             args_begin = decl_str.find("(", args_begin+1)
             if args_begin < 0:
-                print "Error at %d: no args in '%s'" % (self.lineno, decl_str)
+                print("Error at %d: no args in '%s'" % (self.lineno, decl_str))
                 sys.exit(-1)
             decl_start = decl_str[:args_begin].strip()
             # TODO: normalize all type of operators
@@ -455,7 +456,7 @@ class CppHeaderParser(object):
                     return [] # exotic - dynamic 2d array
                 else:
                     #print rettype, funcname, modlist, argno
-                    print "Error at %s:%d the function/method name is missing: '%s'" % (self.hname, self.lineno, decl_start)
+                    print("Error at %s:%d the function/method name is missing: '%s'" % (self.hname, self.lineno, decl_start))
                     sys.exit(-1)
 
         if self.wrap_mode and (("::" in funcname) or funcname.startswith("~")):
@@ -486,9 +487,9 @@ class CppHeaderParser(object):
             npos += 1
             t, npos = self.find_next_token(decl_str, ["(", ")", ",", "<", ">"], npos)
             if not t:
-                print "Error: no closing ')' at %d" % (self.lineno,)
-                print decl_str
-                print decl_str[arg_start:]
+                print("Error: no closing ')' at %d" % (self.lineno,))
+                print(decl_str)
+                print(decl_str[arg_start:])
                 sys.exit(-1)
             if t == "<":
                 angle_balance += 1
@@ -583,7 +584,7 @@ class CppHeaderParser(object):
             if block_type in ["file", "enum"]:
                 continue
             if block_type not in ["struct", "class", "namespace"]:
-                print "Error at %d: there are non-valid entries in the current block stack " % (self.lineno, self.block_stack)
+                print("Error at %d: there are non-valid entries in the current block stack %s" % (self.lineno, self.block_stack))
                 sys.exit(-1)
             if block_name:
                 n += block_name + "."
@@ -605,7 +606,7 @@ class CppHeaderParser(object):
             stmt_type = "block"
 
         if context == "block":
-            print "Error at %d: should not call parse_stmt inside blocks" % (self.lineno,)
+            print("Error at %d: should not call parse_stmt inside blocks" % (self.lineno,))
             sys.exit(-1)
 
         if context == "class" or context == "struct":
@@ -632,7 +633,7 @@ class CppHeaderParser(object):
                 try:
                     classname, bases, modlist = self.parse_class_decl(stmt[len("typedef "):])
                 except:
-                    print "Error at %s:%d" % (self.hname, self.lineno)
+                    print("Error at %s:%d" % (self.hname, self.lineno))
                     exit(1)
                 if classname.startswith("_Ipl"):
                     classname = classname[1:]
@@ -647,7 +648,7 @@ class CppHeaderParser(object):
                     try:
                         classname, bases, modlist = self.parse_class_decl(stmt)
                     except:
-                        print "Error at %s:%d" % (self.hname, self.lineno)
+                        print("Error at %s:%d" % (self.hname, self.lineno))
                         exit(1)
                     decl = []
                     if ("CV_EXPORTS_W" in stmt) or ("CV_EXPORTS_AS" in stmt) or (not self.wrap_mode):# and ("CV_EXPORTS" in stmt)):
@@ -767,7 +768,7 @@ class CppHeaderParser(object):
                 state = SCAN
 
             if state != SCAN:
-                print "Error at %d: invlid state = %d" % (self.lineno, state)
+                print("Error at %d: invlid state = %d" % (self.lineno, state))
                 sys.exit(-1)
 
             while 1:
@@ -795,7 +796,7 @@ class CppHeaderParser(object):
                     while 1:
                         t2, pos2 = self.find_next_token(l, ["\\", "\""], pos2)
                         if t2 == "":
-                            print "Error at %d: no terminating '\"'" % (self.lineno,)
+                            print("Error at %d: no terminating '\"'" % (self.lineno,))
                             sys.exit(-1)
                         if t2 == "\"":
                             break
@@ -836,7 +837,7 @@ class CppHeaderParser(object):
 
                 if token == "}":
                     if not self.block_stack:
-                        print "Error at %d: the block stack is empty" % (self.lineno,)
+                        print("Error at %d: the block stack is empty" % (self.lineno,))
                     self.block_stack[-1:] = []
                     if pos+1 < len(l) and l[pos+1] == ';':
                         pos += 1
@@ -851,13 +852,13 @@ class CppHeaderParser(object):
         Prints the list of declarations, retrieived by the parse() method
         """
         for d in decls:
-            print d[0], d[1], ";".join(d[2])
+            print(d[0], d[1], ";".join(d[2]))
             for a in d[3]:
-                print "   ", a[0], a[1], a[2],
+                print("   ", a[0], a[1], a[2], end="")
                 if a[3]:
-                    print "; ".join(a[3])
+                    print(" " + "; ".join(a[3]))  # leading space: the previous print() ended with end=""
                 else:
-                    print
+                    print()
 
 if __name__ == '__main__':
     parser = CppHeaderParser()
@@ -867,4 +868,4 @@ if __name__ == '__main__':
     #for hname in sys.argv[1:]:
         #decls += parser.parse(hname, wmode=False)
     parser.print_decls(decls)
-    print len(decls)
+    print(len(decls))
diff --git a/modules/ocl/test/test_pyrup.cpp b/modules/python/src2/pycompat.hpp
similarity index 62%
rename from modules/ocl/test/test_pyrup.cpp
rename to modules/python/src2/pycompat.hpp
index 3c3c6ef47f..c473fffb20 100644
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/python/src2/pycompat.hpp
@@ -10,14 +10,10 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//    Zhang Chunpeng chunpeng@multicorewareinc.com
-//    Yao Wang yao@multicorewareinc.com
-//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -26,7 +22,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
@@ -44,47 +40,25 @@
 //
 //M*/
 
-#include "precomp.hpp"
+// Defines for Python 2/3 compatibility.
+#ifndef __PYCOMPAT_HPP__
+#define __PYCOMPAT_HPP__
 
-#ifdef HAVE_OPENCL
+#if PY_MAJOR_VERSION >= 3
+// Python3 treats all ints as longs, PyInt_X functions have been removed.
+#define PyInt_Check PyLong_Check
+#define PyInt_CheckExact PyLong_CheckExact
+#define PyInt_AsLong PyLong_AsLong
+#define PyInt_AS_LONG PyLong_AS_LONG
+#define PyInt_FromLong PyLong_FromLong
+#define PyNumber_Int PyNumber_Long
 
-using namespace cv;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
+// Python3 strings are unicode, these defines mimic the Python2 functionality.
+#define PyString_Check PyUnicode_Check
+#define PyString_FromString PyUnicode_FromString
+#define PyString_AsString PyUnicode_AsUTF8
+#define PyString_FromStringAndSize PyUnicode_FromStringAndSize
+#define PyString_Size PyUnicode_GET_SIZE
+#endif
 
-PARAM_TEST_CASE(PyrUp, MatType, int)
-{
-    int type;
-    int channels;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-    }
-};
-
-TEST_P(PyrUp, Accuracy)
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-        Mat dst_gold;
-        pyrUp(src, dst_gold);
-        ocl::oclMat dst;
-        ocl::oclMat srcMat(src);
-        ocl::pyrUp(srcMat, dst);
-
-        EXPECT_MAT_NEAR(dst_gold, Mat(dst), (type == CV_32F ? 1e-4f : 1.0));
-    }
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
-                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
-
-
-#endif // HAVE_OPENCL
\ No newline at end of file
+#endif // END HEADER GUARD
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 99c864ae2a..e8b7bf24f9 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -64,10 +64,6 @@ struct DistIdxPair
 
 struct MatchPairsBody : ParallelLoopBody
 {
-    MatchPairsBody(const MatchPairsBody& other)
-            : matcher(other.matcher), features(other.features),
-              pairwise_matches(other.pairwise_matches), near_pairs(other.near_pairs) {}
-
     MatchPairsBody(FeaturesMatcher &_matcher, const std::vector<ImageFeatures> &_features,
                    std::vector<MatchesInfo> &_pairwise_matches, std::vector<std::pair<int,int> > &_near_pairs)
             : matcher(_matcher), features(_features),
diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp
index d7b64e1080..1bb3df7bbc 100644
--- a/modules/stitching/src/motion_estimators.cpp
+++ b/modules/stitching/src/motion_estimators.cpp
@@ -69,13 +69,13 @@ struct CalcRotation
         K_from(0,0) = cameras[edge.from].focal;
         K_from(1,1) = cameras[edge.from].focal * cameras[edge.from].aspect;
         K_from(0,2) = cameras[edge.from].ppx;
-        K_from(0,2) = cameras[edge.from].ppy;
+        K_from(1,2) = cameras[edge.from].ppy;
 
         Mat_<double> K_to = Mat::eye(3, 3, CV_64F);
         K_to(0,0) = cameras[edge.to].focal;
         K_to(1,1) = cameras[edge.to].focal * cameras[edge.to].aspect;
         K_to(0,2) = cameras[edge.to].ppx;
-        K_to(0,2) = cameras[edge.to].ppy;
+        K_to(1,2) = cameras[edge.to].ppy;
 
         Mat R = K_from.inv() * pairwise_matches[pair_idx].H.inv() * K_to;
         cameras[edge.to].R = cameras[edge.from].R * R;
diff --git a/modules/superres/src/btv_l1_gpu.cpp b/modules/superres/src/btv_l1_gpu.cpp
index 6813187c45..7b2ad73700 100644
--- a/modules/superres/src/btv_l1_gpu.cpp
+++ b/modules/superres/src/btv_l1_gpu.cpp
@@ -230,7 +230,7 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
-        std::vector<Ptr<FilterEngine_GPU> > filters_;
+        std::vector<Ptr<gpu::Filter> > filters_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
@@ -299,7 +299,7 @@ namespace
         {
             filters_.resize(src.size());
             for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                filters_[i] = gpu::createGaussianFilter(src[0].type(), -1, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
             curBlurKernelSize_ = blurKernelSize_;
             curBlurSigma_ = blurSigma_;
             curSrcType_ = src[0].type();
@@ -346,7 +346,7 @@ namespace
                 // a = M * Ih
                 gpu::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
                 // b = HM * Ih
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // c = DHF * Ih
                 gpu::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST, streams_[k]);
 
@@ -355,7 +355,7 @@ namespace
                 // a = Dt * diff
                 upscale(c_[k], a_[k], scale_, streams_[k]);
                 // b = HtDt * diff
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // diffTerm = MtHtDt * diff
                 gpu::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
             }
diff --git a/modules/superres/src/frame_source.cpp b/modules/superres/src/frame_source.cpp
index cba2b14ea3..7da817cfac 100644
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@@ -210,7 +210,7 @@ namespace
 
     private:
         String fileName_;
-        VideoReader_GPU reader_;
+        Ptr<gpucodec::VideoReader> reader_;
         GpuMat frame_;
     };
 
@@ -223,13 +223,13 @@ namespace
     {
         if (_frame.kind() == _InputArray::GPU_MAT)
         {
-            bool res = reader_.read(_frame.getGpuMatRef());
+            bool res = reader_->nextFrame(_frame.getGpuMatRef());
             if (!res)
                 _frame.release();
         }
         else
         {
-            bool res = reader_.read(frame_);
+            bool res = reader_->nextFrame(frame_);
             if (!res)
                 _frame.release();
             else
@@ -239,9 +239,7 @@ namespace
 
     void VideoFrameSource_GPU::reset()
     {
-        reader_.close();
-        reader_.open(fileName_);
-        CV_Assert( reader_.isOpened() );
+        reader_ = gpucodec::createVideoReader(fileName_);
     }
 }
 
diff --git a/modules/ts/misc/testlog_parser.py b/modules/ts/misc/testlog_parser.py
index 7ae6aa5980..8ab21417ca 100755
--- a/modules/ts/misc/testlog_parser.py
+++ b/modules/ts/misc/testlog_parser.py
@@ -100,34 +100,39 @@ class TestInfo(object):
     def dump(self, units="ms"):
         print "%s ->\t\033[1;31m%s\033[0m = \t%.2f%s" % (str(self), self.status, self.get("gmean", units), units)
 
-    def shortName(self):
+
+    def getName(self):
         pos = self.name.find("/")
         if pos > 0:
-            name = self.name[:pos]
-        else:
-            name = self.name
-        if self.fixture.endswith(name):
-            fixture = self.fixture[:-len(name)]
+            return self.name[:pos]
+        return self.name
+
+
+    def getFixture(self):
+        if self.fixture.endswith(self.getName()):
+            fixture = self.fixture[:-len(self.getName())]
         else:
             fixture = self.fixture
         if fixture.endswith("_"):
             fixture = fixture[:-1]
+        return fixture
+
+
+    def param(self):
+        return '::'.join(filter(None, [self.type_param, self.value_param]))
+
+    def shortName(self):
+        name = self.getName()
+        fixture = self.getFixture()
         return '::'.join(filter(None, [name, fixture]))
 
+
     def __str__(self):
-        pos = self.name.find("/")
-        if pos > 0:
-            name = self.name[:pos]
-        else:
-            name = self.name
-        if self.fixture.endswith(name):
-            fixture = self.fixture[:-len(name)]
-        else:
-            fixture = self.fixture
-        if fixture.endswith("_"):
-            fixture = fixture[:-1]
+        name = self.getName()
+        fixture = self.getFixture()
         return '::'.join(filter(None, [name, fixture, self.type_param, self.value_param]))
 
+
     def __cmp__(self, other):
         r = cmp(self.fixture, other.fixture);
         if r != 0:
diff --git a/modules/ts/misc/xls-report.py b/modules/ts/misc/xls-report.py
new file mode 100755
index 0000000000..e79bb123dd
--- /dev/null
+++ b/modules/ts/misc/xls-report.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+from __future__ import division
+
+import ast
+import logging
+import numbers
+import os, os.path
+import re
+
+from argparse import ArgumentParser
+from collections import OrderedDict
+from glob import glob
+from itertools import ifilter
+
+import xlwt
+
+from testlog_parser import parseLogFile
+
+# To build an XLS report you need to arrange your XML files (OpenCV test
+# output) in the following way:
+#
+# "root" --- folder representing the whole XLS document. It contains several
+# subfolders --- sheet-paths of the XLS document. Each sheet-path contains its
+# own subfolders --- config-paths. Config-paths are the columns of the sheet,
+# and they contain XML files --- the output of OpenCV module tests.
+# A config-path corresponds to an OpenCV build configuration, including
+# options such as NEON, TBB, or GPU being enabled or disabled.
+#
+# root
+# root\sheet_path
+# root\sheet_path\configuration1 (column 1)
+# root\sheet_path\configuration2 (column 2)
+
+re_image_size = re.compile(r'^ \d+ x \d+$', re.VERBOSE)
+re_data_type = re.compile(r'^ (?: 8 | 16 | 32 | 64 ) [USF] C [1234] $', re.VERBOSE)
+
+time_style = xlwt.easyxf(num_format_str='#0.00')
+no_time_style = xlwt.easyxf('pattern: pattern solid, fore_color gray25')
+
+speedup_style = time_style
+good_speedup_style = xlwt.easyxf('font: color green', num_format_str='#0.00')
+bad_speedup_style = xlwt.easyxf('font: color red', num_format_str='#0.00')
+no_speedup_style = no_time_style
+error_speedup_style = xlwt.easyxf('pattern: pattern solid, fore_color orange')
+header_style = xlwt.easyxf('font: bold true; alignment: horizontal centre, vertical top, wrap True')
+
+def collect_xml(collection, configuration, xml_fullname):
+    xml_fname = os.path.split(xml_fullname)[1]
+    module = xml_fname[:xml_fname.index('_')]
+
+    module_tests = collection.setdefault(module, OrderedDict())
+
+    for test in sorted(parseLogFile(xml_fullname)):
+        test_results = module_tests.setdefault((test.shortName(), test.param()), {})
+        test_results[configuration] = test.get("gmean") if test.status == 'run' else test.status
+
+def main():
+    arg_parser = ArgumentParser(description='Build an XLS performance report.')
+    arg_parser.add_argument('sheet_dirs', nargs='+', metavar='DIR', help='directory containing perf test logs')
+    arg_parser.add_argument('-o', '--output', metavar='XLS', default='report.xls', help='name of output file')
+    arg_parser.add_argument('-c', '--config', metavar='CONF', help='global configuration file')
+
+    args = arg_parser.parse_args()
+
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
+
+    if args.config is not None:
+        with open(args.config) as global_conf_file:
+            global_conf = ast.literal_eval(global_conf_file.read())
+    else:
+        global_conf = {}
+
+    wb = xlwt.Workbook()
+
+    for sheet_path in args.sheet_dirs:
+        try:
+            with open(os.path.join(sheet_path, 'sheet.conf')) as sheet_conf_file:
+                sheet_conf = ast.literal_eval(sheet_conf_file.read())
+        except Exception:
+            sheet_conf = {}
+            logging.debug('no sheet.conf for %s', sheet_path)
+
+        sheet_conf = dict(global_conf.items() + sheet_conf.items())
+
+        if 'configurations' in sheet_conf:
+            config_names = sheet_conf['configurations']
+        else:
+            try:
+                config_names = [p for p in os.listdir(sheet_path)
+                    if os.path.isdir(os.path.join(sheet_path, p))]
+            except Exception as e:
+                logging.warning('error while determining configuration names for %s: %s', sheet_path, e)
+                continue
+
+        collection = {}
+
+        for configuration, configuration_path in \
+                [(c, os.path.join(sheet_path, c))  for c in config_names]:
+            logging.info('processing %s', configuration_path)
+            for xml_fullname in glob(os.path.join(configuration_path, '*.xml')):
+                collect_xml(collection, configuration, xml_fullname)
+
+        sheet = wb.add_sheet(sheet_conf.get('sheet_name', os.path.basename(os.path.abspath(sheet_path))))
+
+        sheet.row(0).height = 800
+        sheet.panes_frozen = True
+        sheet.remove_splits = True
+        sheet.horz_split_pos = 1
+        sheet.horz_split_first_visible = 1
+
+        sheet_comparisons = sheet_conf.get('comparisons', [])
+
+        for i, w in enumerate([2000, 15000, 2500, 2000, 15000]
+                + (len(config_names) + 1 + len(sheet_comparisons)) * [3000]):
+            sheet.col(i).width = w
+
+        for i, caption in enumerate(['Module', 'Test', 'Image\nsize', 'Data\ntype', 'Parameters']
+                + config_names + [None]
+                + [comp['to'] + '\nvs\n' + comp['from'] for comp in sheet_comparisons]):
+            sheet.row(0).write(i, caption, header_style)
+
+        row = 1
+
+        module_colors = sheet_conf.get('module_colors', {})
+        module_styles = {module: xlwt.easyxf('pattern: pattern solid, fore_color {}'.format(color))
+                         for module, color in module_colors.iteritems()}
+
+        for module, tests in sorted(collection.iteritems()):
+            for ((test, param), configs) in tests.iteritems():
+                sheet.write(row, 0, module, module_styles.get(module, xlwt.Style.default_style))
+                sheet.write(row, 1, test)
+
+                param_list = param[1:-1].split(", ")
+                sheet.write(row, 2, next(ifilter(re_image_size.match, param_list), None))
+                sheet.write(row, 3, next(ifilter(re_data_type.match, param_list), None))
+
+                sheet.row(row).write(4, param)
+                for i, c in enumerate(config_names):
+                    if c in configs:
+                        sheet.write(row, 5 + i, configs[c], time_style)
+                    else:
+                        sheet.write(row, 5 + i, None, no_time_style)
+
+                for i, comp in enumerate(sheet_comparisons):
+                    cmp_from = configs.get(comp["from"])
+                    cmp_to = configs.get(comp["to"])
+                    col = 5 + len(config_names) + 1 + i
+
+                    if isinstance(cmp_from, numbers.Number) and isinstance(cmp_to, numbers.Number):
+                        try:
+                            speedup = cmp_from / cmp_to
+                            sheet.write(row, col, speedup, good_speedup_style if speedup > 1.1 else
+                                                           bad_speedup_style  if speedup < 0.9 else
+                                                           speedup_style)
+                        except ArithmeticError as e:
+                            sheet.write(row, col, None, error_speedup_style)
+                    else:
+                        sheet.write(row, col, None, no_speedup_style)
+
+                row += 1
+                if row % 1000 == 0: sheet.flush_row_data()
+
+    wb.save(args.output)
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/ts/src/gpu_perf.cpp b/modules/ts/src/gpu_perf.cpp
index dca1814680..2bca535c46 100644
--- a/modules/ts/src/gpu_perf.cpp
+++ b/modules/ts/src/gpu_perf.cpp
@@ -288,7 +288,7 @@ namespace perf
 
             printf("[----------]\n"), fflush(stdout);
             printf("[ DEVICE   ] \t# %d %s.\n", i, info.name()), fflush(stdout);
-            printf("[          ] \tCompute capability: %d.%d\n", (int)info.major(), (int)info.minor()), fflush(stdout);
+            printf("[          ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
             printf("[          ] \tMulti Processor Count:  %d\n", info.multiProcessorCount()), fflush(stdout);
             printf("[          ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
             printf("[          ] \tFree  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory()  / 1024.0) / 1024.0)), fflush(stdout);
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 0f3751e52f..3b1c7cac29 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -2,6 +2,10 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef HAVE_TEGRA_OPTIMIZATION
+#include "tegra.hpp"
+#endif
+
 using namespace cv;
 
 namespace cvtest
@@ -2939,28 +2943,76 @@ MatComparator::operator()(const char* expr1, const char* expr2,
 
 void printVersionInfo(bool useStdOut)
 {
-    ::testing::Test::RecordProperty("CV_VERSION", CV_VERSION);
+    ::testing::Test::RecordProperty("cv_version", CV_VERSION);
     if(useStdOut) std::cout << "OpenCV version: " << CV_VERSION << std::endl;
 
     std::string buildInfo( cv::getBuildInformation() );
 
     size_t pos1 = buildInfo.find("Version control");
-    size_t pos2 = buildInfo.find("\n", pos1);\
+    size_t pos2 = buildInfo.find('\n', pos1);
     if(pos1 != std::string::npos && pos2 != std::string::npos)
     {
-        std::string ver( buildInfo.substr(pos1, pos2-pos1) );
-        ::testing::Test::RecordProperty("Version_control", ver);
-        if(useStdOut) std::cout << ver << std::endl;
+        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
+        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
+        ::testing::Test::RecordProperty("cv_vcs_version", ver);
+        if (useStdOut) std::cout << "OpenCV VCS version: " << ver << std::endl;
     }
 
     pos1 = buildInfo.find("inner version");
-    pos2 = buildInfo.find("\n", pos1);\
+    pos2 = buildInfo.find('\n', pos1);
     if(pos1 != std::string::npos && pos2 != std::string::npos)
     {
-        std::string ver( buildInfo.substr(pos1, pos2-pos1) );
-        ::testing::Test::RecordProperty("inner_version", ver);
-        if(useStdOut) std::cout << ver << std::endl;
+        size_t value_start = buildInfo.rfind(' ', pos2) + 1;
+        std::string ver( buildInfo.substr(value_start, pos2 - value_start) );
+        ::testing::Test::RecordProperty("cv_inner_vcs_version", ver);
+        if(useStdOut) std::cout << "Inner VCS version: " << ver << std::endl;
     }
+
+#ifdef CV_PARALLEL_FRAMEWORK
+    ::testing::Test::RecordProperty("cv_parallel_framework", CV_PARALLEL_FRAMEWORK);
+    if (useStdOut)
+    {
+        std::cout << "Parallel framework: " << CV_PARALLEL_FRAMEWORK << std::endl;
+    }
+#endif
+
+    std::string cpu_features;
+
+#if CV_SSE
+    if (checkHardwareSupport(CV_CPU_SSE)) cpu_features += " sse";
+#endif
+#if CV_SSE2
+    if (checkHardwareSupport(CV_CPU_SSE2)) cpu_features += " sse2";
+#endif
+#if CV_SSE3
+    if (checkHardwareSupport(CV_CPU_SSE3)) cpu_features += " sse3";
+#endif
+#if CV_SSSE3
+    if (checkHardwareSupport(CV_CPU_SSSE3)) cpu_features += " ssse3";
+#endif
+#if CV_SSE4_1
+    if (checkHardwareSupport(CV_CPU_SSE4_1)) cpu_features += " sse4.1";
+#endif
+#if CV_SSE4_2
+    if (checkHardwareSupport(CV_CPU_SSE4_2)) cpu_features += " sse4.2";
+#endif
+#if CV_AVX
+    if (checkHardwareSupport(CV_CPU_AVX)) cpu_features += " avx";
+#endif
+#if CV_NEON
+    cpu_features += " neon"; // NEON is currently not checked at runtime
+#endif
+
+    cpu_features.erase(0, 1); // erase initial space
+
+    ::testing::Test::RecordProperty("cv_cpu_features", cpu_features);
+    if (useStdOut) std::cout << "CPU features: " << cpu_features << std::endl;
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    const char * tegra_optimization = tegra::isDeviceSupported() ? "enabled" : "disabled";
+    ::testing::Test::RecordProperty("cv_tegra_optimization", tegra_optimization);
+    if (useStdOut) std::cout << "Tegra optimization: " << tegra_optimization << std::endl;
+#endif
 }
 
 }
diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp
index aa22531007..339cbd0354 100644
--- a/modules/video/perf/perf_optflowpyrlk.cpp
+++ b/modules/video/perf/perf_optflowpyrlk.cpp
@@ -165,7 +165,8 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize_Deriv, OpticalFlowPyrLK_self, testing::Com
     declare.in(pyramid1, pyramid2, inPoints).out(outPoints);
     declare.time(400);
 
-    TEST_CYCLE()
+    int runs = 3;
+    TEST_CYCLE_MULTIRUN(runs)
     {
         calcOpticalFlowPyrLK(pyramid1, pyramid2, inPoints, outPoints, status, err,
                              Size(winSize, winSize), maxLevel, criteria,
@@ -217,4 +218,4 @@ PERF_TEST_P(Path_Win_Deriv_Border_Reuse, OpticalFlowPyrLK_pyr, testing::Combine(
     }
 
     SANITY_CHECK(pyramid);
-}
\ No newline at end of file
+}
diff --git a/modules/videostab/src/global_motion.cpp b/modules/videostab/src/global_motion.cpp
index 45e2d164ee..d6c291ca70 100644
--- a/modules/videostab/src/global_motion.cpp
+++ b/modules/videostab/src/global_motion.cpp
@@ -360,6 +360,9 @@ Mat estimateGlobalMotionRansac(
     const int npoints = points0.getMat().checkVector(2);
     CV_Assert(points1.getMat().checkVector(2) == npoints);
 
+    if (npoints < params.size)
+        return Mat::eye(3, 3, CV_32F);
+
     const Point2f *points0_ = points0.getMat().ptr<Point2f>();
     const Point2f *points1_ = points1.getMat().ptr<Point2f>();
     const int niters = params.niters();
@@ -678,6 +681,8 @@ Mat KeypointBasedMotionEstimator::estimate(const Mat &frame0, const Mat &frame1,
 {
     // find keypoints
     detector_->detect(frame0, keypointsPrev_);
+    if (keypointsPrev_.empty())
+        return Mat::eye(3, 3, CV_32F);
 
     // extract points from keypoints
     pointsPrev_.resize(keypointsPrev_.size());
diff --git a/platforms/android/android.toolchain.cmake b/platforms/android/android.toolchain.cmake
index 0f7e340678..d7f09c7888 100644
--- a/platforms/android/android.toolchain.cmake
+++ b/platforms/android/android.toolchain.cmake
@@ -289,6 +289,9 @@
 #   - March 2013
 #     [+] updated for NDK r8e (x86 version)
 #     [+] support x86_64 version of NDK
+#   - April 2013
+#     [+] support non-release NDK layouts (from Linaro git and Android git)
+#     [~] automatically detect if explicit link to crtbegin_*.o is needed
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -516,24 +519,19 @@ if( NOT ANDROID_NDK )
   endif( ANDROID_NDK )
  endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )
+
 # remember found paths
 if( ANDROID_NDK )
  get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- # try to detect change
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_NDK}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
-  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK )
-   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
-   " )
-  endif()
-  unset( __androidNdkPreviousPath )
-  unset( __length )
- endif()
  set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
  set( BUILD_WITH_ANDROID_NDK True )
- file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
- string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" ) # read the release tag from RELEASE.TXT when the NDK ships one
+  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+  string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ else() # no RELEASE.TXT: fall back to placeholder release values
+  set( ANDROID_NDK_RELEASE "r1x" )
+  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+ endif()
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
  get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
  # try to detect change
@@ -560,6 +558,51 @@ else()
       sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()
 
+# select the NDK directory layout (RELEASE, LINARO or ANDROID) and derive the toolchain paths from it
+if( BUILD_WITH_ANDROID_NDK )
+ if( NOT DEFINED ANDROID_NDK_LAYOUT )
+  # ANDROID_NDK_LAYOUT is not defined: detect the layout from the files and directories found around ANDROID_NDK
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+   set( ANDROID_NDK_LAYOUT "RELEASE" )
+  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" )
+   set( ANDROID_NDK_LAYOUT "LINARO" )
+  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
+   set( ANDROID_NDK_LAYOUT "ANDROID" )
+  endif()
+ endif()
+ set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" )
+ mark_as_advanced( ANDROID_NDK_LAYOUT )
+ if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ else() # "RELEASE", and also any other or empty ANDROID_NDK_LAYOUT value
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" )
+ endif()
+ get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE )
+
+ # detect a changed NDK: an already set CMAKE_AR must start with the current toolchains path
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
+  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
+   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+   " )
+  endif()
+  unset( __androidNdkPreviousPath )
+  unset( __length )
+ endif()
+endif()
+
+
 # get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
@@ -587,17 +630,23 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  endif()
 endif()
 
-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __host_system_name )
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
  foreach( __toolchain ${${__availableToolchainsLst}} )
-  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/" )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
    string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
   else()
    set( __gcc_toolchain "${__toolchain}" )
   endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${__host_system_name}" )
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
   if( __machine )
-   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?$" __version "${__gcc_toolchain}" )
-   string( REGEX MATCH "^[^-]+" __arch "${__gcc_toolchain}" )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
+   if( __machine MATCHES i686 ) # map the detected toolchain machine name to an architecture name
+    set( __arch "x86" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mipsel" )
+   endif() # NOTE(review): no else() branch, so __arch keeps its earlier value when no pattern matches - confirm
    list( APPEND __availableToolchainMachines "${__machine}" )
    list( APPEND __availableToolchainArchs "${__arch}" )
    list( APPEND __availableToolchainCompilerVersions "${__version}" )
@@ -615,29 +664,29 @@ if( BUILD_WITH_ANDROID_NDK )
  set( __availableToolchainMachines "" )
  set( __availableToolchainArchs "" )
  set( __availableToolchainCompilerVersions "" )
- if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_NAME}/" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
   # do not go through all toolchains if we know the name
   set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
  if( NOT __availableToolchains )
-  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
   if( __availableToolchains )
    list(SORT __availableToolchainsLst) # we need clang to go after gcc
   endif()
   __LIST_FILTER( __availableToolchainsLst "^[.]" )
   __LIST_FILTER( __availableToolchainsLst "llvm" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
    if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
    endif()
   endif()
  endif()
@@ -768,6 +817,7 @@ else()
   list( GET __availableToolchainArchs ${__idx} __toolchainArch )
   if( __toolchainArch STREQUAL ANDROID_ARCH_FULLNAME )
    list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}")
    if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
     set( __toolchainMaxVersion "${__toolchainVersion}" )
     set( __toolchainIdx ${__idx} )
@@ -971,11 +1021,11 @@ if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
 elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
  string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
  string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-4.6" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- if( NOT EXISTS "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}/bin/clang${TOOL_OS_SUFFIX}" )
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
   message( FATAL_ERROR "Could not find the Clang compiler driver" )
  endif()
  set( ANDROID_COMPILER_IS_CLANG 1 )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
 else()
  set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
  unset( ANDROID_COMPILER_IS_CLANG CACHE )
@@ -989,7 +1039,7 @@ endif()
 
 # setup paths and STL for NDK
 if( BUILD_WITH_ANDROID_NDK )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
  set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )
 
  if( ANDROID_STL STREQUAL "none" )
@@ -1048,11 +1098,11 @@ if( BUILD_WITH_ANDROID_NDK )
  endif()
  # find libsupc++.a - rtti & exceptions
  if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
-  if( ANDROID_NDK_RELEASE STRGREATER "r8" ) # r8b
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  elseif( NOT ANDROID_NDK_RELEASE STRLESS "r7" AND ANDROID_NDK_RELEASE STRLESS "r8b")
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  else( ANDROID_NDK_RELEASE STRLESS "r7" )
+  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+  if( NOT EXISTS "${__libsupcxx}" )
+   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" ) # before r7
    if( ARMEABI_V7A )
     if( ANDROID_FORCE_ARM_BUILD )
      set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
@@ -1102,7 +1152,7 @@ unset( _ndk_ccache )
 
 # setup the cross-compiler
 if( NOT CMAKE_C_COMPILER )
- if( NDK_CCACHE )
+ if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
   set( CMAKE_C_COMPILER   "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
   set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
   if( ANDROID_COMPILER_IS_CLANG )
@@ -1174,11 +1224,25 @@ set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
 remove_definitions( -DANDROID )
 add_definitions( -DANDROID )
 
-if(ANDROID_SYSROOT MATCHES "[ ;\"]")
- set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ if( CMAKE_HOST_WIN32 )
+  # try to convert the path to its 8.3 (short) form by running a helper script that echoes %~s1
+  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+  execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+                   OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+                   RESULT_VARIABLE __result ERROR_QUIET )
+  if( __result EQUAL 0 ) # conversion succeeded: switch to the converted path and pass it unquoted
+   file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+   set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+  else() # conversion failed: keep the original path, wrapped in double quotes
+   set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+  endif()
+ else() # non-Windows host: wrap the whole option in single quotes
+  set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+ endif()
  if( NOT _CMAKE_IN_TRY_COMPILE )
-  # quotes will break try_compile and compiler identification
-  message(WARNING "Your Android system root has non-alphanumeric symbols. It can break compiler features detection and the whole build.")
+  # quotes can break try_compile and compiler identification
+  message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
  endif()
 else()
  set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
@@ -1249,22 +1313,18 @@ elseif( ARMEABI )
  set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
 endif()
 
+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") ) # gnustl with an existing STL or supc++ library file: C++ link rules invoke the C compiler
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+else() # every other case: C++ link rules invoke the C++ compiler
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+ set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+endif()
+
 # STL
 if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
- if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- else()
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- endif()
- if ( X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" )
-  # workaround "undefined reference to `__dso_handle'" problem
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
- endif()
  if( EXISTS "${__libstl}" )
   set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
   set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
@@ -1283,9 +1343,12 @@ if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
   set( CMAKE_C_LINK_EXECUTABLE       "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
  endif()
  if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} -lm" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} -lm" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} -lm" )
+  if( NOT EXISTS "${ANDROID_LIBM_PATH}" ) # no existing libm file was given: fall back to the plain -lm option
+   set( ANDROID_LIBM_PATH -lm )
+  endif()
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
  endif()
 endif()
 
@@ -1321,7 +1384,14 @@ if( ARMEABI_V7A )
 endif()
 
 if( ANDROID_NO_UNDEFINED )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ if( MIPS )
+  # there is some sysroot-related problem in the mips linker, so the sysroot library directory is also passed via -rpath-link
+  if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+  endif() # NOTE(review): when the sysroot path contains one of those characters, MIPS builds get no --no-undefined at all - confirm this is intended
+ else()
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ endif()
 endif()
 
 if( ANDROID_SO_UNDEFINED )
@@ -1401,9 +1471,9 @@ set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FL
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
 
 if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
- set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
- set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
- set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()
 
 # configure rtti
@@ -1430,6 +1500,43 @@ endif()
 include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
 link_directories( "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 
+# detect whether crtbegin_so.o has to be linked explicitly
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" ) # start from the shared library link rule and fill in its placeholders
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" )
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ separate_arguments( __cmd ) # turn the command line into a list of arguments
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN ) # restore these paths where the splitting above broke them apart
+  if( ${__var} )
+   set( __tmp "${${__var}}" )
+   separate_arguments( __tmp )
+   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
+  endif()
+ endforeach()
+ string( REPLACE "'" "" __cmd "${__cmd}" )
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET ) # run the probe link with crtbegin_so.o as the only object
+ if( __cmd_result EQUAL 0 ) # the probe link succeeded
+  set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+  set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+if( ANDROID_EXPLICIT_CRT_LINK ) # append crtbegin_so.o to the shared library and module link rules
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
 # setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
@@ -1521,6 +1628,7 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
  foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN  ANDROID_SET_OBSOLETE_VARIABLES
                 ANDROID_NDK_HOST_X64
                 ANDROID_NDK
+                ANDROID_NDK_LAYOUT
                 ANDROID_STANDALONE_TOOLCHAIN
                 ANDROID_TOOLCHAIN_NAME
                 ANDROID_ABI
@@ -1534,6 +1642,8 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
                 ANDROID_GOLD_LINKER
                 ANDROID_NOEXECSTACK
                 ANDROID_RELRO
+                ANDROID_LIBM_PATH
+                ANDROID_EXPLICIT_CRT_LINK
                 )
   if( DEFINED ${__var} )
    if( "${__var}" MATCHES " ")
@@ -1577,6 +1687,7 @@ endif()
 #   ANDROID_STANDALONE_TOOLCHAIN
 #   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
 #   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
 #   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
 #   NDK_CCACHE : <path to your ccache executable>
 # Obsolete:
@@ -1622,6 +1733,7 @@ endif()
 #   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
 #   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
 #   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to work around unresolved `sincos`
 #
 # Defaults:
 #   ANDROID_DEFAULT_NDK_API_LEVEL
diff --git a/samples/android/native-activity/.cproject b/samples/android/native-activity/.cproject
index 09687f3ac0..44aadfe9af 100644
--- a/samples/android/native-activity/.cproject
+++ b/samples/android/native-activity/.cproject
@@ -1,75 +1,61 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
-	<storageModule moduleId="org.eclipse.cdt.core.settings">
-		<cconfiguration id="0.129633445">
-			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="0.129633445" moduleId="org.eclipse.cdt.core.settings" name="Default">
-				<externalSettings/>
-				<extensions>
-					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-				</extensions>
-			</storageModule>
-			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-				<configuration artifactName="${ProjName}" buildProperties="" description="" id="0.129633445" name="Default" parent="org.eclipse.cdt.build.core.prefbase.cfg">
-					<folderInfo id="0.129633445." name="/" resourcePath="">
-						<toolChain id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180" name="No ToolChain" resourceTypeBasedDiscovery="false" superClass="org.eclipse.cdt.build.core.prefbase.toolchain">
-							<targetPlatform id="org.eclipse.cdt.build.core.prefbase.toolchain.2006441180.527973180" name=""/>
-							<builder autoBuildTarget="" command="${NDKROOT}/ndk-build.cmd" enableAutoBuild="true" enableCleanBuild="false" id="org.eclipse.cdt.build.core.settings.default.builder.180541221" incrementalBuildTarget="" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.libs.791069665" name="holder for library settings" superClass="org.eclipse.cdt.build.core.settings.holder.libs"/>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.1894181736" name="Assembly" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.588929884" languageId="org.eclipse.cdt.core.assembly" languageName="Assembly" sourceContentType="org.eclipse.cdt.core.asmSource" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.303359177" name="GNU C++" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.373249505" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
-								</option>
-								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.1424359063" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
-									<listOptionValue builtIn="false" value="ANDROID=1"/>
-								</option>
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.360067880" languageId="org.eclipse.cdt.core.g++" languageName="GNU C++" sourceContentType="org.eclipse.cdt.core.cxxSource,org.eclipse.cdt.core.cxxHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-							<tool id="org.eclipse.cdt.build.core.settings.holder.1156172258" name="GNU C" superClass="org.eclipse.cdt.build.core.settings.holder">
-								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.149918263" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
-									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
-								</option>
-								<option id="org.eclipse.cdt.build.core.settings.holder.symbols.719752707" name="Symbols" superClass="org.eclipse.cdt.build.core.settings.holder.symbols" valueType="definedSymbols">
-									<listOptionValue builtIn="false" value="ANDROID=1"/>
-								</option>
-								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.232493949" languageId="org.eclipse.cdt.core.gcc" languageName="GNU C" sourceContentType="org.eclipse.cdt.core.cSource,org.eclipse.cdt.core.cHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
-							</tool>
-						</toolChain>
-					</folderInfo>
-					<sourceEntries>
-						<entry flags="VALUE_WORKSPACE_PATH" kind="sourcePath" name="jni"/>
-					</sourceEntries>
-				</configuration>
-			</storageModule>
-			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
-		</cconfiguration>
-	</storageModule>
-	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-		<project id="OpenCV Sample - face-detection.null.1639518055" name="OpenCV Sample - face-detection"/>
-	</storageModule>
-	<storageModule moduleId="scannerConfiguration">
-		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
-		<scannerConfigBuildInfo instanceId="0.129633445">
-			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
-		</scannerConfigBuildInfo>
-	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/OpenCV Sample - face-detection"/>
-	</storageModule>
-	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
-</cproject>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="0.882924228">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="0.882924228" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildProperties="" description="" id="0.882924228" name="Default" parent="org.eclipse.cdt.build.core.prefbase.cfg">
+					<folderInfo id="0.882924228." name="/" resourcePath="">
+						<toolChain id="org.eclipse.cdt.build.core.prefbase.toolchain.1667980868" name="No ToolChain" resourceTypeBasedDiscovery="false" superClass="org.eclipse.cdt.build.core.prefbase.toolchain">
+							<targetPlatform id="org.eclipse.cdt.build.core.prefbase.toolchain.1667980868.2108168132" name=""/>
+							<builder autoBuildTarget="" command="&quot;${NDKROOT}/ndk-build.cmd&quot;" enableAutoBuild="true" enableCleanBuild="false" id="org.eclipse.cdt.build.core.settings.default.builder.328915772" incrementalBuildTarget="" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.libs.630148311" name="holder for library settings" superClass="org.eclipse.cdt.build.core.settings.holder.libs"/>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.525090327" name="Assembly" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.1491216279" languageId="org.eclipse.cdt.core.assembly" languageName="Assembly" sourceContentType="org.eclipse.cdt.core.asmSource" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1242729366" name="GNU C++" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<option id="org.eclipse.cdt.build.core.settings.holder.incpaths.881377735" name="Include Paths" superClass="org.eclipse.cdt.build.core.settings.holder.incpaths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/platforms/android-9/arch-arm/usr/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${ProjDirPath}/../../sdk/native/jni/include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${NDKROOT}/sources/android/native_app_glue&quot;"/>
+								</option>
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.273216997" languageId="org.eclipse.cdt.core.g++" languageName="GNU C++" sourceContentType="org.eclipse.cdt.core.cxxSource,org.eclipse.cdt.core.cxxHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+							<tool id="org.eclipse.cdt.build.core.settings.holder.1779128177" name="GNU C" superClass="org.eclipse.cdt.build.core.settings.holder">
+								<inputType id="org.eclipse.cdt.build.core.settings.holder.inType.1778510041" languageId="org.eclipse.cdt.core.gcc" languageName="GNU C" sourceContentType="org.eclipse.cdt.core.cSource,org.eclipse.cdt.core.cHeader" superClass="org.eclipse.cdt.build.core.settings.holder.inType"/>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="CvNativeActivity.null.708321898" name="CvNativeActivity"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="0.882924228">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="refreshScope" versionNumber="1">
+		<resource resourceType="PROJECT" workspacePath="/CvNativeActivity"/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+</cproject>
diff --git a/samples/android/native-activity/.project b/samples/android/native-activity/.project
index cf0823c0b3..c20be83f60 100644
--- a/samples/android/native-activity/.project
+++ b/samples/android/native-activity/.project
@@ -5,6 +5,64 @@
 	<projects>
 	</projects>
 	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>auto,full,incremental,</triggers>
+			<arguments>
+				<dictionary>
+					<key>?name?</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.append_environment</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.autoBuildTarget</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.buildArguments</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.buildCommand</key>
+					<value>&quot;${NDKROOT}/ndk-build.cmd&quot;</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
+					<value>clean</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.contents</key>
+					<value>org.eclipse.cdt.make.core.activeConfigSettings</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableAutoBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableCleanBuild</key>
+					<value>false</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.enableFullBuild</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.fullBuildTarget</key>
+					<value></value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.stopOnError</key>
+					<value>true</value>
+				</dictionary>
+				<dictionary>
+					<key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
+					<value>false</value>
+				</dictionary>
+			</arguments>
+		</buildCommand>
 		<buildCommand>
 			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
 			<arguments>
@@ -25,9 +83,19 @@
 			<arguments>
 			</arguments>
 		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
 	</buildSpec>
 	<natures>
 		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
 		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
 	</natures>
 </projectDescription>
diff --git a/samples/android/native-activity/jni/native.cpp b/samples/android/native-activity/jni/native.cpp
index 66bc006db1..5cfb3a9611 100644
--- a/samples/android/native-activity/jni/native.cpp
+++ b/samples/android/native-activity/jni/native.cpp
@@ -9,7 +9,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
-#include <float.h>
 #include <queue>
 
 #include <opencv2/core/core.hpp>
@@ -60,7 +59,7 @@ static cv::Size calc_optimal_camera_resolution(const char* supported, int width,
             }
         }
 
-        idx++; // to skip coma symbol
+        idx++; // to skip comma symbol
 
     } while(supported[idx-1] != '\0');
 
@@ -86,9 +85,9 @@ static void engine_draw_frame(Engine* engine, const cv::Mat& frame)
 
     for (int yy = top_indent; yy < std::min(frame.rows+top_indent, buffer.height); yy++)
     {
-        unsigned char* line = (unsigned char*)pixels;
-        memcpy(line+left_indent*4*sizeof(unsigned char), frame.ptr<unsigned char>(yy),
-               std::min(frame.cols, buffer.width)*4*sizeof(unsigned char));
+        unsigned char* line = (unsigned char*)pixels + left_indent*4*sizeof(unsigned char);
+        size_t line_size = std::min(frame.cols, buffer.width)*4*sizeof(unsigned char);
+        memcpy(line, frame.ptr<unsigned char>(yy), line_size);
         // go to next line
         pixels = (int32_t*)pixels + buffer.stride;
     }
@@ -139,7 +138,7 @@ static void engine_handle_cmd(android_app* app, int32_t cmd)
                     return;
                 }
 
-                LOGI("Camera initialized at resoution %dx%d", camera_resolution.width, camera_resolution.height);
+                LOGI("Camera initialized at resolution %dx%d", camera_resolution.width, camera_resolution.height);
             }
             break;
         case APP_CMD_TERM_WINDOW:
@@ -157,7 +156,8 @@ void android_main(android_app* app)
     // Make sure glue isn't stripped.
     app_dummy();
 
-    memset(&engine, 0, sizeof(engine));
+    size_t engine_size = sizeof(engine); // for Eclipse CDT parser
+    memset((void*)&engine, 0, engine_size);
     app->userData = &engine;
     app->onAppCmd = engine_handle_cmd;
     engine.app = app;
diff --git a/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp b/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
index 5ec82d5d3f..402d6456ac 100644
--- a/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
+++ b/samples/cpp/tutorial_code/features2D/SURF_FlannMatcher.cpp
@@ -71,7 +71,7 @@ int main( int argc, char** argv )
   std::vector< DMatch > good_matches;
 
   for( int i = 0; i < descriptors_1.rows; i++ )
-  { if( matches[i].distance < 2*min_dist )
+  { if( matches[i].distance <= 2*min_dist )
     { good_matches.push_back( matches[i]); }
   }
 
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 87b5255990..1815cc6dec 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -308,6 +308,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
     gpu::split(tmp2, vI2);
     Scalar mssim;
 
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
+
     for( int i = 0; i < gI1.channels(); ++i )
     {
         gpu::GpuMat I2_2, I1_2, I1_I2;
@@ -318,8 +320,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         /*************************** END INITS **********************************/
         gpu::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
-        gpu::GaussianBlur(vI1[i], mu1, Size(11, 11), 1.5);
-        gpu::GaussianBlur(vI2[i], mu2, Size(11, 11), 1.5);
+        gauss->apply(vI1[i], mu1);
+        gauss->apply(vI2[i], mu2);
 
         gpu::GpuMat mu1_2, mu2_2, mu1_mu2;
         gpu::multiply(mu1, mu1, mu1_2);
@@ -328,13 +330,13 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         gpu::GpuMat sigma1_2, sigma2_2, sigma12;
 
-        gpu::GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
+        gauss->apply(I1_2, sigma1_2);
         gpu::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
 
-        gpu::GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
+        gauss->apply(I2_2, sigma2_2);
         gpu::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
 
-        gpu::GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
+        gauss->apply(I1_I2, sigma12);
         gpu::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
 
         ///////////////////////////////// FORMULA ////////////////////////////////
@@ -375,7 +377,7 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
     gpu::split(b.t2, b.vI2, stream);
     Scalar mssim;
 
-    gpu::GpuMat buf;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
@@ -383,22 +385,22 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
         gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
         gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
-        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
-        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI1[i], b.mu1, stream);
+        gauss->apply(b.vI2[i], b.mu2, stream);
 
         gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
         gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
         gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
-        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_2, b.sigma1_2, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
 
-        gpu::GaussianBlur(b.I2_2, b.sigma2_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I2_2, b.sigma2_2, stream);
         gpu::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, gpu::GpuMat(), -1, stream);
         //b.sigma2_2 -= b.mu2_2;
 
-        gpu::GaussianBlur(b.I1_I2, b.sigma12, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_I2, b.sigma12, stream);
         gpu::subtract(b.sigma12, b.mu1_mu2, b.sigma12, gpu::GpuMat(), -1, stream);
         //b.sigma12 -= b.mu1_mu2;
 
diff --git a/samples/gpu/driver_api_multi.cpp b/samples/gpu/driver_api_multi.cpp
index 8b4623f41b..e78f7e54fd 100644
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -86,8 +86,8 @@ int main()
         if (!dev_info.isCompatible())
         {
             std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.major()
-                 << dev_info.minor() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << dev_info.minorVersion() << "\n";
             return -1;
         }
     }
@@ -130,15 +130,15 @@ void Worker::operator()(int device_id) const
     rng.fill(src, RNG::UNIFORM, 0, 1);
 
     // CPU works
-    transpose(src, dst);
+    cv::transpose(src, dst);
 
     // GPU works
     GpuMat d_src(src);
     GpuMat d_dst;
-    transpose(d_src, d_dst);
+    gpu::transpose(d_src, d_dst);
 
     // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
         << (passed ? "passed" : "FAILED") << endl;
 
diff --git a/samples/gpu/driver_api_stereo_multi.cpp b/samples/gpu/driver_api_stereo_multi.cpp
index fac9e36941..d40c20c1e9 100644
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@@ -85,7 +85,7 @@ void inline contextOff()
 // GPUs data
 GpuMat d_left[2];
 GpuMat d_right[2];
-StereoBM_GPU* bm[2];
+Ptr<gpu::StereoBM> bm[2];
 GpuMat d_result[2];
 
 static void printHelp()
@@ -116,8 +116,8 @@ int main(int argc, char** argv)
         if (!dev_info.isCompatible())
         {
             std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.major()
-                 << dev_info.minor() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << dev_info.minorVersion() << "\n";
             return -1;
         }
     }
@@ -162,14 +162,14 @@ int main(int argc, char** argv)
     contextOn(0);
     d_left[0].upload(left.rowRange(0, left.rows / 2));
     d_right[0].upload(right.rowRange(0, right.rows / 2));
-    bm[0] = new StereoBM_GPU();
+    bm[0] = gpu::createStereoBM();
     contextOff();
 
     // Split source images for processing on the GPU #1
     contextOn(1);
     d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
     d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
-    bm[1] = new StereoBM_GPU();
+    bm[1] = gpu::createStereoBM();
     contextOff();
 
     // Execute calculation in two threads using two GPUs
@@ -182,7 +182,7 @@ int main(int argc, char** argv)
     d_left[0].release();
     d_right[0].release();
     d_result[0].release();
-    delete bm[0];
+    bm[0].release();
     contextOff();
 
     // Release the second GPU resources
@@ -191,7 +191,7 @@ int main(int argc, char** argv)
     d_left[1].release();
     d_right[1].release();
     d_result[1].release();
-    delete bm[1];
+    bm[1].release();
     contextOff();
 
     waitKey();
@@ -204,8 +204,7 @@ void Worker::operator()(int device_id) const
 {
     contextOn(device_id);
 
-    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
-                              d_result[device_id]);
+    bm[device_id]->compute(d_left[device_id], d_right[device_id], d_result[device_id]);
 
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
         << "): finished\n";
diff --git a/samples/gpu/farneback_optical_flow.cpp b/samples/gpu/farneback_optical_flow.cpp
index c93ceb055f..c2a5d411e4 100644
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -22,9 +22,9 @@ inline T mapVal(T x, T a, T b, T c, T d)
 static void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
 {
     double uMin, uMax;
-    minMaxLoc(u, &uMin, &uMax, 0, 0);
+    cv::minMaxLoc(u, &uMin, &uMax, 0, 0);
     double vMin, vMax;
-    minMaxLoc(v, &vMin, &vMax, 0, 0);
+    cv::minMaxLoc(v, &vMin, &vMax, 0, 0);
     uMin = ::abs(uMin); uMax = ::abs(uMax);
     vMin = ::abs(vMin); vMax = ::abs(vMax);
     float dMax = static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));
diff --git a/samples/gpu/morphology.cpp b/samples/gpu/morphology.cpp
index 1ed8f96dc9..abc6c980b0 100644
--- a/samples/gpu/morphology.cpp
+++ b/samples/gpu/morphology.cpp
@@ -1,120 +1,186 @@
+#include <iostream>
 
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include <stdlib.h>
-#include <stdio.h>
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuimgproc.hpp"
 
 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
 
-static void help()
+class App  // interactive GPU morphology demo: owns the images, the slider state and the UI loop
 {
+public:
+    App(int argc, const char* argv[]);  // handles --help, loads the image and uploads it to `src`
 
-printf("\nShow off image morphology: erosion, dialation, open and close\n"
-    "Call:\n   morphology2 [image]\n"
-    "This program also shows use of rect, elipse and cross kernels\n\n");
-printf( "Hot keys: \n"
-    "\tESC - quit the program\n"
-    "\tr - use rectangle structuring element\n"
-    "\te - use elliptic structuring element\n"
-    "\tc - use cross-shaped structuring element\n"
-    "\tSPACE - loop through all the options\n" );
-}
+    int run();  // creates the windows and trackbars, then loops on key presses; returns 0 on ESC
 
-GpuMat src, dst;
+private:
+    void help();  // prints usage and the hot-key list to stdout
 
-int element_shape = MORPH_RECT;
+    void OpenClose();  // filters `src` into `dst` and shows it in the "Open/Close" window
+    void ErodeDilate();  // filters `src` into `dst` and shows it in the "Erode/Dilate" window
 
-//the address of variable which receives trackbar position update
-int max_iters = 10;
-int open_close_pos = 0;
-int erode_dilate_pos = 0;
+    static void OpenCloseCallback(int, void*);  // trackbar callbacks: the void* carries the App*
+    static void ErodeDilateCallback(int, void*);
 
-// callback function for open/close trackbar
-static void OpenClose(int, void*)
+    gpu::GpuMat src, dst;  // uploaded input image and the latest filter output
+
+    int element_shape;  // MORPH_RECT, MORPH_ELLIPSE or MORPH_CROSS; changed by the hot keys
+
+    int max_iters;  // slider centre: positions below it select open/erode, the rest close/dilate
+    int open_close_pos;  // position of the "Open/Close" trackbar
+    int erode_dilate_pos;  // position of the "Erode/Dilate" trackbar
+};
+
+App::App(int argc, const char* argv[])  // argv[1], when present, is either --help or the image path
 {
-    int n = open_close_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::morphologyEx(src, dst, MORPH_OPEN, element);
-    else
-        cv::gpu::morphologyEx(src, dst, MORPH_CLOSE, element);
-    imshow("Open/Close",(Mat)dst);
-}
+    element_shape = MORPH_RECT;
+    open_close_pos = erode_dilate_pos = max_iters = 10;  // both sliders start at the centre (offset 0)
 
-// callback function for erode/dilate trackbar
-static void ErodeDilate(int, void*)
-{
-    int n = erode_dilate_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::erode(src, dst, element);
-    else
-        cv::gpu::dilate(src, dst, element);
-    imshow("Erode/Dilate",(Mat)dst);
-}
-
-
-int main( int argc, char** argv )
-{
-    char* filename = argc == 2 ? argv[1] : (char*)"baboon.jpg";
-    if (string(argv[1]) == "--help")
+    if (argc == 2 && String(argv[1]) == "--help")  // argc is checked first so argv[1] is never read when absent
     {
         help();
-        return -1;
+        exit(0);  // asking for help is not treated as an error
     }
 
-    src.upload(imread(filename, 1));
-    if (src.empty())
+    String filename = argc == 2 ? argv[1] : "baboon.jpg";  // image used when no path is given
+
+    Mat img = imread(filename);  // read on the host first so a failed load is detected before the upload
+    if (img.empty())
     {
-        help();
-        return -1;
+        cerr << "Can't open image " << filename.c_str() << endl;
+        exit(-1);  // terminates the process from inside the constructor
     }
 
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    help();
-
-
+    src.upload(img);  // copy the host image into the GpuMat member
     if (src.channels() == 3)
     {
         // gpu support only 4th channel images
-        GpuMat src4ch;
-        cv::gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);
+        gpu::GpuMat src4ch;
+        gpu::cvtColor(src, src4ch, COLOR_BGR2BGRA);  // 3-channel BGR -> 4-channel BGRA
         src = src4ch;
     }
 
-    //create windows for output images
-    namedWindow("Open/Close",1);
-    namedWindow("Erode/Dilate",1);
+    help();  // the banner is always printed once the image has loaded
 
-    open_close_pos = erode_dilate_pos = max_iters;
-    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
-    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
+    gpu::printShortCudaDeviceInfo(gpu::getDevice());  // prints info for the current device
+}
+
+int App::run()  // UI loop: redraw both windows, wait for a key, update the kernel shape; returns 0 on ESC
+{
+    // create one output window per operation pair
+    namedWindow("Open/Close");
+    namedWindow("Erode/Dilate");
+
+    createTrackbar("iterations", "Open/Close", &open_close_pos, max_iters * 2 + 1, OpenCloseCallback, this);  // `this` comes back as the callback's void*
+    createTrackbar("iterations", "Erode/Dilate", &erode_dilate_pos, max_iters * 2 + 1, ErodeDilateCallback, this);
 
     for(;;)
     {
-        int c;
+        OpenClose();  // redraw with the current slider position and kernel shape
+        ErodeDilate();
 
-        OpenClose(open_close_pos, 0);
-        ErodeDilate(erode_dilate_pos, 0);
-        c = waitKey();
+        char c = (char) waitKey();  // narrowed to char so it can be compared with the character cases below
 
-        if( (char)c == 27 )
+        switch (c)
+        {
+        case 27:  // ESC
+            return 0;
             break;
-        if( (char)c == 'e' )
+
+        case 'e':  // elliptic structuring element
             element_shape = MORPH_ELLIPSE;
-        else if( (char)c == 'r' )
+            break;
+
+        case 'r':  // rectangular structuring element
             element_shape = MORPH_RECT;
-        else if( (char)c == 'c' )
+            break;
+
+        case 'c':  // cross-shaped structuring element
             element_shape = MORPH_CROSS;
-        else if( (char)c == ' ' )
+            break;
+
+        case ' ':  // SPACE: advance to the next shape value, modulo 3
             element_shape = (element_shape + 1) % 3;
+            break;
+        }
+    }
+}
+
+// Print the usage banner and the list of hot keys to stdout.
+void App::help()
+{
+    cout << "Show off image morphology: erosion, dilation, open and close \n";
+    cout << "Call: \n" << "   gpu-example-morphology [image] \n";
+    cout << "This program also shows use of rect, ellipse and cross kernels \n" << endl;
+
+    cout << "Hot keys: \n";
+    cout << "\tESC - quit the program \n";
+    cout << "\tr - use rectangle structuring element \n";
+    cout << "\te - use elliptic structuring element \n";
+    cout << "\tc - use cross-shaped structuring element \n";
+    cout << "\tSPACE - loop through all the options \n" << endl;
+}
+
+// Open (slider left of centre) or close (centre and right of it) `src` with a
+// kernel sized by the slider offset, then show the result in "Open/Close".
+void App::OpenClose()
+{
+    const int offset = open_close_pos - max_iters;
+    const int radius = offset < 0 ? -offset : offset;
+    const Mat kernel = getStructuringElement(element_shape, Size(radius*2+1, radius*2+1), Point(radius, radius));
+
+    Ptr<gpu::Filter> filter;
+    if (offset >= 0)
+    {
+        filter = gpu::createMorphologyFilter(MORPH_CLOSE, src.type(), kernel);
+    }
+    else
+    {
+        filter = gpu::createMorphologyFilter(MORPH_OPEN, src.type(), kernel);
     }
 
-    return 0;
+    filter->apply(src, dst);
+    imshow("Open/Close", Mat(dst));
+}
+
+// Erode (slider left of centre) or dilate (centre and right of it) `src` with a
+// structuring element whose half-size is the slider's distance from the centre,
+// then show the result in the "Erode/Dilate" window.
+void App::ErodeDilate()
+{
+    const int offset = erode_dilate_pos - max_iters;
+    const int radius = offset < 0 ? -offset : offset;
+    const int side = radius * 2 + 1;
+
+    const Mat kernel = getStructuringElement(element_shape, Size(side, side), Point(radius, radius));
+
+    // Pick the operation up front so one create/apply pair serves both cases.
+    const int op = offset < 0 ? MORPH_ERODE : MORPH_DILATE;
+
+    Ptr<gpu::Filter> filter = gpu::createMorphologyFilter(op, src.type(), kernel);
+    filter->apply(src, dst);
+
+    // imshow is handed a cv::Mat built from the GpuMat result.
+    Mat h_result(dst);
+    imshow("Erode/Dilate", h_result);
+}
+
+void App::OpenCloseCallback(int, void* data)
+{
+    // `data` is the App* that run() registered with createTrackbar().
+    static_cast<App*>(data)->OpenClose();
+}
+
+void App::ErodeDilateCallback(int, void* data)
+{
+    // `data` is the App* that run() registered with createTrackbar().
+    static_cast<App*>(data)->ErodeDilate();
+}
+
+int main(int argc, const char* argv[])
+{
+    // App's constructor handles the command line; run() supplies the exit code.
+    return App(argc, argv).run();
 }
diff --git a/samples/gpu/multi.cpp b/samples/gpu/multi.cpp
index 34b111829c..b83fd2ce46 100644
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@@ -62,8 +62,8 @@ int main()
         if (!dev_info.isCompatible())
         {
             std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.major()
-                 << dev_info.minor() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << '.' << dev_info.minorVersion() << ")\n"; // "major.minor", paren closed
             return -1;
         }
     }
@@ -87,15 +87,15 @@ void Worker::operator()(int device_id) const
     rng.fill(src, RNG::UNIFORM, 0, 1);
 
     // CPU works
-    transpose(src, dst);
+    cv::transpose(src, dst);
 
     // GPU works
     GpuMat d_src(src);
     GpuMat d_dst;
-    transpose(d_src, d_dst);
+    gpu::transpose(d_src, d_dst);
 
     // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
         << (passed ? "passed" : "FAILED") << endl;
 
diff --git a/samples/gpu/performance/performance.cpp b/samples/gpu/performance/performance.cpp
index 42fd978c1b..8af0b3d0d4 100644
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
@@ -191,7 +191,7 @@ int main(int argc, const char* argv[])
     DeviceInfo dev_info(device);
     if (!dev_info.isCompatible())
     {
-        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.major() << '.' << dev_info.minor() << endl;
+        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.majorVersion() << '.' << dev_info.minorVersion() << endl;
         return -1;
     }
     setDevice(device);
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 97eb7a82aa..4333b76257 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -746,10 +746,12 @@ TEST(erode)
 
         d_src.upload(src);
 
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        Ptr<gpu::Filter> erode = gpu::createMorphologyFilter(MORPH_ERODE, d_src.type(), ker);
+
+        erode->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::erode(d_src, d_dst, ker, d_buf);
+        erode->apply(d_src, d_dst);
         GPU_OFF;
     }
 }
@@ -929,10 +931,12 @@ TEST(GaussianBlur)
         gpu::GpuMat d_dst(src.size(), src.type());
         gpu::GpuMat d_buf;
 
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(3, 3), 1);
+
+        gauss->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        gauss->apply(d_src, d_dst);
         GPU_OFF;
     }
 }
@@ -961,10 +965,11 @@ TEST(filter2D)
             gpu::GpuMat d_src(src);
             gpu::GpuMat d_dst;
 
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            Ptr<gpu::Filter> filter2D = gpu::createLinearFilter(d_src.type(), -1, kernel);
+            filter2D->apply(d_src, d_dst);
 
             GPU_ON;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            filter2D->apply(d_src, d_dst);
             GPU_OFF;
         }
     }
diff --git a/samples/gpu/stereo_match.cpp b/samples/gpu/stereo_match.cpp
index edf8886ffa..a080153a61 100644
--- a/samples/gpu/stereo_match.cpp
+++ b/samples/gpu/stereo_match.cpp
@@ -65,9 +65,9 @@ private:
     Mat left, right;
     gpu::GpuMat d_left, d_right;
 
-    gpu::StereoBM_GPU bm;
-    gpu::StereoBeliefPropagation bp;
-    gpu::StereoConstantSpaceBP csbp;
+    Ptr<gpu::StereoBM> bm;
+    Ptr<gpu::StereoBeliefPropagation> bp;
+    Ptr<gpu::StereoConstantSpaceBP> csbp;
 
     int64 work_begin;
     double work_fps;
@@ -172,9 +172,9 @@ void App::run()
     imshow("right", right);
 
     // Set common parameters
-    bm.ndisp = p.ndisp;
-    bp.ndisp = p.ndisp;
-    csbp.ndisp = p.ndisp;
+    bm = gpu::createStereoBM(p.ndisp);
+    bp = gpu::createStereoBeliefPropagation(p.ndisp);
+    csbp = cv::gpu::createStereoConstantSpaceBP(p.ndisp);
 
     // Prepare disparity map of specified type
     Mat disp(left.size(), CV_8U);
@@ -201,10 +201,10 @@ void App::run()
                 imshow("left", left);
                 imshow("right", right);
             }
-            bm(d_left, d_right, d_disp);
+            bm->compute(d_left, d_right, d_disp);
             break;
-        case Params::BP: bp(d_left, d_right, d_disp); break;
-        case Params::CSBP: csbp(d_left, d_right, d_disp); break;
+        case Params::BP: bp->compute(d_left, d_right, d_disp); break;
+        case Params::CSBP: csbp->compute(d_left, d_right, d_disp); break;
         }
         workEnd();
 
@@ -228,16 +228,16 @@ void App::printParams() const
     switch (p.method)
     {
     case Params::BM:
-        cout << "win_size: " << bm.winSize << endl;
-        cout << "prefilter_sobel: " << bm.preset << endl;
+        cout << "win_size: " << bm->getBlockSize() << endl;
+        cout << "prefilter_sobel: " << bm->getPreFilterType() << endl;
         break;
     case Params::BP:
-        cout << "iter_count: " << bp.iters << endl;
-        cout << "level_count: " << bp.levels << endl;
+        cout << "iter_count: " << bp->getNumIters() << endl;
+        cout << "level_count: " << bp->getNumLevels() << endl;
         break;
     case Params::CSBP:
-        cout << "iter_count: " << csbp.iters << endl;
-        cout << "level_count: " << csbp.levels << endl;
+        cout << "iter_count: " << csbp->getNumIters() << endl;
+        cout << "level_count: " << csbp->getNumLevels() << endl;
         break;
     }
     cout << endl;
@@ -289,92 +289,92 @@ void App::handleKey(char key)
     case 's': case 'S':
         if (p.method == Params::BM)
         {
-            switch (bm.preset)
+            switch (bm->getPreFilterType())
             {
-            case gpu::StereoBM_GPU::BASIC_PRESET:
-                bm.preset = gpu::StereoBM_GPU::PREFILTER_XSOBEL;
+            case 0:
+                bm->setPreFilterType(cv::StereoBM::PREFILTER_XSOBEL);
                 break;
-            case gpu::StereoBM_GPU::PREFILTER_XSOBEL:
-                bm.preset = gpu::StereoBM_GPU::BASIC_PRESET;
+            case cv::StereoBM::PREFILTER_XSOBEL:
+                bm->setPreFilterType(0);
                 break;
             }
-            cout << "prefilter_sobel: " << bm.preset << endl;
+            cout << "prefilter_sobel: " << bm->getPreFilterType() << endl;
         }
         break;
     case '1':
         p.ndisp = p.ndisp == 1 ? 8 : p.ndisp + 8;
         cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+        bm->setNumDisparities(p.ndisp);
+        bp->setNumDisparities(p.ndisp);
+        csbp->setNumDisparities(p.ndisp);
         break;
     case 'q': case 'Q':
         p.ndisp = max(p.ndisp - 8, 1);
         cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+        bm->setNumDisparities(p.ndisp);
+        bp->setNumDisparities(p.ndisp);
+        csbp->setNumDisparities(p.ndisp);
         break;
     case '2':
         if (p.method == Params::BM)
         {
-            bm.winSize = min(bm.winSize + 1, 51);
-            cout << "win_size: " << bm.winSize << endl;
+            bm->setBlockSize(min(bm->getBlockSize() + 1, 51));
+            cout << "win_size: " << bm->getBlockSize() << endl;
         }
         break;
     case 'w': case 'W':
         if (p.method == Params::BM)
         {
-            bm.winSize = max(bm.winSize - 1, 2);
-            cout << "win_size: " << bm.winSize << endl;
+            bm->setBlockSize(max(bm->getBlockSize() - 1, 2));
+            cout << "win_size: " << bm->getBlockSize() << endl;
         }
         break;
     case '3':
         if (p.method == Params::BP)
         {
-            bp.iters += 1;
-            cout << "iter_count: " << bp.iters << endl;
+            bp->setNumIters(bp->getNumIters() + 1);
+            cout << "iter_count: " << bp->getNumIters() << endl;
         }
         else if (p.method == Params::CSBP)
         {
-            csbp.iters += 1;
-            cout << "iter_count: " << csbp.iters << endl;
+            csbp->setNumIters(csbp->getNumIters() + 1);
+            cout << "iter_count: " << csbp->getNumIters() << endl;
         }
         break;
     case 'e': case 'E':
         if (p.method == Params::BP)
         {
-            bp.iters = max(bp.iters - 1, 1);
-            cout << "iter_count: " << bp.iters << endl;
+            bp->setNumIters(max(bp->getNumIters() - 1, 1));
+            cout << "iter_count: " << bp->getNumIters() << endl;
         }
         else if (p.method == Params::CSBP)
         {
-            csbp.iters = max(csbp.iters - 1, 1);
-            cout << "iter_count: " << csbp.iters << endl;
+            csbp->setNumIters(max(csbp->getNumIters() - 1, 1));
+            cout << "iter_count: " << csbp->getNumIters() << endl;
         }
         break;
     case '4':
         if (p.method == Params::BP)
         {
-            bp.levels += 1;
-            cout << "level_count: " << bp.levels << endl;
+            bp->setNumLevels(bp->getNumLevels() + 1);
+            cout << "level_count: " << bp->getNumLevels() << endl;
         }
         else if (p.method == Params::CSBP)
         {
-            csbp.levels += 1;
-            cout << "level_count: " << csbp.levels << endl;
+            csbp->setNumLevels(csbp->getNumLevels() + 1);
+            cout << "level_count: " << csbp->getNumLevels() << endl;
         }
         break;
     case 'r': case 'R':
         if (p.method == Params::BP)
         {
-            bp.levels = max(bp.levels - 1, 1);
-            cout << "level_count: " << bp.levels << endl;
+            bp->setNumLevels(max(bp->getNumLevels() - 1, 1));
+            cout << "level_count: " << bp->getNumLevels() << endl;
         }
         else if (p.method == Params::CSBP)
         {
-            csbp.levels = max(csbp.levels - 1, 1);
-            cout << "level_count: " << csbp.levels << endl;
+            csbp->setNumLevels(max(csbp->getNumLevels() - 1, 1));
+            cout << "level_count: " << csbp->getNumLevels() << endl;
         }
         break;
     }
diff --git a/samples/gpu/stereo_multi.cpp b/samples/gpu/stereo_multi.cpp
index f85efe109e..83e2f2578b 100644
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp
@@ -51,7 +51,7 @@ struct Worker { void operator()(int device_id) const; };
 // GPUs data
 GpuMat d_left[2];
 GpuMat d_right[2];
-StereoBM_GPU* bm[2];
+Ptr<gpu::StereoBM> bm[2];
 GpuMat d_result[2];
 
 static void printHelp()
@@ -81,8 +81,8 @@ int main(int argc, char** argv)
         if (!dev_info.isCompatible())
         {
             std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.major()
-                 << dev_info.minor() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << '.' << dev_info.minorVersion() << ")\n"; // "major.minor", paren closed
             return -1;
         }
     }
@@ -112,13 +112,13 @@ int main(int argc, char** argv)
     setDevice(0);
     d_left[0].upload(left.rowRange(0, left.rows / 2));
     d_right[0].upload(right.rowRange(0, right.rows / 2));
-    bm[0] = new StereoBM_GPU();
+    bm[0] = gpu::createStereoBM();
 
     // Split source images for processing on the GPU #1
     setDevice(1);
     d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
     d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
-    bm[1] = new StereoBM_GPU();
+    bm[1] = gpu::createStereoBM();
 
     // Execute calculation in two threads using two GPUs
     int devices[] = {0, 1};
@@ -130,7 +130,7 @@ int main(int argc, char** argv)
     d_left[0].release();
     d_right[0].release();
     d_result[0].release();
-    delete bm[0];
+    bm[0].release();
 
     // Release the second GPU resources
     setDevice(1);
@@ -138,7 +138,7 @@ int main(int argc, char** argv)
     d_left[1].release();
     d_right[1].release();
     d_result[1].release();
-    delete bm[1];
+    bm[1].release();
 
     waitKey();
     return 0;
@@ -149,8 +149,7 @@ void Worker::operator()(int device_id) const
 {
     setDevice(device_id);
 
-    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
-                              d_result[device_id]);
+    bm[device_id]->compute(d_left[device_id], d_right[device_id], d_result[device_id]);
 
     std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
         << "): finished\n";
diff --git a/samples/gpu/video_reader.cpp b/samples/gpu/video_reader.cpp
index 7eea726399..42f6f91db4 100644
--- a/samples/gpu/video_reader.cpp
+++ b/samples/gpu/video_reader.cpp
@@ -30,8 +30,7 @@ int main(int argc, const char* argv[])
     cv::VideoCapture reader(fname);
 
     cv::gpu::GpuMat d_frame;
-    cv::gpu::VideoReader_GPU d_reader(fname);
-    d_reader.dumpFormat(std::cout);
+    cv::Ptr<cv::gpucodec::VideoReader> d_reader = cv::gpucodec::createVideoReader(fname);
 
     cv::TickMeter tm;
     std::vector<double> cpu_times;
@@ -46,7 +45,7 @@ int main(int argc, const char* argv[])
         cpu_times.push_back(tm.getTimeMilli());
 
         tm.reset(); tm.start();
-        if (!d_reader.read(d_frame))
+        if (!d_reader->nextFrame(d_frame))
             break;
         tm.stop();
         gpu_times.push_back(tm.getTimeMilli());
diff --git a/samples/gpu/video_writer.cpp b/samples/gpu/video_writer.cpp
index d540d04093..c1bcc5d368 100644
--- a/samples/gpu/video_writer.cpp
+++ b/samples/gpu/video_writer.cpp
@@ -33,7 +33,7 @@ int main(int argc, const char* argv[])
     cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
 
     cv::VideoWriter writer;
-    cv::gpu::VideoWriter_GPU d_writer;
+    cv::Ptr<cv::gpucodec::VideoWriter> d_writer;
 
     cv::Mat frame;
     cv::gpu::GpuMat d_frame;
@@ -64,11 +64,11 @@ int main(int argc, const char* argv[])
                 return -1;
         }
 
-        if (!d_writer.isOpened())
+        if (d_writer.empty())
         {
             std::cout << "Open GPU Writer" << std::endl;
 
-            d_writer.open("output_gpu.avi", frame.size(), FPS);
+            d_writer = cv::gpucodec::createVideoWriter("output_gpu.avi", frame.size(), FPS);
         }
 
         d_frame.upload(frame);
@@ -81,7 +81,7 @@ int main(int argc, const char* argv[])
         cpu_times.push_back(tm.getTimeMilli());
 
         tm.reset(); tm.start();
-        d_writer.write(d_frame);
+        d_writer->write(d_frame);
         tm.stop();
         gpu_times.push_back(tm.getTimeMilli());
     }
diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp
index 5ffed2e40b..4ab92af5e0 100644
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@ -17,16 +17,19 @@ using namespace cv;
 #define LOOP_NUM 10
 
 const static Scalar colors[] =  { CV_RGB(0,0,255),
-        CV_RGB(0,128,255),
-        CV_RGB(0,255,255),
-        CV_RGB(0,255,0),
-        CV_RGB(255,128,0),
-        CV_RGB(255,255,0),
-        CV_RGB(255,0,0),
-        CV_RGB(255,0,255)} ;
+                                  CV_RGB(0,128,255),
+                                  CV_RGB(0,255,255),
+                                  CV_RGB(0,255,0),
+                                  CV_RGB(255,128,0),
+                                  CV_RGB(255,255,0),
+                                  CV_RGB(255,0,0),
+                                  CV_RGB(255,0,255)
+                                } ;
+
 
 int64 work_begin = 0;
 int64 work_end = 0;
+string outputName;
 
 static void workBegin()
 {
@@ -37,34 +40,40 @@ static void workEnd()
     work_end += (getTickCount() - work_begin);
 }
 
-
-static double getTime(){
+static double getTime()
+{
     return work_end /((double)cvGetTickFrequency() * 1000.);
 }
 
 void detect( Mat& img, vector<Rect>& faces,
-    cv::ocl::OclCascadeClassifierBuf& cascade,
-    double scale, bool calTime);
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime);
+
 
 void detectCPU( Mat& img, vector<Rect>& faces,
-    CascadeClassifier& cascade,
-    double scale, bool calTime);
+                CascadeClassifier& cascade,
+                double scale, bool calTime);
 
 void Draw(Mat& img, vector<Rect>& faces, double scale);
 
+
 // This function test if gpu_rst matches cpu_rst.
 // If the two vectors are not equal, it will return the difference in vector size
 // Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-double checkRectSimilarity(Size sz, std::vector<Rect>& cpu_rst, std::vector<Rect>& gpu_rst);
+double checkRectSimilarity(Size sz, vector<Rect>& cpu_rst, vector<Rect>& gpu_rst);
+
 
 int main( int argc, const char** argv )
 {
     const char* keys =
         "{ h | help       | false       | print help message }"
         "{ i | input      |             | specify input image }"
-        "{ t | template   | ../../../data/haarcascades/haarcascade_frontalface_alt.xml  | specify template file }"
+        "{ t | template   | haarcascade_frontalface_alt.xml |"
+        " specify template file path }"
         "{ c | scale      |   1.0       | scale image }"
-        "{ s | use_cpu    | false       | use cpu or gpu to process the image }";
+        "{ s | use_cpu    | false       | use cpu or gpu to process the image }"
+        "{ o | output     | facedetect_output.jpg  |"
+        " specify output image save path(only works when input is images) }";
 
     CommandLineParser cmd(argc, argv, keys);
     if (cmd.get<bool>("help"))
@@ -78,9 +87,10 @@ int main( int argc, const char** argv )
 
     bool useCPU = cmd.get<bool>("s");
     string inputName = cmd.get<string>("i");
+    outputName = cmd.get<string>("o");
     string cascadeName = cmd.get<string>("t");
     double scale = cmd.get<double>("c");
-    cv::ocl::OclCascadeClassifierBuf cascade;
+    ocl::OclCascadeClassifierBuf cascade;
     CascadeClassifier  cpu_cascade;
 
     if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) )
@@ -114,9 +124,10 @@ int main( int argc, const char** argv )
         return -1;
     }
 
+
     cvNamedWindow( "result", 1 );
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = cv::ocl::getDevice(oclinfo);
+    vector<ocl::Info> oclinfo;
+    int devnums = ocl::getDevice(oclinfo);
     if( devnums < 1 )
     {
         std::cout << "no device found\n";
@@ -139,10 +150,12 @@ int main( int argc, const char** argv )
                 frame.copyTo( frameCopy );
             else
                 flip( frame, frameCopy, 0 );
-            if(useCPU){
+            if(useCPU)
+            {
                 detectCPU(frameCopy, faces, cpu_cascade, scale, false);
             }
-            else{
+            else
+            {
                 detect(frameCopy, faces, cascade, scale, false);
             }
             Draw(frameCopy, faces, scale);
@@ -150,8 +163,10 @@ int main( int argc, const char** argv )
                 goto _cleanup_;
         }
 
+
         waitKey(0);
 
+
 _cleanup_:
         cvReleaseCapture( &capture );
     }
@@ -161,15 +176,18 @@ _cleanup_:
         vector<Rect> faces;
         vector<Rect> ref_rst;
         double accuracy = 0.;
-        for(int i = 0; i <= LOOP_NUM;i ++)
+        for(int i = 0; i <= LOOP_NUM; i ++)
         {
             cout << "loop" << i << endl;
-            if(useCPU){
+            if(useCPU)
+            {
                 detectCPU(image, faces, cpu_cascade, scale, i==0?false:true);
             }
-            else{
+            else
+            {
                 detect(image, faces, cascade, scale, i==0?false:true);
-                if(i == 0){
+                if(i == 0)
+                {
                     detectCPU(image, ref_rst, cpu_cascade, scale, false);
                     accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
                 }
@@ -189,31 +207,30 @@ _cleanup_:
     }
 
     cvDestroyWindow("result");
-
     return 0;
 }
 
 void detect( Mat& img, vector<Rect>& faces,
-    cv::ocl::OclCascadeClassifierBuf& cascade,
-    double scale, bool calTime)
+             ocl::OclCascadeClassifierBuf& cascade,
+             double scale, bool calTime)
 {
-    cv::ocl::oclMat image(img);
-    cv::ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+    ocl::oclMat image(img);
+    ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
     if(calTime) workBegin();
-    cv::ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
-    cv::ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    cv::ocl::equalizeHist( smallImg, smallImg );
+    ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
+    ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
+    ocl::equalizeHist( smallImg, smallImg );
 
     cascade.detectMultiScale( smallImg, faces, 1.1,
-        3, 0
-        |CV_HAAR_SCALE_IMAGE
-        , Size(30,30), Size(0, 0) );
+                              3, 0
+                              |CV_HAAR_SCALE_IMAGE
+                              , Size(30,30), Size(0, 0) );
     if(calTime) workEnd();
 }
 
 void detectCPU( Mat& img, vector<Rect>& faces,
-    CascadeClassifier& cascade,
-    double scale, bool calTime)
+                CascadeClassifier& cascade,
+                double scale, bool calTime)
 {
     if(calTime) workBegin();
     Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
@@ -221,11 +238,12 @@ void detectCPU( Mat& img, vector<Rect>& faces,
     resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
     equalizeHist(cpu_smallImg, cpu_smallImg);
     cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
-        3, 0 | CV_HAAR_SCALE_IMAGE,
-        Size(30, 30), Size(0, 0));
+                             3, 0 | CV_HAAR_SCALE_IMAGE,
+                             Size(30, 30), Size(0, 0));
     if(calTime) workEnd();
 }
 
+
 void Draw(Mat& img, vector<Rect>& faces, double scale)
 {
     int i = 0;
@@ -239,31 +257,38 @@ void Draw(Mat& img, vector<Rect>& faces, double scale)
         radius = cvRound((r->width + r->height)*0.25*scale);
         circle( img, center, radius, color, 3, 8, 0 );
     }
-    cv::imshow( "result", img );
+    imshow( "result", img );
+    imwrite( outputName, img );
 }
 
-double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+
+double checkRectSimilarity(Size sz, vector<Rect>& ob1, vector<Rect>& ob2)
 {
     double final_test_result = 0.0;
     size_t sz1 = ob1.size();
     size_t sz2 = ob2.size();
 
     if(sz1 != sz2)
+    {
         return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
     else
     {
-        cv::Mat cpu_result(sz, CV_8UC1);
+        if(sz1==0 && sz2==0)
+            return 0;
+        Mat cpu_result(sz, CV_8UC1);
         cpu_result.setTo(0);
 
         for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
         {
-            cv::Mat cpu_result_roi(cpu_result, *r);
+            Mat cpu_result_roi(cpu_result, *r);
             cpu_result_roi.setTo(1);
             cpu_result.copyTo(cpu_result);
         }
-        int cpu_area = cv::countNonZero(cpu_result > 0);
+        int cpu_area = countNonZero(cpu_result > 0);
 
-        cv::Mat gpu_result(sz, CV_8UC1);
+
+        Mat gpu_result(sz, CV_8UC1);
         gpu_result.setTo(0);
         for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
         {
@@ -272,11 +297,13 @@ double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& o
             gpu_result.copyTo(gpu_result);
         }
 
-        cv::Mat result_;
+        Mat result_;
         multiply(cpu_result, gpu_result, result_);
-        int result = cv::countNonZero(result_ > 0);
-
-        final_test_result = 1.0 - (double)result/(double)cpu_area;
+        int result = countNonZero(result_ > 0);
+        if(cpu_area!=0) // includes result==0: no overlap at all must score 1.0, not 0.0
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(result!=0) // overlap reported although the CPU rects cover nothing
+            final_test_result = -1;
     }
     return final_test_result;
 }
diff --git a/samples/ocl/hog.cpp b/samples/ocl/hog.cpp
index daff267718..b26c98c337 100644
--- a/samples/ocl/hog.cpp
+++ b/samples/ocl/hog.cpp
@@ -11,75 +11,39 @@
 using namespace std;
 using namespace cv;
 
-bool help_showed = false;
-
-class Args
-{
-public:
-    Args();
-    static Args read(int argc, char** argv);
-
-    string src;
-    bool src_is_video;
-    bool src_is_camera;
-    int camera_id;
-
-    bool write_video;
-    string dst_video;
-    double dst_video_fps;
-
-    bool make_gray;
-
-    bool resize_src;
-    int width, height;
-
-    double scale;
-    int nlevels;
-    int gr_threshold;
-
-    double hit_threshold;
-    bool hit_threshold_auto;
-
-    int win_width;
-    int win_stride_width, win_stride_height;
-
-    bool gamma_corr;
-};
-
 class App
 {
 public:
-    App(const Args& s);
+    App(CommandLineParser& cmd);
     void run();
-
     void handleKey(char key);
-
     void hogWorkBegin();
     void hogWorkEnd();
     string hogWorkFps() const;
-
     void workBegin();
     void workEnd();
     string workFps() const;
-
     string message() const;
 
+
 // This function test if gpu_rst matches cpu_rst.
 // If the two vectors are not equal, it will return the difference in vector size
-// Else if will return 
+// Else it will return
 // (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-    double checkRectSimilarity(Size sz, 
-                               std::vector<Rect>& cpu_rst, 
+    double checkRectSimilarity(Size sz,
+                               std::vector<Rect>& cpu_rst,
                                std::vector<Rect>& gpu_rst);
 private:
     App operator=(App&);
 
-    Args args;
+    //Args args;
     bool running;
-
     bool use_gpu;
     bool make_gray;
     double scale;
+    double resize_scale;
+    int win_width;
+    int win_stride_width, win_stride_height;
     int gr_threshold;
     int nlevels;
     double hit_threshold;
@@ -87,119 +51,49 @@ private:
 
     int64 hog_work_begin;
     double hog_work_fps;
-
     int64 work_begin;
     double work_fps;
-};
 
-static void printHelp()
-{
-    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
-         << "\nUsage: hog_gpu\n"
-         << "  (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
-         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
-         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
-         << "  [--width <int>] # resized image width\n"
-         << "  [--height <int>] # resized image height\n"
-         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
-         << "  [--scale <double>] # HOG window scale factor\n"
-         << "  [--nlevels <int>] # max number of HOG window scales\n"
-         << "  [--win_width <int>] # width of the window (48 or 64)\n"
-         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
-         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
-         << "  [--gr_threshold <int>] # merging similar rects constant\n"
-         << "  [--gamma_correct <int>] # do gamma correction or not\n"
-         << "  [--write_video <bool>] # write video or not\n"
-         << "  [--dst_video <path>] # output video path\n"
-         << "  [--dst_video_fps <double>] # output video fps\n";
-    help_showed = true;
-}
+    string img_source;
+    string vdo_source;
+    string output;
+    int camera_id;
+};
 
 int main(int argc, char** argv)
 {
+    const char* keys =
+        "{ h |  help    | false          | print help message }"
+        "{ i |  input   |                | specify input image}"
+        "{ c | camera   | -1             | enable camera capturing }"
+        "{ v | video    |                | use video as input }"
+        "{ g |  gray    | false          | convert image to gray one or not}"
+        "{ s |  scale   | 1.0            | resize the image before detect}"
+        "{ l |larger_win| false          | use 64x128 window}"
+        "{ o |  output  |                | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    App app(cmd);
     try
     {
-        if (argc < 2)
-            printHelp();
-        Args args = Args::read(argc, argv);
-        if (help_showed)
-            return -1;
-        App app(args);
         app.run();
     }
-    catch (const Exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch (const exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch(...) { return cout << "unknown exception" << endl, 1; }
+    catch (const Exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch (const exception& e)
+    {
+        return cout << "error: "  << e.what() << endl, 1;
+    }
+    catch(...)
+    {
+        return cout << "unknown exception" << endl, 1;
+    }
     return 0;
 }
 
-
-Args::Args()
+App::App(CommandLineParser& cmd)
 {
-    src_is_video = false;
-    src_is_camera = false;
-    camera_id = 0;
-
-    write_video = false;
-    dst_video_fps = 24.;
-
-    make_gray = false;
-
-    resize_src = false;
-    width = 640;
-    height = 480;
-
-    scale = 1.05;
-    nlevels = 13;
-    gr_threshold = 8;
-    hit_threshold = 1.4;
-    hit_threshold_auto = true;
-
-    win_width = 48;
-    win_stride_width = 8;
-    win_stride_height = 8;
-
-    gamma_corr = true;
-}
-
-
-Args Args::read(int argc, char** argv)
-{
-    Args args;
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--make_gray") args.make_gray = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--resize_src") args.resize_src = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--width") args.width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--height") args.height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--hit_threshold")
-        {
-            args.hit_threshold = atof(argv[++i]);
-            args.hit_threshold_auto = false;
-        }
-        else if (string(argv[i]) == "--scale") args.scale = atof(argv[++i]);
-        else if (string(argv[i]) == "--nlevels") args.nlevels = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--dst_video") args.dst_video = argv[++i];
-        else if (string(argv[i]) == "--dst_video_fps") args.dst_video_fps = atof(argv[++i]);
-        else if (string(argv[i]) == "--help") printHelp();
-        else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
-        else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
-        else if (args.src.empty()) args.src = argv[i];
-        else throw runtime_error((string("unknown key: ") + argv[i]));
-    }
-    return args;
-}
-
-
-App::App(const Args& s)
-{
-    args = s;
     cout << "\nControls:\n"
          << "\tESC - exit\n"
          << "\tm - change mode GPU <-> CPU\n"
@@ -210,56 +104,56 @@ App::App(const Args& s)
          << "\t4/r - increase/decrease hit threshold\n"
          << endl;
 
+
     use_gpu = true;
-    make_gray = args.make_gray;
-    scale = args.scale;
-    gr_threshold = args.gr_threshold;
-    nlevels = args.nlevels;
+    make_gray = cmd.get<bool>("g");
+    resize_scale = cmd.get<double>("s");
+    win_width = cmd.get<bool>("l") == true ? 64 : 48;
+    vdo_source = cmd.get<string>("v");
+    img_source = cmd.get<string>("i");
+    output = cmd.get<string>("o");
+    camera_id = cmd.get<int>("c");
 
-    if (args.hit_threshold_auto)
-        args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
-    hit_threshold = args.hit_threshold;
+    win_stride_width = 8;
+    win_stride_height = 8;
+    gr_threshold = 8;
+    nlevels = 13;
+    hit_threshold = win_width == 48 ? 1.4 : 0.;
+    scale = 1.05;
+    gamma_corr = true;
 
-    gamma_corr = args.gamma_corr;
-
-    if (args.win_width != 64 && args.win_width != 48)
-        args.win_width = 64;
-
-    cout << "Scale: " << scale << endl;
-    if (args.resize_src)
-        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
     cout << "Group threshold: " << gr_threshold << endl;
     cout << "Levels number: " << nlevels << endl;
-    cout << "Win width: " << args.win_width << endl;
-    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
+    cout << "Win width: " << win_width << endl;
+    cout << "Win stride: (" << win_stride_width << ", " << win_stride_height << ")\n";
     cout << "Hit threshold: " << hit_threshold << endl;
     cout << "Gamma correction: " << gamma_corr << endl;
     cout << endl;
 }
 
-
 void App::run()
 {
-    std::vector<ocl::Info> oclinfo;
+    vector<ocl::Info> oclinfo;
     ocl::getDevice(oclinfo);
     running = true;
-    cv::VideoWriter video_writer;
+    VideoWriter video_writer;
 
-    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(args.win_stride_width, args.win_stride_height);
+    Size win_size(win_width, win_width * 2);
+    Size win_stride(win_stride_width, win_stride_height);
 
     // Create HOG descriptors and detectors here
     vector<float> detector;
     if (win_size == Size(64, 128))
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
+        detector = ocl::HOGDescriptor::getPeopleDetector64x128();
     else
-        detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
+        detector = ocl::HOGDescriptor::getPeopleDetector48x96();
 
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
+
+    ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
+                               ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
+                               ocl::HOGDescriptor::DEFAULT_NLEVELS);
+    HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+                          HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
     gpu_hog.setSVMDetector(detector);
     cpu_hog.setSVMDetector(detector);
 
@@ -268,29 +162,29 @@ void App::run()
         VideoCapture vc;
         Mat frame;
 
-        if (args.src_is_video)
+        if (vdo_source!="")
         {
-            vc.open(args.src.c_str());
+            vc.open(vdo_source.c_str());
             if (!vc.isOpened())
-                throw runtime_error(string("can't open video file: " + args.src));
+                throw runtime_error(string("can't open video file: " + vdo_source));
             vc >> frame;
         }
-        else if (args.src_is_camera)
+        else if (camera_id != -1)
         {
-            vc.open(args.camera_id);
+            vc.open(camera_id);
             if (!vc.isOpened())
             {
                 stringstream msg;
-                msg << "can't open camera: " << args.camera_id;
+                msg << "can't open camera: " << camera_id;
                 throw runtime_error(msg.str());
             }
             vc >> frame;
         }
         else
         {
-            frame = imread(args.src);
+            frame = imread(img_source);
             if (frame.empty())
-                throw runtime_error(string("can't open image file: " + args.src));
+                throw runtime_error(string("can't open image file: " + img_source));
         }
 
         Mat img_aux, img, img_to_show;
@@ -308,13 +202,15 @@ void App::run()
             else frame.copyTo(img_aux);
 
             // Resize image
-            if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
+            if (abs(scale-1.0)>0.001)
+            {
+                Size sz((int)((double)img_aux.cols/resize_scale), (int)((double)img_aux.rows/resize_scale));
+                resize(img_aux, img, sz);
+            }
             else img = img_aux;
             img_to_show = img;
-
             gpu_hog.nlevels = nlevels;
             cpu_hog.nlevels = nlevels;
-
             vector<Rect> found;
 
             // Perform HOG classification
@@ -331,15 +227,16 @@ void App::run()
                     vector<Rect> ref_rst;
                     cvtColor(img, img, COLOR_BGRA2BGR);
                     cpu_hog.detectMultiScale(img, ref_rst, hit_threshold, win_stride,
-                                              Size(0, 0), scale, gr_threshold-2);
+                                             Size(0, 0), scale, gr_threshold-2);
                     double accuracy = checkRectSimilarity(img.size(), ref_rst, found);
-                    cout << "\naccuracy value: " << accuracy << endl;           
-                } 
-           }
+                    cout << "\naccuracy value: " << accuracy << endl;
+                }
+            }
             else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
+                                              Size(0, 0), scale, gr_threshold);
             hogWorkEnd();
 
+
             // Draw positive classified windows
             for (size_t i = 0; i < found.size(); i++)
             {
@@ -354,25 +251,31 @@ void App::run()
             putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             imshow("opencv_gpu_hog", img_to_show);
-
-            if (args.src_is_video || args.src_is_camera) vc >> frame;
+            if (vdo_source!="" || camera_id!=-1) vc >> frame;
 
             workEnd();
 
-            if (args.write_video)
+            if (output!="")
             {
-                if (!video_writer.isOpened())
+                if (img_source!="")     // write image
                 {
-                    video_writer.open(args.dst_video, VideoWriter::fourcc('x','v','i','d'), args.dst_video_fps,
-                                      img_to_show.size(), true);
-                    if (!video_writer.isOpened())
-                        throw std::runtime_error("can't create video writer");
+                    imwrite(output, img_to_show);
                 }
+                else                    //write video
+                {
+                    if (!video_writer.isOpened())
+                    {
+                        video_writer.open(output, VideoWriter::fourcc('x','v','i','d'), 24,
+                                          img_to_show.size(), true);
+                        if (!video_writer.isOpened())
+                            throw std::runtime_error("can't create video writer");
+                    }
 
-                if (make_gray) cvtColor(img_to_show, img, COLOR_GRAY2BGR);
-                else cvtColor(img_to_show, img, COLOR_BGRA2BGR);
+                    if (make_gray) cvtColor(img_to_show, img, COLOR_GRAY2BGR);
+                    else cvtColor(img_to_show, img, COLOR_BGRA2BGR);
 
-                video_writer << img;
+                    video_writer << img;
+                }
             }
 
             handleKey((char)waitKey(3));
@@ -380,7 +283,6 @@ void App::run()
     }
 }
 
-
 void App::handleKey(char key)
 {
     switch (key)
@@ -443,7 +345,10 @@ void App::handleKey(char key)
 }
 
 
-inline void App::hogWorkBegin() { hog_work_begin = getTickCount(); }
+inline void App::hogWorkBegin()
+{
+    hog_work_begin = getTickCount(); // remember the current tick count as the start of the timed interval
+}
 
 inline void App::hogWorkEnd()
 {
@@ -459,8 +364,10 @@ inline string App::hogWorkFps() const
     return ss.str();
 }
 
-
-inline void App::workBegin() { work_begin = getTickCount(); }
+inline void App::workBegin()
+{
+    work_begin = getTickCount(); // remember the current tick count as the start of the timed interval
+}
 
 inline void App::workEnd()
 {
@@ -476,8 +383,9 @@ inline string App::workFps() const
     return ss.str();
 }
 
-double App::checkRectSimilarity(Size sz, 
-                                std::vector<Rect>& ob1, 
+
+double App::checkRectSimilarity(Size sz,
+                                std::vector<Rect>& ob1,
                                 std::vector<Rect>& ob2)
 {
     double final_test_result = 0.0;
@@ -485,20 +393,26 @@ double App::checkRectSimilarity(Size sz,
     size_t sz2 = ob2.size();
 
     if(sz1 != sz2)
+    {
         return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
+    }
     else
     {
+        if(sz1==0 && sz2==0)
+            return 0;
         cv::Mat cpu_result(sz, CV_8UC1);
         cpu_result.setTo(0);
 
+
         for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {      
+        {
             cv::Mat cpu_result_roi(cpu_result, *r);
             cpu_result_roi.setTo(1);
             cpu_result.copyTo(cpu_result);
         }
         int cpu_area = cv::countNonZero(cpu_result > 0);
 
+
         cv::Mat gpu_result(sz, CV_8UC1);
         gpu_result.setTo(0);
         for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
@@ -511,10 +425,10 @@ double App::checkRectSimilarity(Size sz,
         cv::Mat result_;
         multiply(cpu_result, gpu_result, result_);
         int result = cv::countNonZero(result_ > 0);
-
-        final_test_result = 1.0 - (double)result/(double)cpu_area;
+        if(cpu_area!=0 && result!=0)
+            final_test_result = 1.0 - (double)result/(double)cpu_area;
+        else if(cpu_area==0 && result!=0)
+            final_test_result = -1;
     }
     return final_test_result;
-
 }
-
diff --git a/samples/ocl/pyrlk_optical_flow.cpp b/samples/ocl/pyrlk_optical_flow.cpp
index 3ce0edc8fc..f7d0661712 100644
--- a/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/ocl/pyrlk_optical_flow.cpp
@@ -12,19 +12,20 @@ using namespace cv;
 using namespace cv::ocl;
 
 typedef unsigned char uchar;
-#define LOOP_NUM 10 
+#define LOOP_NUM 10
 int64 work_begin = 0;
 int64 work_end = 0;
 
-static void workBegin() 
-{ 
+static void workBegin()
+{
     work_begin = getTickCount();
 }
 static void workEnd()
 {
     work_end += (getTickCount() - work_begin);
 }
-static double getTime(){
+static double getTime()
+{
     return work_end * 1000. / getTickFrequency();
 }
 
@@ -94,14 +95,15 @@ int main(int argc, const char* argv[])
     //set this to save kernel compile time from second time you run
     ocl::setBinpath("./");
     const char* keys =
-        "{ help h           | false | print help message }"
-        "{ left l           |       | specify left image }"
-        "{ right r          |       | specify right image }"
-        "{ camera c         | 0     | enable camera capturing }"
-        "{ use_cpu s        | false | use cpu or gpu to process the image }"
-        "{ video v          |       | use video as input }"
-        "{ points           | 1000  | specify points count [GoodFeatureToTrack] }"
-        "{ min_dist         | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
+        "{ help h           | false           | print help message }"
+        "{ left l           |                 | specify left image }"
+        "{ right r          |                 | specify right image }"
+        "{ camera c         | 0               | enable camera capturing }"
+        "{ use_cpu s        | false           | use cpu or gpu to process the image }"
+        "{ video v          |                 | use video as input }"
+        "{ output o         | pyrlk_output.jpg| specify output save path when input is images }"
+        "{ points           | 1000            | specify points count [GoodFeatureToTrack] }"
+        "{ min_dist         | 0               | specify minimal distance between points [GoodFeatureToTrack] }";
 
     CommandLineParser cmd(argc, argv, keys);
 
@@ -115,10 +117,10 @@ int main(int argc, const char* argv[])
     string fname0 = cmd.get<string>("left");
     string fname1 = cmd.get<string>("right");
     string vdofile = cmd.get<string>("video");
+    string outfile = cmd.get<string>("output");
     int points = cmd.get<int>("points");
     double minDist = cmd.get<double>("min_dist");
     bool useCPU = cmd.has("s");
-    bool useCamera = cmd.has("c");
     int inputName = cmd.get<int>("c");
 
     oclMat d_nextPts, d_status;
@@ -131,21 +133,9 @@ int main(int argc, const char* argv[])
     vector<unsigned char> status(points);
     vector<float> err;
 
-    if (frame0.empty() || frame1.empty())
-    {
-        useCamera = true;
-        defaultPicturesFail = true;
-        VideoCapture capture(inputName);
-        if (!capture.isOpened())
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-    }
-
     cout << "Points count : " << points << endl << endl;
 
-    if (useCamera)
+    if (frame0.empty() || frame1.empty())
     {
         VideoCapture capture;
         Mat frame, frameCopy;
@@ -238,10 +228,10 @@ _cleanup_:
     else
     {
 nocamera:
-        for(int i = 0; i <= LOOP_NUM;i ++) 
+        for(int i = 0; i <= LOOP_NUM; i ++)
         {
             cout << "loop" << i << endl;
-            if (i > 0) workBegin();     
+            if (i > 0) workBegin();
 
             if (useCPU)
             {
@@ -271,8 +261,8 @@ nocamera:
                 cout << getTime() / LOOP_NUM << " ms" << endl;
 
                 drawArrows(frame0, pts, nextPts, status, Scalar(255, 0, 0));
-
                 imshow("PyrLK [Sparse]", frame0);
+                imwrite(outfile, frame0);
             }
         }
     }
diff --git a/samples/ocl/squares.cpp b/samples/ocl/squares.cpp
index 40d60fe2cd..d31e360073 100644
--- a/samples/ocl/squares.cpp
+++ b/samples/ocl/squares.cpp
@@ -2,11 +2,11 @@
 // It loads several images sequentially and tries to find squares in
 // each image
 
-#include "opencv2/core/core.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/core/utility.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/ocl/ocl.hpp"
-
 #include <iostream>
 #include <math.h>
 #include <string.h>
@@ -14,23 +14,50 @@
 using namespace cv;
 using namespace std;
 
-static void help()
-{
-    cout <<
-        "\nA program using OCL module pyramid scaling, Canny, dilate functions, threshold, split; cpu contours, contour simpification and\n"
-        "memory storage (it's got it all folks) to find\n"
-        "squares in a list of images pic1-6.png\n"
-        "Returns sequence of squares detected on the image.\n"
-        "the sequence is stored in the specified memory storage\n"
-        "Call:\n"
-        "./squares\n"
-        "Using OpenCV version %s\n" << CV_VERSION << "\n" << endl;
-}
+#define ACCURACY_CHECK 1
 
+#if ACCURACY_CHECK
+// Returns true when set1 and set2 hold the same number of polygons, every pair has the same number
+// of points, and paired points differ by at most maxDiff in x and in y (same ordering is assumed).
+static bool checkPoints(
+    const vector< vector<Point> >& set1,
+    const vector< vector<Point> >& set2,
+    int maxDiff = 5)
+{
+    if(set1.size() != set2.size())
+    {
+        return false;
+    }
+    // sizes are equal here; both sets are walked in lock-step
+    for(vector< vector<Point> >::const_iterator it1 = set1.begin(), it2 = set2.begin();
+            it1 != set1.end() && it2 != set2.end(); ++it1, ++it2)
+    {
+        // bind by reference: the polygons are only read, so no copy is needed
+        const vector<Point>& pts1 = *it1;
+        const vector<Point>& pts2 = *it2;
+
+        if(pts1.size() != pts2.size())
+        {
+            return false;
+        }
+        for(size_t i = 0; i < pts1.size(); i ++)
+        {
+            Point pt1 = pts1[i], pt2 = pts2[i];
+            if(std::abs(pt1.x - pt2.x) > maxDiff ||
+                    std::abs(pt1.y - pt2.y) > maxDiff)
+            {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+#endif
 
 int thresh = 50, N = 11;
 const char* wndname = "OpenCL Square Detection Demo";
 
+
 // helper function:
 // finds a cosine of angle between vectors
 // from pt0->pt1 and from pt0->pt2
@@ -43,9 +70,92 @@ static double angle( Point pt1, Point pt2, Point pt0 )
     return (dx1*dx2 + dy1*dy2)/sqrt((dx1*dx1 + dy1*dy1)*(dx2*dx2 + dy2*dy2) + 1e-10);
 }
 
+
 // returns sequence of squares detected on the image.
 // the sequence is stored in the specified memory storage
 static void findSquares( const Mat& image, vector<vector<Point> >& squares )
+{
+    squares.clear();
+    Mat pyr, timg, gray0(image.size(), CV_8U), gray;
+    // pure C++ path: each color plane is binarized at N levels and every contour is tested for squareness
+    // down-scale and upscale the image to filter out the noise
+    pyrDown(image, pyr, Size(image.cols/2, image.rows/2));
+    pyrUp(pyr, timg, image.size());
+    vector<vector<Point> > contours;
+
+    // find squares in every color plane of the image
+    for( int c = 0; c < 3; c++ )
+    {
+        int ch[] = {c, 0}; // from-to pair: channel c of timg -> channel 0 of gray0
+        mixChannels(&timg, 1, &gray0, 1, ch, 1);
+
+        // try several threshold levels
+        for( int l = 0; l < N; l++ )
+        {
+            // hack: use Canny instead of zero threshold level.
+            // Canny helps to catch squares with gradient shading
+            if( l == 0 )
+            {
+                // apply Canny. Take the upper threshold from the global `thresh`
+                // and set the lower to 0 (which forces edges merging)
+                Canny(gray0, gray, 0, thresh, 5);
+                // dilate canny output to remove potential
+                // holes between edge segments
+                dilate(gray, gray, Mat(), Point(-1,-1));
+            }
+            else
+            {
+                // apply threshold if l!=0 (THRESH_BINARY):
+                //     gray(x,y) = gray0(x,y) > (l+1)*255/N ? 255 : 0
+                cv::threshold(gray0, gray, (l+1)*255/N, 255, THRESH_BINARY);
+            }
+
+            // find contours and store them all as a list
+            findContours(gray, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
+
+            vector<Point> approx;
+
+            // test each contour
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                // approximate contour with accuracy proportional
+                // to the contour perimeter
+                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
+
+                // square contours should have 4 vertices after approximation
+                // relatively large area (to filter out noisy contours)
+                // and be convex.
+                // Note: absolute value of an area is used because
+                // area may be positive or negative - in accordance with the
+                // contour orientation
+                if( approx.size() == 4 &&
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
+                {
+                    double maxCosine = 0;
+
+                    for( int j = 2; j < 5; j++ )
+                    {
+                        // find the maximum cosine of the angle between joint edges
+                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
+                        maxCosine = MAX(maxCosine, cosine);
+                    }
+
+                    // if cosines of all angles are small
+                    // (all angles are ~90 degree) then write quadrangle
+                    // vertices to resultant sequence
+                    if( maxCosine < 0.3 )
+                        squares.push_back(approx);
+                }
+            }
+        }
+    }
+}
+
+
+// returns sequence of squares detected on the image.
+// the sequence is stored in the specified memory storage
+static void findSquares_ocl( const Mat& image, vector<vector<Point> >& squares )
 {
     squares.clear();
 
@@ -91,7 +201,6 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
             findContours(gray, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
 
             vector<Point> approx;
-
             // test each contour
             for( size_t i = 0; i < contours.size(); i++ )
             {
@@ -106,11 +215,10 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
                 // area may be positive or negative - in accordance with the
                 // contour orientation
                 if( approx.size() == 4 &&
-                    fabs(contourArea(Mat(approx))) > 1000 &&
-                    isContourConvex(Mat(approx)) )
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
                 {
                     double maxCosine = 0;
-
                     for( int j = 2; j < 5; j++ )
                     {
                         // find the maximum cosine of the angle between joint edges
@@ -139,40 +247,93 @@ static void drawSquares( Mat& image, const vector<vector<Point> >& squares )
         int n = (int)squares[i].size();
         polylines(image, &p, &n, 1, true, Scalar(0,255,0), 3, LINE_AA);
     }
-
-    imshow(wndname, image);
 }
 
 
-int main(int /*argc*/, char** /*argv*/)
+// Draws the C++ squares on a left copy and the OCL squares on a right copy of image; returns both side by side.
+static Mat drawSquaresBoth( const Mat& image,
+                            const vector<vector<Point> >& sqsCPP,
+                            const vector<vector<Point> >& sqsOCL
+)
 {
+    Mat imgToShow(Size(image.cols * 2, image.rows), image.type()); // canvas: twice the input width, same height and type
+    Mat lImg = imgToShow(Rect(Point(0, 0), image.size())); // left half of the canvas
+    Mat rImg = imgToShow(Rect(Point(image.cols, 0), image.size())); // right half of the canvas
+    image.copyTo(lImg);
+    image.copyTo(rImg);
+    drawSquares(lImg, sqsCPP);
+    drawSquares(rImg, sqsOCL);
+    float fontScale = 0.8f;
+    Scalar white = Scalar::all(255), black = Scalar::all(0);
+    // each label is drawn twice at the same position: black with thickness 2, then white with thickness 1
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
+    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
+
+    return imgToShow;
+}
+
+
+int main(int argc, char** argv)
+{
+    // Runs the C++ and the OCL square detector on one input image, compares and times them, then shows/saves the result.
+    const char* keys = "{ i input  |                    | specify input image }"   // fields: names | default | help
+                       "{ o output | squares_output.jpg | specify output save path}";
+    CommandLineParser cmd(argc, argv, keys);
+    string inputName = cmd.get<string>("i");
+    string outfile = cmd.get<string>("o");
+    if(inputName.empty())
+    {
+        cout << "Available options:" << endl;
+        cmd.printMessage();
+        return 0;
+    }
 
-    //ocl::setBinpath("F:/kernel_bin");
     vector<ocl::Info> info;
     CV_Assert(ocl::getDevice(info));
-
-    static const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
-        "pic4.png", "pic5.png", "pic6.png", 0 };
-    help();
+    int iterations = 10;
     namedWindow( wndname, 1 );
-    vector<vector<Point> > squares;
+    vector<vector<Point> > squares_cpu, squares_ocl;
 
-    for( int i = 0; names[i] != 0; i++ )
+    Mat image = imread(inputName, 1);
+    if( image.empty() )
     {
-        Mat image = imread(names[i], 1);
-        if( image.empty() )
-        {
-            cout << "Couldn't load " << names[i] << endl;
-            continue;
-        }
-
-        findSquares(image, squares);
-        drawSquares(image, squares);
-
-        int c = waitKey();
-        if( (char)c == 27 )
-            break;
+        cout << "Couldn't load " << inputName << endl;
+        return -1;
     }
+    int j = iterations;
+    int64 t_ocl = 0, t_cpp = 0;
+    // untimed warm-up run of both implementations before the timed loop
+    cout << "warming up ..." << endl;
+    findSquares(image, squares_cpu);
+    findSquares_ocl(image, squares_ocl);
+
+    // the accuracy check compares the warm-up results of both implementations
+#if ACCURACY_CHECK
+    cout << "Checking ocl accuracy ... " << endl;
+    cout << (checkPoints(squares_cpu, squares_ocl) ? "Pass" : "Failed") << endl;
+#endif
+    do
+    {
+        int64 t_start = cv::getTickCount();
+        findSquares(image, squares_cpu);
+        t_cpp += cv::getTickCount() - t_start;
+
+        // same measurement for the OCL implementation
+        t_start  = cv::getTickCount();
+        findSquares_ocl(image, squares_ocl);
+        t_ocl += cv::getTickCount() - t_start;
+        cout << "run loop: " << j << endl;
+    }
+    while(--j);
+    cout << "cpp average time: " << 1000.0f * (double)t_cpp / getTickFrequency() / iterations << "ms" << endl;
+    cout << "ocl average time: " << 1000.0f * (double)t_ocl / getTickFrequency() / iterations << "ms" << endl;
+
+    Mat result = drawSquaresBoth(image, squares_cpu, squares_ocl);
+    imshow(wndname, result);
+    imwrite(outfile, result);
+    waitKey(0);
 
     return 0;
 }
diff --git a/samples/ocl/stereo_match.cpp b/samples/ocl/stereo_match.cpp
index 8cc6530d50..8737a047aa 100644
--- a/samples/ocl/stereo_match.cpp
+++ b/samples/ocl/stereo_match.cpp
@@ -12,56 +12,45 @@ using namespace cv;
 using namespace std;
 using namespace ocl;
 
-bool help_showed = false;
-
-struct Params
-{
-    Params();
-    static Params read(int argc, char** argv);
-
-    string left;
-    string right;
-
-    string method_str() const
-    {
-        switch (method)
-        {
-        case BM: return "BM";
-        case BP: return "BP";
-        case CSBP: return "CSBP";
-        }
-        return "";
-    }
-    enum {BM, BP, CSBP} method;
-    int ndisp; // Max disparity + 1
-    enum {GPU, CPU} type;
-};
-
 
 struct App
 {
-    App(const Params& p);
+    App(CommandLineParser& cmd);
     void run();
     void handleKey(char key);
     void printParams() const;
 
-    void workBegin() { work_begin = getTickCount(); }
+    void workBegin()
+    {
+        work_begin = getTickCount();
+    }
     void workEnd()
     {
         int64 d = getTickCount() - work_begin;
         double f = getTickFrequency();
         work_fps = f / d;
     }
-
+    string method_str() const
+    {
+        switch (method)
+        {
+        case BM:
+            return "BM";
+        case BP:
+            return "BP";
+        case CSBP:
+            return "CSBP";
+        }
+        return "";
+    }
     string text() const
     {
         stringstream ss;
-        ss << "(" << p.method_str() << ") FPS: " << setiosflags(ios::left)
-            << setprecision(4) << work_fps;
+        ss << "(" << method_str() << ") FPS: " << setiosflags(ios::left)
+           << setprecision(4) << work_fps;
         return ss.str();
     }
 private:
-    Params p;
     bool running;
 
     Mat left_src, right_src;
@@ -74,42 +63,45 @@ private:
 
     int64 work_begin;
     double work_fps;
-};
 
-static void printHelp()
-{
-    cout << "Usage: stereo_match_gpu\n"
-        << "\t--left <left_view> --right <right_view> # must be rectified\n"
-        << "\t--method <stereo_match_method> # BM | BP | CSBP\n"
-        << "\t--ndisp <number> # number of disparity levels\n"
-        << "\t--type <device_type> # cpu | CPU | gpu | GPU\n";
-    help_showed = true;
-}
+    string l_img, r_img;
+    string out_img;
+    enum {BM, BP, CSBP} method;
+    int ndisp; // Max disparity + 1
+    enum {GPU, CPU} type;
+};
 
 int main(int argc, char** argv)
 {
+    const char* keys =
+        "{ h | help     | false                     | print help message }"
+        "{ l | left     |                           | specify left image }"
+        "{ r | right    |                           | specify right image }"
+        "{ m | method   | BM                        | specify match method(BM/BP/CSBP) }"
+        "{ n | ndisp    | 64                        |  specify number of disparity levels }"
+        "{ s | cpu_ocl  | false                     | use cpu or gpu as ocl device to process the image }"
+        "{ o | output   | stereo_match_output.jpg   | specify output path when input is images}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Avaible options:" << endl;
+        cmd.printMessage();
+        return 0;
+    }
     try
     {
-        if (argc < 2)
-        {
-            printHelp();
-            return 1;
-        }
+        App app(cmd);
+        int flag = CVCL_DEVICE_TYPE_GPU;
+        if(cmd.get<bool>("s") == true)
+            flag = CVCL_DEVICE_TYPE_CPU;
 
-        Params args = Params::read(argc, argv);
-        if (help_showed)
-            return -1;
-
-        int flags[2] = { CVCL_DEVICE_TYPE_GPU, CVCL_DEVICE_TYPE_CPU };
         vector<Info> info;
-
-        if(getDevice(info, flags[args.type]) == 0)
+        if(getDevice(info, flag) == 0)
         {
             throw runtime_error("Error: Did not find a valid OpenCL device!");
         }
         cout << "Device name:" << info[0].DeviceName[0] << endl;
 
-        App app(args);
         app.run();
     }
     catch (const exception& e)
@@ -119,77 +111,39 @@ int main(int argc, char** argv)
     return 0;
 }
 
-
-Params::Params()
-{
-    method = BM;
-    ndisp = 64;
-    type = GPU;
-}
-
-
-Params Params::read(int argc, char** argv)
-{
-    Params p;
-
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--left") p.left = argv[++i];
-        else if (string(argv[i]) == "--right") p.right = argv[++i];
-        else if (string(argv[i]) == "--method")
-        {
-            if (string(argv[i + 1]) == "BM") p.method = BM;
-            else if (string(argv[i + 1]) == "BP") p.method = BP;
-            else if (string(argv[i + 1]) == "CSBP") p.method = CSBP;
-            else throw runtime_error("unknown stereo match method: " + string(argv[i + 1]));
-            i++;
-        }
-        else if (string(argv[i]) == "--ndisp") p.ndisp = atoi(argv[++i]);
-        else if (string(argv[i]) == "--type")
-        {
-            string t(argv[++i]);
-            if (t == "cpu" || t == "CPU")
-            {
-                p.type = CPU;
-            } 
-            else if (t == "gpu" || t == "GPU")
-            {
-                p.type = GPU;
-            }
-            else throw runtime_error("unknown device type: " + t);
-        }
-        else if (string(argv[i]) == "--help") printHelp();
-        else throw runtime_error("unknown key: " + string(argv[i]));
-    }
-
-    return p;
-}
-
-
-App::App(const Params& params)
-    : p(params), running(false)
+App::App(CommandLineParser& cmd) // prints the key controls and copies the parsed options into members
+    : running(false),method(BM) // BM is the fallback when the "m" option is not recognised below
 {
     cout << "stereo_match_ocl sample\n";
     cout << "\nControls:\n"
-        << "\tesc - exit\n"
-        << "\tp - print current parameters\n"
-        << "\tg - convert source images into gray\n"
-        << "\tm - change stereo match method\n"
-        << "\ts - change Sobel prefiltering flag (for BM only)\n"
-        << "\t1/q - increase/decrease maximum disparity\n"
-        << "\t2/w - increase/decrease window size (for BM only)\n"
-        << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
-        << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+         << "\tesc - exit\n"
+         << "\tp - print current parameters\n"
+         << "\tg - convert source images into gray\n"
+         << "\tm - change stereo match method\n"
+         << "\ts - change Sobel prefiltering flag (for BM only)\n"
+         << "\t1/q - increase/decrease maximum disparity\n"
+         << "\t2/w - increase/decrease window size (for BM only)\n"
+         << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
+         << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+    l_img = cmd.get<string>("l"); // left image path
+    r_img = cmd.get<string>("r"); // right image path
+    string mstr = cmd.get<string>("m");
+    if(mstr == "BM") method = BM;
+    else if(mstr == "BP") method = BP;
+    else if(mstr == "CSBP") method = CSBP;
+    else cout << "unknown method!\n"; // only warns; method keeps its BM initializer
+    ndisp = cmd.get<int>("n");
+    out_img = cmd.get<string>("o"); // NOTE(review): the 'type' member is not assigned anywhere in this constructor
 }
 
 
 void App::run()
 {
     // Load images
-    left_src = imread(p.left);
-    right_src = imread(p.right);
-    if (left_src.empty()) throw runtime_error("can't open file \"" + p.left + "\"");
-    if (right_src.empty()) throw runtime_error("can't open file \"" + p.right + "\"");
+    left_src = imread(l_img);
+    right_src = imread(r_img);
+    if (left_src.empty()) throw runtime_error("can't open file \"" + l_img + "\"");
+    if (right_src.empty()) throw runtime_error("can't open file \"" + r_img + "\"");
 
     cvtColor(left_src, left, COLOR_BGR2GRAY);
     cvtColor(right_src, right, COLOR_BGR2GRAY);
@@ -201,14 +155,15 @@ void App::run()
     imshow("right", right);
 
     // Set common parameters
-    bm.ndisp = p.ndisp;
-    bp.ndisp = p.ndisp;
-    csbp.ndisp = p.ndisp;
+    bm.ndisp = ndisp;
+    bp.ndisp = ndisp;
+    csbp.ndisp = ndisp;
 
     cout << endl;
     printParams();
 
     running = true;
+    bool written = false;
     while (running)
     {
 
@@ -216,9 +171,9 @@ void App::run()
         Mat disp;
         oclMat d_disp;
         workBegin();
-        switch (p.method)
+        switch (method)
         {
-        case Params::BM:
+        case BM:
             if (d_left.channels() > 1 || d_right.channels() > 1)
             {
                 cout << "BM doesn't support color images\n";
@@ -232,25 +187,27 @@ void App::run()
             }
             bm(d_left, d_right, d_disp);
             break;
-        case Params::BP:
+        case BP:
             bp(d_left, d_right, d_disp);
             break;
-        case Params::CSBP:
+        case CSBP:
             csbp(d_left, d_right, d_disp);
             break;
         }
-        ocl::finish();
-        workEnd();
-
         // Show results
         d_disp.download(disp);
-        if (p.method != Params::BM)
+        workEnd();
+        if (method != BM)
         {
             disp.convertTo(disp, 0);
         }
         putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
         imshow("disparity", disp);
-
+        if(!written)
+        {
+            imwrite(out_img, disp);
+            written = true;
+        }
         handleKey((char)waitKey(3));
     }
 }
@@ -261,19 +218,19 @@ void App::printParams() const
     cout << "--- Parameters ---\n";
     cout << "image_size: (" << left.cols << ", " << left.rows << ")\n";
     cout << "image_channels: " << left.channels() << endl;
-    cout << "method: " << p.method_str() << endl
-        << "ndisp: " << p.ndisp << endl;
-    switch (p.method)
+    cout << "method: " << method_str() << endl
+         << "ndisp: " << ndisp << endl;
+    switch (method)
     {
-    case Params::BM:
+    case BM:
         cout << "win_size: " << bm.winSize << endl;
         cout << "prefilter_sobel: " << bm.preset << endl;
         break;
-    case Params::BP:
+    case BP:
         cout << "iter_count: " << bp.iters << endl;
         cout << "level_count: " << bp.levels << endl;
         break;
-    case Params::CSBP:
+    case CSBP:
         cout << "iter_count: " << csbp.iters << endl;
         cout << "level_count: " << csbp.levels << endl;
         break;
@@ -289,11 +246,13 @@ void App::handleKey(char key)
     case 27:
         running = false;
         break;
-    case 'p': case 'P':
+    case 'p':
+    case 'P':
         printParams();
         break;
-    case 'g': case 'G':
-        if (left.channels() == 1 && p.method != Params::BM)
+    case 'g':
+    case 'G':
+        if (left.channels() == 1 && method != BM)
         {
             left = left_src;
             right = right_src;
@@ -309,23 +268,25 @@ void App::handleKey(char key)
         imshow("left", left);
         imshow("right", right);
         break;
-    case 'm': case 'M':
-        switch (p.method)
+    case 'm':
+    case 'M':
+        switch (method)
         {
-        case Params::BM:
-            p.method = Params::BP;
+        case BM:
+            method = BP;
             break;
-        case Params::BP:
-            p.method = Params::CSBP;
+        case BP:
+            method = CSBP;
             break;
-        case Params::CSBP:
-            p.method = Params::BM;
+        case CSBP:
+            method = BM;
             break;
         }
-        cout << "method: " << p.method_str() << endl;
+        cout << "method: " << method_str() << endl;
         break;
-    case 's': case 'S':
-        if (p.method == Params::BM)
+    case 's':
+    case 'S':
+        if (method == BM)
         {
             switch (bm.preset)
             {
@@ -340,76 +301,80 @@ void App::handleKey(char key)
         }
         break;
     case '1':
-        p.ndisp = p.ndisp == 1 ? 8 : p.ndisp + 8;
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+        ndisp == 1 ? ndisp = 8 : ndisp += 8;
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
         break;
-    case 'q': case 'Q':
-        p.ndisp = max(p.ndisp - 8, 1);
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
+    case 'q':
+    case 'Q':
+        ndisp = max(ndisp - 8, 1);
+        cout << "ndisp: " << ndisp << endl;
+        bm.ndisp = ndisp;
+        bp.ndisp = ndisp;
+        csbp.ndisp = ndisp;
         break;
     case '2':
-        if (p.method == Params::BM)
+        if (method == BM)
         {
             bm.winSize = min(bm.winSize + 1, 51);
             cout << "win_size: " << bm.winSize << endl;
         }
         break;
-    case 'w': case 'W':
-        if (p.method == Params::BM)
+    case 'w':
+    case 'W':
+        if (method == BM)
         {
             bm.winSize = max(bm.winSize - 1, 2);
             cout << "win_size: " << bm.winSize << endl;
         }
         break;
     case '3':
-        if (p.method == Params::BP)
+        if (method == BP)
         {
             bp.iters += 1;
             cout << "iter_count: " << bp.iters << endl;
         }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
         {
             csbp.iters += 1;
             cout << "iter_count: " << csbp.iters << endl;
         }
         break;
-    case 'e': case 'E':
-        if (p.method == Params::BP)
+    case 'e':
+    case 'E':
+        if (method == BP)
         {
             bp.iters = max(bp.iters - 1, 1);
             cout << "iter_count: " << bp.iters << endl;
         }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
         {
             csbp.iters = max(csbp.iters - 1, 1);
             cout << "iter_count: " << csbp.iters << endl;
         }
         break;
     case '4':
-        if (p.method == Params::BP)
+        if (method == BP)
         {
             bp.levels += 1;
             cout << "level_count: " << bp.levels << endl;
         }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
         {
             csbp.levels += 1;
             cout << "level_count: " << csbp.levels << endl;
         }
         break;
-    case 'r': case 'R':
-        if (p.method == Params::BP)
+    case 'r':
+    case 'R':
+        if (method == BP)
         {
             bp.levels = max(bp.levels - 1, 1);
             cout << "level_count: " << bp.levels << endl;
         }
-        else if (p.method == Params::CSBP)
+        else if (method == CSBP)
         {
             csbp.levels = max(csbp.levels - 1, 1);
             cout << "level_count: " << csbp.levels << endl;
@@ -417,5 +382,3 @@ void App::handleKey(char key)
         break;
     }
 }
-
-
diff --git a/samples/ocl/surf_matcher.cpp b/samples/ocl/surf_matcher.cpp
index e938a77c27..29619808ae 100644
--- a/samples/ocl/surf_matcher.cpp
+++ b/samples/ocl/surf_matcher.cpp
@@ -1,48 +1,3 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
 #include <iostream>
 #include <stdio.h>
 #include "opencv2/core/core.hpp"
@@ -62,14 +17,6 @@ const float GOOD_PORTION = 0.15f;
 
 namespace
 {
-void help();
-
-void help()
-{
-    std::cout << "\nThis program demonstrates using SURF_OCL features detector and descriptor extractor" << std::endl;
-    std::cout << "\nUsage:\n\tsurf_matcher --left <image1> --right <image2> [-c]" << std::endl;
-    std::cout << "\nExample:\n\tsurf_matcher --left box.png --right box_in_scene.png" << std::endl;
-}
 
 int64 work_begin = 0;
 int64 work_end = 0;
@@ -82,7 +29,8 @@ void workEnd()
 {
     work_end = getTickCount() - work_begin;
 }
-double getTime(){
+double getTime()
+{
-    return work_end /((double)getTickFrequency() * 1000.);
+    return work_end * 1000. / (double)getTickFrequency();   // ticks -> ms: the result is printed with an "ms" suffix
 }
 
@@ -125,7 +73,7 @@ Mat drawGoodMatches(
     std::sort(matches.begin(), matches.end());
     std::vector< DMatch > good_matches;
     double minDist = matches.front().distance,
-        maxDist = matches.back().distance;
+           maxDist = matches.back().distance;
 
     const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
     for( int i = 0; i < ptsPairs; i++ )
@@ -140,8 +88,8 @@ Mat drawGoodMatches(
     // drawing the results
     Mat img_matches;
     drawMatches( cpu_img1, keypoints1, cpu_img2, keypoints2,
-        good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
-        std::vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
+                 good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
+                 std::vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
 
     //-- Localize the object
     std::vector<Point2f> obj;
@@ -155,8 +103,10 @@ Mat drawGoodMatches(
     }
     //-- Get the corners from the image_1 ( the object to be "detected" )
     std::vector<Point2f> obj_corners(4);
-    obj_corners[0] = Point(0,0); obj_corners[1] = Point( cpu_img1.cols, 0 );
-    obj_corners[2] = Point( cpu_img1.cols, cpu_img1.rows ); obj_corners[3] = Point( 0, cpu_img1.rows );
+    obj_corners[0] = Point(0,0);
+    obj_corners[1] = Point( cpu_img1.cols, 0 );
+    obj_corners[2] = Point( cpu_img1.cols, cpu_img1.rows );
+    obj_corners[3] = Point( 0, cpu_img1.rows );
     std::vector<Point2f> scene_corners(4);
 
     Mat H = findHomography( obj, scene, RANSAC );
@@ -166,17 +116,17 @@ Mat drawGoodMatches(
 
     //-- Draw lines between the corners (the mapped object in the scene - image_2 )
     line( img_matches,
-        scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0),
-        Scalar( 0, 255, 0), 2, LINE_AA );
+          scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
     line( img_matches,
-        scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0),
-        Scalar( 0, 255, 0), 2, LINE_AA );
+          scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
     line( img_matches,
-        scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0),
-        Scalar( 0, 255, 0), 2, LINE_AA );
+          scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
     line( img_matches,
-        scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0),
-        Scalar( 0, 255, 0), 2, LINE_AA );
+          scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
     return img_matches;
 }
 
@@ -186,6 +136,21 @@ Mat drawGoodMatches(
 // use cpu findHomography interface to calculate the transformation matrix
 int main(int argc, char* argv[])
 {
+    const char* keys =
+        "{ help h    | false           | print help message  }"
+        "{ left l    |                 | specify left image  }"
+        "{ right r   |                 | specify right image }"
+        "{ output o  | SURF_output.jpg | specify output save path (only works in CPU or GPU only mode) }"
+        "{ use_cpu c | false           | use CPU algorithms  }"
+        "{ use_all a | false           | use both CPU and GPU algorithms}";
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.get<bool>("help"))
+    {
+        std::cout << "Avaible options:" << std::endl;
+        cmd.printMessage();
+        return 0;
+    }
+
     std::vector<cv::ocl::Info> info;
     if(cv::ocl::getDevice(info) == 0)
     {
@@ -196,54 +161,38 @@ int main(int argc, char* argv[])
 
     Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
     oclMat img1, img2;
-    bool useCPU = false;
+    bool useCPU = cmd.get<bool>("c");
     bool useGPU = false;
-    bool useALL = false;
+    bool useALL = cmd.get<bool>("a");
 
-    for (int i = 1; i < argc; ++i)
+    std::string outpath = cmd.get<std::string>("o");
+
+    cpu_img1 = imread(cmd.get<std::string>("l"));
+    CV_Assert(!cpu_img1.empty());
+    cvtColor(cpu_img1, cpu_img1_grey, COLOR_BGR2GRAY);
+    img1 = cpu_img1_grey;
+
+    cpu_img2 = imread(cmd.get<std::string>("r"));
+    CV_Assert(!cpu_img2.empty());
+    cvtColor(cpu_img2, cpu_img2_grey, COLOR_BGR2GRAY);
+    img2 = cpu_img2_grey;
+
+    if(useALL)
     {
-        if (String(argv[i]) == "--left")
-        {
-            cpu_img1 = imread(argv[++i]);
-            CV_Assert(!cpu_img1.empty());
-            cvtColor(cpu_img1, cpu_img1_grey, COLOR_BGR2GRAY);
-            img1 = cpu_img1_grey;
-        }
-        else if (String(argv[i]) == "--right")
-        {
-            cpu_img2 = imread(argv[++i]);
-            CV_Assert(!cpu_img2.empty());
-            cvtColor(cpu_img2, cpu_img2_grey, COLOR_BGR2GRAY);
-            img2 = cpu_img2_grey;
-        }
-        else if (String(argv[i]) == "-c")
-        {
-            useCPU = true;
-            useGPU = false;
-            useALL = false;
-        }else if(String(argv[i]) == "-g")
-        {
-            useGPU = true;
-            useCPU = false;
-            useALL = false;
-        }else if(String(argv[i]) == "-a")
-        {
-            useALL = true;
-            useCPU = false;
-            useGPU = false;
-        }
-        else if (String(argv[i]) == "--help")
-        {
-            help();
-            return -1;
-        }
+        useCPU = false;
+        useGPU = false;
     }
+    else if(useCPU==false && useALL==false)
+    {
+        useGPU = true;
+    }
+
     if(!useCPU)
     {
         std::cout
-            << "Device name:"
-            << info[0].DeviceName[0]
-        << std::endl;
+                << "Device name:"
+                << info[0].DeviceName[0]
+                << std::endl;
     }
     double surf_time = 0.;
 
@@ -299,7 +248,8 @@ int main(int argc, char* argv[])
 
         surf_time = getTime();
         std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
-    }else
+    }
+    else
     {
         //cpu runs
         for (int i = 0; i <= LOOP_NUM; i++)
@@ -354,7 +304,7 @@ int main(int argc, char* argv[])
             for(size_t i = 0; i < cpu_corner.size(); i++)
             {
                 if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
-                    ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
+                        ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
                 {
                     std::cout<<"Failed\n";
                     result = false;
@@ -372,12 +322,15 @@ int main(int argc, char* argv[])
     {
         namedWindow("cpu surf matches", 0);
         imshow("cpu surf matches", img_matches);
+        imwrite(outpath, img_matches);
     }
     else if(useGPU)
     {
         namedWindow("ocl surf matches", 0);
         imshow("ocl surf matches", img_matches);
-    }else
+        imwrite(outpath, img_matches);
+    }
+    else
     {
         namedWindow("cpu surf matches", 0);
         imshow("cpu surf matches", img_matches);
diff --git a/samples/ocl/tvl1_optical_flow.cpp b/samples/ocl/tvl1_optical_flow.cpp
new file mode 100644
index 0000000000..2b770e484e
--- /dev/null
+++ b/samples/ocl/tvl1_optical_flow.cpp
@@ -0,0 +1,264 @@
+#include <iostream>
+#include <vector>
+#include <iomanip>
+
+#include "opencv2/core/utility.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/video/video.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+typedef unsigned char uchar;
+#define LOOP_NUM 10      // number of timed iterations in the still-image path of main()
+int64 work_begin = 0;    // tick count captured by workBegin()
+int64 work_end = 0;      // ticks accumulated by workEnd() over all timed sections
+
+static void workBegin()
+{   // Remember the current tick count as the start of a timed section.
+    work_begin = getTickCount();
+}
+static void workEnd()
+{   // Add the ticks elapsed since work_begin was set to the running total.
+    work_end += (getTickCount() - work_begin);
+}
+static double getTime()
+{   // Convert the accumulated ticks to the millisecond figure that main() prints.
+    return work_end * 1000. / getTickFrequency();
+}
+
+template <typename T> inline T clamp (T x, T a, T b)
+{   // Limits x to [a, b]: yields a when x is not above a, b when x is not below b.
+    return !(x > a) ? a : (x < b ? x : b);
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{   // Linearly maps x, limited to [a, b], onto [c, d]; divides by (b - a), so a != b is required.
+    const T limited = clamp(x, a, b);
+    return c + ((d - c) * (limited - a)) / (b - a);
+}
+
+static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    // Pass 1: find the largest absolute flow component; starting at 1.0f keeps
+    // the mapping range [-max, max] used below from collapsing to zero width.
+    float maxDisplacement = 1.0f;
+    for (int y = 0; y < u.rows; ++y)
+    {
+        const float* uRow = u.ptr<float>(y);
+        const float* vRow = v.ptr<float>(y);
+        for (int x = 0; x < u.cols; ++x)
+        {
+            const float magnitude = max(fabsf(uRow[x]), fabsf(vRow[x]));
+            maxDisplacement = (magnitude > maxDisplacement) ? magnitude : maxDisplacement;
+        }
+    }
+
+    // Pass 2: fill a CV_8UC4 image of u's size. Channel 1 encodes -v and channel 2
+    // encodes u, each mapped from [-max, max] to [0, 255]; channel 0 is 0, channel 3 is 255.
+    flowField.create(u.size(), CV_8UC4);
+    const float lo = -maxDisplacement;
+    const float hi = maxDisplacement;
+
+    for (int y = 0; y < flowField.rows; ++y)
+    {
+        const float* uRow = u.ptr<float>(y);
+        const float* vRow = v.ptr<float>(y);
+        Vec4b* out = flowField.ptr<Vec4b>(y);
+        for (int x = 0; x < flowField.cols; ++x)
+        {
+            Vec4b& px = out[x];
+            px[0] = 0;
+            px[1] = static_cast<unsigned char> (mapValue (-vRow[x], lo, hi, 0.0f, 255.0f));
+            px[2] = static_cast<unsigned char> (mapValue ( uRow[x], lo, hi, 0.0f, 255.0f));
+            px[3] = 255;
+        }
+    }
+}
+
+
+// Entry point: computes dense TV-L1 optical flow between two frames, taken
+// either from an image pair (timed over LOOP_NUM runs) or from a camera/video.
+int main(int argc, const char* argv[])
+{
+    // Each key uses the parser's "{ name alias | default value | help text }" layout.
+    const char* keys =
+        "{ h help    | false           | print help message }"
+        "{ l left    |                 | specify left image }"
+        "{ r right   |                 | specify right image }"
+        "{ o output  | tvl1_output.jpg | specify output save path }"
+        "{ c camera  | 0               | enable camera capturing }"
+        "{ s use_cpu | false           | use cpu or gpu to process the image }"
+        "{ v video   |                 | use video as input }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.get<bool>("help"))
+    {
+        cout << "Usage: tvl1_optical_flow [options]" << endl;
+        cout << "Available options:" << endl;
+        cmd.printMessage();
+        return 0;
+    }
+
+    static std::vector<Info> ocl_info;
+    if (ocl::getDevice(ocl_info) == 0)
+    {
+        cout << "Error: Did not find a valid OpenCL device!" << endl;
+        return -1;
+    }
+    //if you want to use a non-default device, set it here
+    setDevice(ocl_info[0]);
+
+    //set this to save kernel compile time from second time you run
+    ocl::setBinpath("./");
+
+    bool defaultPicturesFail = false;
+    string fname0 = cmd.get<string>("l");
+    string fname1 = cmd.get<string>("r");
+    string vdofile = cmd.get<string>("v");
+    string outpath = cmd.get<string>("o");
+    bool useCPU = cmd.get<bool>("s");
+    bool useCamera = cmd.get<bool>("c");
+    int inputName = cmd.get<int>("c");
+
+    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
+    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+
+    Mat flow, show_flow;
+    Mat flow_vec[2];
+    if (frame0.empty() || frame1.empty())
+    {
+        // No usable image pair: fall back to the capture source opened below.
+        useCamera = true;
+        defaultPicturesFail = true;
+    }
+
+    if (useCamera)
+    {
+        VideoCapture capture;
+        Mat frame;
+        Mat frame0Gray, frame1Gray;
+        Mat ptr0, ptr1;
+
+        if(vdofile == "")
+            capture.open( inputName );
+        else
+            capture.open(vdofile.c_str());
+
+        if(!capture.isOpened())
+        {
+            if(vdofile == "")
+                cout << "Capture from CAM " << inputName << " didn't work" << endl;
+            else
+                cout << "Capture from file " << vdofile << " failed" <<endl;
+            if (defaultPicturesFail)
+            {
+                cout << "Can't load input images" << endl;
+                return -1;
+            }
+            goto nocamera;
+        }
+
+        cout << "In capture ..." << endl;
+        for(int i = 0;; i++)
+        {
+            if( !capture.read(frame) )
+                break;
+
+            // Frames alternate between the frame0 and frame1 slots so that
+            // ptr0 always refers to the previous frame and ptr1 to the newest.
+            if (i == 0)
+            {
+                frame.copyTo( frame0 );
+                cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+            }
+            else
+            {
+                if (i%2 == 1)
+                {
+                    frame.copyTo(frame1);
+                    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame0Gray;
+                    ptr1 = frame1Gray;
+                }
+                else
+                {
+                    frame.copyTo(frame0);
+                    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+                    ptr0 = frame1Gray;
+                    ptr1 = frame0Gray;
+                }
+
+                if (useCPU)
+                {
+                    alg->calc(ptr0, ptr1, flow);
+                    split(flow, flow_vec);
+                }
+                else
+                {
+                    oclMat d_flowx, d_flowy;
+                    d_alg(oclMat(ptr0), oclMat(ptr1), d_flowx, d_flowy);
+                    d_flowx.download(flow_vec[0]);
+                    d_flowy.download(flow_vec[1]);
+                }
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+            }
+
+            if( waitKey( 10 ) >= 0 )
+                goto _cleanup_;
+        }
+
+        waitKey(0);
+
+_cleanup_:
+        capture.release();
+    }
+    else
+    {
+nocamera:
+        oclMat d_flowx, d_flowy;
+        for(int i = 0; i <= LOOP_NUM; i ++)
+        {
+            cout << "loop" << i << endl;
+
+            // Iteration 0 is an untimed warm-up; iterations 1..LOOP_NUM are timed.
+            if (i > 0) workBegin();
+            if (useCPU)
+            {
+                alg->calc(frame0, frame1, flow);
+                split(flow, flow_vec);
+            }
+            else
+            {
+                d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
+                d_flowx.download(flow_vec[0]);
+                d_flowy.download(flow_vec[1]);
+            }
+            if (i > 0 && i <= LOOP_NUM)
+                workEnd();
+
+            if (i == LOOP_NUM)
+            {
+                if (useCPU)
+                    cout << "average CPU time (noCamera) : ";
+                else
+                    cout << "average GPU time (noCamera) : ";
+                cout << getTime() / LOOP_NUM << " ms" << endl;
+
+                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                imshow("PyrLK [Sparse]", show_flow);
+                imwrite(outpath, show_flow);
+            }
+        }
+    }
+
+    waitKey();
+
+    return 0;
+}
\ No newline at end of file