diff --git a/CMakeLists.txt b/CMakeLists.txt index cc45f6f394..c79ad2be6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,6 +76,10 @@ if(POLICY CMP0022) cmake_policy(SET CMP0022 OLD) endif() +if(POLICY CMP0023) + cmake_policy(SET CMP0023 NEW) +endif() + if(POLICY CMP0026) # silence cmake 3.0+ warnings about reading LOCATION attribute cmake_policy(SET CMP0026 OLD) diff --git a/apps/annotation/opencv_annotation.cpp b/apps/annotation/opencv_annotation.cpp index febe9fc95d..5d1eedeccd 100644 --- a/apps/annotation/opencv_annotation.cpp +++ b/apps/annotation/opencv_annotation.cpp @@ -59,12 +59,7 @@ Adapted by: Puttemans Steven - April 2016 - Vectorize the process to enable bett #include #include - -#if defined(_WIN32) - #include -#else - #include -#endif +#include using namespace std; using namespace cv; @@ -249,34 +244,20 @@ int main( int argc, const char** argv ) int resizeFactor = parser.get("resizeFactor"); int const maxWindowHeight = parser.get("maxWindowHeight") > 0 ? parser.get("maxWindowHeight") : -1; - // Check if the folder actually exists - // If -1 is returned then the folder actually exists, and thus you can continue - // In all other cases there was a folder creation and thus the folder did not exist - #if defined(_WIN32) - if(_mkdir(image_folder.c_str()) != -1){ - // Generate an error message - cerr << "The image folder given does not exist. Please check again!" << endl; - // Remove the created folder again, to ensure a second run with same code fails again - _rmdir(image_folder.c_str()); - return 0; - } - #else - if(mkdir(image_folder.c_str(), 0777) != -1){ - // Generate an error message - cerr << "The image folder given does not exist. Please check again!" 
<< endl; - // Remove the created folder again, to ensure a second run with same code fails again - remove(image_folder.c_str()); - return 0; - } - #endif - // Start by processing the data // Return the image filenames inside the image folder - vector< vector > annotations; + map< String, vector > annotations; vector filenames; String folder(image_folder); glob(folder, filenames); + // Add key tips on how to use the software when running it + cout << "* mark rectangles with the left mouse button," << endl; + cout << "* press 'c' to accept a selection," << endl; + cout << "* press 'd' to delete the latest selection," << endl; + cout << "* press 'n' to proceed with next image," << endl; + cout << "* press 'esc' to stop." << endl; + // Loop through each image stored in the images folder // Create and temporarily store the annotations // At the end write everything to the annotations file @@ -306,7 +287,7 @@ int main( int argc, const char** argv ) current_annotations[j].height = current_annotations[j].height * resizeFactor; } } - annotations.push_back(current_annotations); + annotations[filenames[i]] = current_annotations; // Check if the ESC key was hit, then exit earlier then expected if(stop){ @@ -323,10 +304,11 @@ int main( int argc, const char** argv ) } // Store the annotations, write to the output file - for(int i = 0; i < (int)annotations.size(); i++){ - output << filenames[i] << " " << annotations[i].size(); - for(int j=0; j < (int)annotations[i].size(); j++){ - Rect temp = annotations[i][j]; + for(map >::iterator it = annotations.begin(); it != annotations.end(); it++){ + vector &anno = it->second; + output << it->first << " " << anno.size(); + for(size_t j=0; j < anno.size(); j++){ + Rect temp = anno[j]; output << " " << temp.x << " " << temp.y << " " << temp.width << " " << temp.height; } output << endl; diff --git a/cmake/FindCUDA/run_nvcc.cmake b/cmake/FindCUDA/run_nvcc.cmake index abdd3079e1..8d1ceb1663 100644 --- a/cmake/FindCUDA/run_nvcc.cmake +++ 
b/cmake/FindCUDA/run_nvcc.cmake @@ -72,7 +72,7 @@ set(generated_cubin_file_internal "@generated_cubin_file@") # path set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list @CUDA_NVCC_FLAGS_CONFIG@ -set(nvcc_flags @nvcc_flags@) # list +set(nvcc_flags "@nvcc_flags@") # list set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly). set(format_flag "@format_flag@") # string diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake index 8ecf4d82bd..e7dac80275 100644 --- a/cmake/OpenCVDetectCXXCompiler.cmake +++ b/cmake/OpenCVDetectCXXCompiler.cmake @@ -172,3 +172,13 @@ elseif(MINGW) set(OpenCV_ARCH x86) endif() endif() + +# Fix handling of duplicated files in the same static library: +# https://public.kitware.com/Bug/view.php?id=14874 +if(CMAKE_VERSION VERSION_LESS "3.1") + foreach(var CMAKE_C_ARCHIVE_APPEND CMAKE_CXX_ARCHIVE_APPEND) + if(${var} MATCHES "^ r") + string(REPLACE " r" " q" ${var} "${${var}}") + endif() + endforeach() +endif() diff --git a/cmake/OpenCVFindLAPACK.cmake b/cmake/OpenCVFindLAPACK.cmake index dfacf24328..09cd66cdae 100644 --- a/cmake/OpenCVFindLAPACK.cmake +++ b/cmake/OpenCVFindLAPACK.cmake @@ -1,78 +1,160 @@ -macro(_find_file_in_dirs VAR NAME DIRS) - find_path(${VAR} ${NAME} ${DIRS} NO_DEFAULT_PATH) - set(${VAR} ${${VAR}}/${NAME}) - unset(${VAR} CACHE) +macro(_find_header_file_in_dirs VAR NAME) + unset(${VAR}) + unset(${VAR} CACHE) + if(" ${ARGN}" STREQUAL " ") + check_include_file("${NAME}" HAVE_${VAR}) + if(HAVE_${VAR}) + set(${VAR} "${NAME}") # fallback + else() + set(${VAR} "") + endif() + else() + find_path(${VAR} "${NAME}" ${ARGN} NO_DEFAULT_PATH) + if(${VAR}) + set(${VAR} "${${VAR}}/${NAME}") + unset(${VAR} CACHE) + else() + unset(${VAR} CACHE) + set(${VAR} "") + endif() + endif() +endmacro() + +macro(ocv_lapack_check) + string(REGEX REPLACE "[^a-zA-Z0-9_]" "_" _lapack_impl 
"${LAPACK_IMPL}") + message(STATUS "LAPACK(${LAPACK_IMPL}): LAPACK_LIBRARIES: ${LAPACK_LIBRARIES}") + _find_header_file_in_dirs(OPENCV_CBLAS_H_PATH_${_lapack_impl} "${LAPACK_CBLAS_H}" "${LAPACK_INCLUDE_DIR}") + _find_header_file_in_dirs(OPENCV_LAPACKE_H_PATH_${_lapack_impl} "${LAPACK_LAPACKE_H}" "${LAPACK_INCLUDE_DIR}") + if(NOT OPENCV_CBLAS_H_PATH_${_lapack_impl} OR NOT OPENCV_LAPACKE_H_PATH_${_lapack_impl}) + message(WARNING "LAPACK(${LAPACK_IMPL}): CBLAS/LAPACK headers are not found in '${LAPACK_INCLUDE_DIR}'") + unset(LAPACK_LIBRARIES) + else() + # adding proxy opencv_lapack.h header + set(CBLAS_H_PROXY_PATH ${CMAKE_BINARY_DIR}/opencv_lapack.h) + set(_lapack_include_str "\#include \"${OPENCV_CBLAS_H_PATH_${_lapack_impl}}\"") + if(NOT "${OPENCV_CBLAS_H_PATH_${_lapack_impl}}" STREQUAL "${OPENCV_LAPACKE_H_PATH_${_lapack_impl}}") + set(_lapack_include_str "${_lapack_include_str}\n#include \"${OPENCV_LAPACKE_H_PATH_${_lapack_impl}}\"") + endif() + # update file contents (if required) + set(__content_str "") + if(EXISTS "${CBLAS_H_PROXY_PATH}") + file(READ "${CBLAS_H_PROXY_PATH}" __content_str) + endif() + if(NOT " ${__content_str}" STREQUAL " ${_lapack_include_str}") + file(WRITE "${CBLAS_H_PROXY_PATH}" "${_lapack_include_str}") + endif() + + try_compile(__VALID_LAPACK + "${OpenCV_BINARY_DIR}" + "${OpenCV_SOURCE_DIR}/cmake/checks/lapack_check.cpp" + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LAPACK_INCLUDE_DIR}\;${CMAKE_BINARY_DIR}" + "-DLINK_DIRECTORIES:STRING=${LAPACK_LINK_LIBRARIES}" + "-DLINK_LIBRARIES:STRING=${LAPACK_LIBRARIES}" + OUTPUT_VARIABLE TRY_OUT + ) + if(NOT __VALID_LAPACK) + #message(FATAL_ERROR "LAPACK: check build log:\n${TRY_OUT}") + message(STATUS "LAPACK(${LAPACK_IMPL}): Can't build LAPACK check code. 
This LAPACK version is not supported.") + unset(LAPACK_LIBRARIES) + else() + message(STATUS "LAPACK(${LAPACK_IMPL}): Support is enabled.") + ocv_include_directories(${LAPACK_INCLUDE_DIR}) + set(HAVE_LAPACK 1) + endif() + endif() endmacro() if(WITH_LAPACK) - ocv_update(LAPACK_IMPL "Unknown") - if(NOT LAPACK_LIBRARIES) - include(cmake/OpenCVFindMKL.cmake) - if(HAVE_MKL) - set(LAPACK_INCLUDE_DIR ${MKL_INCLUDE_DIRS}) - set(LAPACK_LIBRARIES ${MKL_LIBRARIES} ) - set(LAPACK_CBLAS_H "mkl_cblas.h" ) - set(LAPACK_LAPACKE_H "mkl_lapack.h" ) - set(LAPACK_IMPL "MKL") - endif() + ocv_update(LAPACK_IMPL "Unknown") + if(NOT OPENCV_LAPACK_FIND_PACKAGE_ONLY) + if(NOT LAPACK_LIBRARIES AND NOT OPENCV_LAPACK_DISABLE_MKL) + include(cmake/OpenCVFindMKL.cmake) + if(HAVE_MKL) + set(LAPACK_INCLUDE_DIR ${MKL_INCLUDE_DIRS}) + set(LAPACK_LIBRARIES ${MKL_LIBRARIES}) + set(LAPACK_CBLAS_H "mkl_cblas.h") + set(LAPACK_LAPACKE_H "mkl_lapack.h") + set(LAPACK_IMPL "MKL") + ocv_lapack_check() + endif() endif() if(NOT LAPACK_LIBRARIES) - include(cmake/OpenCVFindOpenBLAS.cmake) - if(OpenBLAS_FOUND) - set(LAPACK_INCLUDE_DIR ${OpenBLAS_INCLUDE_DIR} ) - set(LAPACK_LIBRARIES ${OpenBLAS_LIB} ) - set(LAPACK_CBLAS_H "cblas.h" ) - set(LAPACK_LAPACKE_H "lapacke.h" ) - set(LAPACK_IMPL "OpenBLAS") - endif() + include(cmake/OpenCVFindOpenBLAS.cmake) + if(OpenBLAS_FOUND) + set(LAPACK_INCLUDE_DIR ${OpenBLAS_INCLUDE_DIR}) + set(LAPACK_LIBRARIES ${OpenBLAS_LIB}) + set(LAPACK_CBLAS_H "cblas.h") + set(LAPACK_LAPACKE_H "lapacke.h") + set(LAPACK_IMPL "OpenBLAS") + ocv_lapack_check() + endif() endif() if(NOT LAPACK_LIBRARIES AND UNIX) - include(cmake/OpenCVFindAtlas.cmake) - if(ATLAS_FOUND) - set(LAPACK_INCLUDE_DIR ${Atlas_INCLUDE_DIR}) - set(LAPACK_LIBRARIES ${Atlas_LIBRARIES} ) - set(LAPACK_CBLAS_H "cblas.h" ) - set(LAPACK_LAPACKE_H "lapacke.h" ) - set(LAPACK_IMPL "Atlas") - endif() + include(cmake/OpenCVFindAtlas.cmake) + if(ATLAS_FOUND) + set(LAPACK_INCLUDE_DIR ${Atlas_INCLUDE_DIR}) + set(LAPACK_LIBRARIES 
${Atlas_LIBRARIES}) + set(LAPACK_CBLAS_H "cblas.h") + set(LAPACK_LAPACKE_H "lapacke.h") + set(LAPACK_IMPL "Atlas") + ocv_lapack_check() + endif() endif() + endif() - if(NOT LAPACK_LIBRARIES AND APPLE) - set(LAPACK_INCLUDE_DIR "Accelerate") - set(LAPACK_LIBRARIES "-framework Accelerate") - set(LAPACK_CBLAS_H "cblas.h" ) - set(LAPACK_LAPACKE_H "lapacke.h" ) - set(LAPACK_IMPL "Apple") + if(NOT LAPACK_LIBRARIES) + if(WIN32 AND NOT OPENCV_LAPACK_SHARED_LIBS) + set(BLA_STATIC 1) endif() - - set(LAPACK_INCLUDE_DIR ${LAPACK_INCLUDE_DIR} CACHE PATH "Path to BLAS include dir" FORCE) - set(LAPACK_CBLAS_H ${LAPACK_CBLAS_H} CACHE STRING "Alternative name of cblas.h" FORCE) - set(LAPACK_LAPACKE_H ${LAPACK_LAPACKE_H} CACHE STRING "Alternative name of lapacke.h" FORCE) - set(LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Names of BLAS & LAPACK binaries (.so, .dll, .a, .lib)" FORCE) - set(LAPACK_IMPL ${LAPACK_IMPL} CACHE STRING "Lapack implementation id" FORCE) - - if(LAPACK_LIBRARIES) #adding proxy cblas.h header - message(STATUS "LAPACK_IMPL: ${LAPACK_IMPL}, LAPACK_LIBRARIES: ${LAPACK_LIBRARIES}") - if("${LAPACK_IMPL}" STREQUAL "Apple") - set(CBLAS_H_PATH "Accelerate/Accelerate.h") - set(LAPACKE_H_PATH "Accelerate/Accelerate.h") - else() - _find_file_in_dirs(CBLAS_H_PATH "${LAPACK_CBLAS_H}" "${LAPACK_INCLUDE_DIR}") - _find_file_in_dirs(LAPACKE_H_PATH "${LAPACK_LAPACKE_H}" "${LAPACK_INCLUDE_DIR}") - endif() - if(NOT CBLAS_H_PATH OR NOT LAPACKE_H_PATH) - message(WARNING "CBLAS/LAPACK headers are not found in '${LAPACK_INCLUDE_DIR}'") - endif() - ocv_include_directories(${LAPACK_INCLUDE_DIR}) - list(APPEND OPENCV_LINKER_LIBS ${LAPACK_LIBRARIES}) - set(HAVE_LAPACK 1) - - set(CBLAS_H_PROXY_PATH ${CMAKE_BINARY_DIR}/opencv_lapack.h) - set(_include_str "\#include \"${CBLAS_H_PATH}\"") - if("${CBLAS_H_PATH}" STREQUAL "${LAPACKE_H_PATH}") - else() - set(_include_str "${_include_str}\n\#include \"${LAPACKE_H_PATH}\"") - endif() - file(WRITE ${CBLAS_H_PROXY_PATH} ${_include_str}) + 
find_package(LAPACK) + if(LAPACK_FOUND) + if(NOT DEFINED LAPACKE_INCLUDE_DIR) + find_path(LAPACKE_INCLUDE_DIR "lapacke.h") + endif() + if(NOT DEFINED MKL_LAPACKE_INCLUDE_DIR) + find_path(MKL_LAPACKE_INCLUDE_DIR "mkl_lapack.h") + endif() + if(MKL_LAPACKE_INCLUDE_DIR AND NOT OPENCV_LAPACK_DISABLE_MKL) + set(LAPACK_INCLUDE_DIR ${MKL_LAPACKE_INCLUDE_DIR}) + set(LAPACK_CBLAS_H "mkl_cblas.h") + set(LAPACK_LAPACKE_H "mkl_lapack.h") + set(LAPACK_IMPL "LAPACK/MKL") + ocv_lapack_check() + endif() + if(LAPACKE_INCLUDE_DIR AND NOT HAVE_LAPACK) + set(LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDE_DIR}) + set(LAPACK_CBLAS_H "cblas.h") + set(LAPACK_LAPACKE_H "lapacke.h") + set(LAPACK_IMPL "LAPACK/Generic") + ocv_lapack_check() + elseif(APPLE) + set(LAPACK_CBLAS_H "Accelerate/Accelerate.h") + set(LAPACK_LAPACKE_H "Accelerate/Accelerate.h") + set(LAPACK_IMPL "LAPACK/Apple") + ocv_lapack_check() + else() + unset(LAPACK_LIBRARIES) + unset(LAPACK_LIBRARIES CACHE) + endif() endif() + endif() + + if(NOT LAPACK_LIBRARIES AND APPLE AND NOT OPENCV_LAPACK_FIND_PACKAGE_ONLY) + set(LAPACK_INCLUDE_DIR "") + set(LAPACK_LIBRARIES "-framework Accelerate") + set(LAPACK_CBLAS_H "Accelerate/Accelerate.h") + set(LAPACK_LAPACKE_H "Accelerate/Accelerate.h") + set(LAPACK_IMPL "Apple") + ocv_lapack_check() + endif() + + if(NOT HAVE_LAPACK AND LAPACK_LIBRARIES AND LAPACK_CBLAS_H AND LAPACK_LAPACKE_H) + ocv_lapack_check() + endif() + + set(LAPACK_INCLUDE_DIR ${LAPACK_INCLUDE_DIR} CACHE PATH "Path to BLAS include dir" FORCE) + set(LAPACK_CBLAS_H ${LAPACK_CBLAS_H} CACHE STRING "Alternative name of cblas.h" FORCE) + set(LAPACK_LAPACKE_H ${LAPACK_LAPACKE_H} CACHE STRING "Alternative name of lapacke.h" FORCE) + set(LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Names of BLAS & LAPACK binaries (.so, .dll, .a, .lib)" FORCE) + set(LAPACK_IMPL ${LAPACK_IMPL} CACHE STRING "Lapack implementation id" FORCE) endif() diff --git a/cmake/OpenCVFindMKL.cmake b/cmake/OpenCVFindMKL.cmake index f43ce9c286..0f5462acd2 100644 --- 
a/cmake/OpenCVFindMKL.cmake +++ b/cmake/OpenCVFindMKL.cmake @@ -20,10 +20,8 @@ macro (mkl_find_lib VAR NAME DIRS) endmacro() macro(mkl_fail) - set(HAVE_MKL OFF CACHE BOOL "True if MKL found") + set(HAVE_MKL OFF) set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory") - unset(MKL_INCLUDE_DIRS CACHE) - unset(MKL_LIBRARIES CACHE) return() endmacro() @@ -64,11 +62,16 @@ if(NOT MKL_ROOT_DIR OR NOT EXISTS ${MKL_ROOT_DIR}/include/mkl.h) find_path(MKL_ROOT_DIR include/mkl.h PATHS ${mkl_root_paths}) endif() -if(NOT MKL_ROOT_DIR) +set(MKL_INCLUDE_DIRS ${MKL_ROOT_DIR}/include CACHE PATH "Path to MKL include directory") + +if(NOT MKL_ROOT_DIR + OR NOT EXISTS "${MKL_ROOT_DIR}" + OR NOT EXISTS "${MKL_INCLUDE_DIRS}" + OR NOT EXISTS "${MKL_INCLUDE_DIRS}/mkl_version.h" +) mkl_fail() endif() -set(MKL_INCLUDE_DIRS ${MKL_ROOT_DIR}/include) get_mkl_version(${MKL_INCLUDE_DIRS}/mkl_version.h) #determine arch @@ -79,12 +82,13 @@ if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8) include(CheckTypeSize) CHECK_TYPE_SIZE(int _sizeof_int) if (_sizeof_int EQUAL 4) - set(MKL_LP64 "lp64") + set(MKL_ARCH_SUFFIX "lp64") else() - set(MKL_LP64 "ilp64") + set(MKL_ARCH_SUFFIX "ilp64") endif() else() set(MKL_ARCH "ia32") + set(MKL_ARCH_SUFFIX "c") endif() if(${MKL_VERSION_STR} VERSION_GREATER "11.3.0" OR ${MKL_VERSION_STR} VERSION_EQUAL "11.3.0") @@ -94,7 +98,7 @@ if(${MKL_VERSION_STR} VERSION_GREATER "11.3.0" OR ${MKL_VERSION_STR} VERSION_EQU set(mkl_lib_list mkl_core - mkl_intel_${MKL_LP64}) + mkl_intel_${MKL_ARCH_SUFFIX}) if(MKL_WITH_TBB) list(APPEND mkl_lib_list mkl_tbb_thread tbb) @@ -112,7 +116,6 @@ else() mkl_fail() endif() - set(MKL_LIBRARIES "") foreach(lib ${mkl_lib_list}) find_library(${lib} ${lib} ${mkl_lib_find_paths}) @@ -124,13 +127,11 @@ foreach(lib ${mkl_lib_list}) endforeach() message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}") -set(HAVE_MKL ON CACHE BOOL "True if MKL found") +set(HAVE_MKL ON) set(MKL_ROOT_DIR ${MKL_ROOT_DIR} CACHE PATH "Path to MKL directory") 
set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIRS} CACHE PATH "Path to MKL include directory") -if(NOT UNIX) - set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE FILEPATH "MKL libarries") -else() +set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE STRING "MKL libraries") +if(UNIX AND NOT MKL_LIBRARIES_DONT_HACK) #it's ugly but helps to avoid cyclic lib problem set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl") - set(MKL_LIBRARIES ${MKL_LIBRARIES} CACHE STRING "MKL libarries") -endif() \ No newline at end of file +endif() diff --git a/cmake/OpenCVGenABI.cmake b/cmake/OpenCVGenABI.cmake index 86aa78b17b..e9d6a33481 100644 --- a/cmake/OpenCVGenABI.cmake +++ b/cmake/OpenCVGenABI.cmake @@ -33,10 +33,12 @@ foreach(mod ${OPENCV_MODULES_BUILD}) list(APPEND OPENCV_ABI_SKIP_HEADERS "${h}") endforeach() # libraries - set(lib_name "") - get_target_property(lib_name opencv_${mod} LOCATION) - get_filename_component(lib_name "${lib_name}" NAME) - list(APPEND OPENCV_ABI_SKIP_LIBRARIES "${lib_name}") + if(TARGET opencv_${mod}) # opencv_world + set(lib_name "") + get_target_property(lib_name opencv_${mod} LOCATION) + get_filename_component(lib_name "${lib_name}" NAME) + list(APPEND OPENCV_ABI_SKIP_LIBRARIES "${lib_name}") + endif() endif() endforeach() string(REPLACE ";" "\n " OPENCV_ABI_SKIP_HEADERS "${OPENCV_ABI_SKIP_HEADERS}") diff --git a/cmake/OpenCVGenInfoPlist.cmake b/cmake/OpenCVGenInfoPlist.cmake index 2b78ae1e53..6dbdc5b9e7 100644 --- a/cmake/OpenCVGenInfoPlist.cmake +++ b/cmake/OpenCVGenInfoPlist.cmake @@ -2,8 +2,13 @@ set(OPENCV_APPLE_BUNDLE_NAME "OpenCV") set(OPENCV_APPLE_BUNDLE_ID "org.opencv") if(IOS) - configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in" - "${CMAKE_BINARY_DIR}/ios/Info.plist") + if (APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.Dynamic.plist.in" + "${CMAKE_BINARY_DIR}/ios/Info.plist") + else() + configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in" +
"${CMAKE_BINARY_DIR}/ios/Info.plist") + endif() elseif(APPLE) configure_file("${OpenCV_SOURCE_DIR}/platforms/osx/Info.plist.in" "${CMAKE_BINARY_DIR}/osx/Info.plist") diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 742a287ec9..6e6fe16c32 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -182,7 +182,7 @@ macro(ocv_add_module _name) # add self to the world dependencies if((NOT DEFINED OPENCV_MODULE_IS_PART_OF_WORLD AND NOT OPENCV_MODULE_${the_module}_CLASS STREQUAL "BINDINGS" - AND NOT OPENCV_PROCESSING_EXTRA_MODULES + AND (NOT OPENCV_PROCESSING_EXTRA_MODULES OR NOT OPENCV_WORLD_EXCLUDE_EXTRA_MODULES) AND (NOT BUILD_SHARED_LIBS OR NOT "x${OPENCV_MODULE_TYPE}" STREQUAL "xSTATIC")) OR OPENCV_MODULE_IS_PART_OF_WORLD ) @@ -224,12 +224,16 @@ macro(ocv_add_module _name) endif() if((NOT OPENCV_MODULE_${the_module}_IS_PART_OF_WORLD AND NOT ${the_module} STREQUAL opencv_world) OR NOT ${BUILD_opencv_world}) project(${the_module}) + add_definitions( + -D_USE_MATH_DEFINES # M_PI constant in MSVS + -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS # to use C libraries from C++ code (ffmpeg) + ) endif() endif() endmacro() # excludes module from current configuration -macro(ocv_module_disable module) +macro(ocv_module_disable_ module) set(__modname ${module}) if(NOT __modname MATCHES "^opencv_") set(__modname opencv_${module}) @@ -242,9 +246,12 @@ macro(ocv_module_disable module) # touch variable controlling build of the module to suppress "unused variable" CMake warning endif() unset(__modname) - return() # leave the current folder endmacro() +macro(ocv_module_disable module) + ocv_module_disable_(${module}) + return() # leave the current folder +endmacro() # collect modules from specified directories # NB: must be called only once! 
@@ -720,8 +727,10 @@ endmacro() # ocv_create_module() # ocv_create_module() macro(ocv_create_module) - ocv_debug_message("ocv_create_module(" ${ARGN} ")") - set(OPENCV_MODULE_${the_module}_LINK_DEPS "${OPENCV_MODULE_${the_module}_LINK_DEPS};${ARGN}" CACHE INTERNAL "") + ocv_debug_message("${the_module}: ocv_create_module(" ${ARGN} ")") + if(NOT " ${ARGN}" STREQUAL " ") + set(OPENCV_MODULE_${the_module}_LINK_DEPS "${OPENCV_MODULE_${the_module}_LINK_DEPS};${ARGN}" CACHE INTERNAL "") + endif() if(${BUILD_opencv_world} AND OPENCV_MODULE_${the_module}_IS_PART_OF_WORLD) # nothing set(the_module_target opencv_world) diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake index 90437cb204..29f21d8015 100644 --- a/cmake/OpenCVPCHSupport.cmake +++ b/cmake/OpenCVPCHSupport.cmake @@ -303,9 +303,11 @@ ENDMACRO(ADD_PRECOMPILED_HEADER) MACRO(GET_NATIVE_PRECOMPILED_HEADER _targetName _input) + if(ENABLE_PRECOMPILED_HEADERS) if(CMAKE_GENERATOR MATCHES "^Visual.*$") set(${_targetName}_pch ${CMAKE_CURRENT_BINARY_DIR}/${_targetName}_pch.cpp) endif() + endif() ENDMACRO(GET_NATIVE_PRECOMPILED_HEADER) diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index cdf257d5fe..474f7db609 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -668,7 +668,11 @@ endfunction() # add install command function(ocv_install_target) - install(TARGETS ${ARGN}) + if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + install(TARGETS ${ARGN} FRAMEWORK DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH}) + else() + install(TARGETS ${ARGN}) + endif() set(isPackage 0) unset(__package) @@ -883,7 +887,14 @@ function(ocv_target_link_libraries target) if(";${LINK_DEPS};" MATCHES ";${target};") list(REMOVE_ITEM LINK_DEPS "${target}") # prevent "link to itself" warning (world problem) endif() - target_link_libraries(${target} ${LINK_DEPS}) + if(NOT TARGET ${target}) + if(NOT DEFINED OPENCV_MODULE_${target}_LOCATION) + message(FATAL_ERROR "ocv_target_link_libraries: invalid target: '${target}'") 
+ endif() + set(OPENCV_MODULE_${target}_LINK_DEPS ${OPENCV_MODULE_${target}_LINK_DEPS} ${LINK_DEPS} CACHE INTERNAL "" FORCE) + else() + target_link_libraries(${target} ${LINK_DEPS}) + endif() endfunction() function(_ocv_append_target_includes target) @@ -931,6 +942,29 @@ function(ocv_add_library target) unset(sources) endif() + if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + message(STATUS "Setting Apple target properties for ${target}") + + set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG 1) + + set_target_properties(${target} PROPERTIES + FRAMEWORK TRUE + MACOSX_FRAMEWORK_IDENTIFIER org.opencv + MACOSX_FRAMEWORK_INFO_PLIST ${CMAKE_BINARY_DIR}/ios/Info.plist + # "current version" in semantic format in Mach-O binary file + VERSION ${OPENCV_LIBVERSION} + # "compatibility version" in semantic format in Mach-O binary file + SOVERSION ${OPENCV_LIBVERSION} + INSTALL_RPATH "" + INSTALL_NAME_DIR "@rpath" + BUILD_WITH_INSTALL_RPATH 1 + LIBRARY_OUTPUT_NAME "opencv2" + XCODE_ATTRIBUTE_TARGETED_DEVICE_FAMILY "1,2" + #PUBLIC_HEADER "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" + #XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "iPhone Developer" + ) + endif() + _ocv_append_target_includes(${target}) endfunction() diff --git a/cmake/checks/lapack_check.cpp b/cmake/checks/lapack_check.cpp new file mode 100644 index 0000000000..0457c44d68 --- /dev/null +++ b/cmake/checks/lapack_check.cpp @@ -0,0 +1,14 @@ +#include "opencv_lapack.h" + +static char* check_fn1 = (char*)sgesv_; +static char* check_fn2 = (char*)sposv_; +static char* check_fn3 = (char*)spotrf_; +static char* check_fn4 = (char*)sgesdd_; + +int main(int argc, char* argv[]) +{ + (void)argv; + if(argc > 1000) + return check_fn1[0] + check_fn2[0] + check_fn3[0] + check_fn4[0]; + return 0; +} diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 2e4ac7066a..9414b66f09 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -41,7 +41,7 @@ ALIASES += next_tutorial{1}="**Next Tutorial:** \ref \1 \n" ALIASES += youtube{1}="@htmlonly[block]
@endhtmlonly" TCL_SUBST = OPTIMIZE_OUTPUT_FOR_C = NO -OPTIMIZE_OUTPUT_JAVA = YES +OPTIMIZE_OUTPUT_JAVA = NO OPTIMIZE_FOR_FORTRAN = NO OPTIMIZE_OUTPUT_VHDL = NO EXTENSION_MAPPING = @@ -106,7 +106,7 @@ FILE_PATTERNS = RECURSIVE = YES EXCLUDE = EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = *.inl.hpp *.impl.hpp *_detail.hpp */cudev/**/detail/*.hpp +EXCLUDE_PATTERNS = *.inl.hpp *.impl.hpp *_detail.hpp */cudev/**/detail/*.hpp *.m EXCLUDE_SYMBOLS = cv::DataType<*> int void EXAMPLE_PATH = @CMAKE_DOXYGEN_EXAMPLE_PATH@ EXAMPLE_PATTERNS = * @@ -251,7 +251,7 @@ PREDEFINED = __cplusplus=1 \ CV_DEFAULT(x)=" = x" \ CV_NEON=1 \ CV_SSE2=1 \ - FLANN_DEPRECATED= + CV_DEPRECATED= EXPAND_AS_DEFINED = SKIP_FUNCTION_MACROS = YES TAGFILES = diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index 149f36f520..b2675719c9 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -17,7 +17,6 @@ - diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py index 85b3ea4955..34512bad27 100755 --- a/doc/pattern_tools/gen_pattern.py +++ b/doc/pattern_tools/gen_pattern.py @@ -70,9 +70,9 @@ def main(): opts, args = getopt.getopt(sys.argv[1:], "Ho:c:r:T:u:s:R:w:h:a:", ["help","output=","columns=","rows=", "type=","units=","square_size=","radius_rate=", "page_width=","page_height=", "page_size="]) - except getopt.error, msg: - print msg - print "for help use --help" + except getopt.error as msg: + print(msg) + print("for help use --help") sys.exit(2) output = "out.svg" columns = 8 @@ -89,7 +89,7 @@ def main(): # process options for o, a in opts: if o in ("-H", "--help"): - print __doc__ + print(__doc__) sys.exit(0) elif o in ("-r", "--rows"): rows = int(a) diff --git a/doc/pattern_tools/svgfig.py b/doc/pattern_tools/svgfig.py index c690c9ff08..5e1b1df45d 100755 --- a/doc/pattern_tools/svgfig.py +++ b/doc/pattern_tools/svgfig.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # svgfig.py copyright (C) 2008 Jim Pivarski # # This program is free software; you can redistribute 
it and/or @@ -21,6 +19,15 @@ import re, codecs, os, platform, copy, itertools, math, cmath, random, sys, copy _epsilon = 1e-5 +if sys.version_info >= (3,0): + long = int + basestring = (str,bytes) + +# Fix Python 2.x. +try: + UNICODE_EXISTS = bool(type(unicode)) +except NameError: + unicode = lambda s: str(s) if re.search("windows", platform.system(), re.I): try: @@ -49,20 +56,21 @@ def rgb(r, g, b, maximum=1.): max(0, min(b*255./maximum, 255))) def attr_preprocess(attr): + attrCopy = attr.copy() for name in attr.keys(): name_colon = re.sub("__", ":", name) if name_colon != name: - attr[name_colon] = attr[name] - del attr[name] + attrCopy[name_colon] = attrCopy[name] + del attrCopy[name] name = name_colon name_dash = re.sub("_", "-", name) if name_dash != name: - attr[name_dash] = attr[name] - del attr[name] + attrCopy[name_dash] = attrCopy[name] + del attrCopy[name] name = name_dash - return attr + return attrCopy class SVG: @@ -128,7 +136,7 @@ class SVG: """ def __init__(self, *t_sub, **attr): if len(t_sub) == 0: - raise TypeError, "SVG element must have a t (SVG type)" + raise TypeError( "SVG element must have a t (SVG type)") # first argument is t (SVG type) self.t = t_sub[0] @@ -262,7 +270,7 @@ class SVG: Returns a breadth-first generator over the SVG. If depth_limit is a number, stop recursion at that depth.""" - raise NotImplementedError, "Got an algorithm for breadth-first searching a tree without effectively copying the tree?" 
+ raise NotImplementedError( "Got an algorithm for breadth-first searching a tree without effectively copying the tree?") def __iter__(self): return self.depth_first() @@ -558,7 +566,7 @@ def canvas_outline(*sub, **attr): svg = canvas(*sub, **attr) match = re.match(r"[, \t]*([0-9e.+\-]+)[, \t]+([0-9e.+\-]+)[, \t]+([0-9e.+\-]+)[, \t]+([0-9e.+\-]+)[, \t]*", svg["viewBox"]) if match is None: - raise ValueError, "canvas viewBox is incorrectly formatted" + raise ValueError( "canvas viewBox is incorrectly formatted") x, y, width, height = [float(x) for x in match.groups()] svg.prepend(SVG("rect", x=x, y=y, width=width, height=height, stroke="none", fill="cornsilk")) svg.append(SVG("rect", x=x, y=y, width=width, height=height, stroke="black", fill="none")) @@ -675,7 +683,7 @@ def totrans(expr, vars=("x", "y"), globals=None, locals=None): return output else: - raise TypeError, "must be a function of 2 or 1 variables" + raise TypeError( "must be a function of 2 or 1 variables") if len(vars) == 2: g = math.__dict__ @@ -696,7 +704,7 @@ def totrans(expr, vars=("x", "y"), globals=None, locals=None): return output2 else: - raise TypeError, "vars must have 2 or 1 elements" + raise TypeError( "vars must have 2 or 1 elements") def window(xmin, xmax, ymin, ymax, x=0, y=0, width=100, height=100, @@ -735,10 +743,10 @@ def window(xmin, xmax, ymin, ymax, x=0, y=0, width=100, height=100, iy2 = ymax if xlogbase is not None and (ix1 <= 0. or ix2 <= 0.): - raise ValueError, "x range incompatible with log scaling: (%g, %g)" % (ix1, ix2) + raise ValueError ("x range incompatible with log scaling: (%g, %g)" % (ix1, ix2)) if ylogbase is not None and (iy1 <= 0. 
or iy2 <= 0.): - raise ValueError, "y range incompatible with log scaling: (%g, %g)" % (iy1, iy2) + raise ValueError ("y range incompatible with log scaling: (%g, %g)" % (iy1, iy2)) def maybelog(t, it1, it2, ot1, ot2, logbase): if t <= 0.: @@ -813,7 +821,7 @@ class Fig: self.trans = kwds["trans"]; del kwds["trans"] if len(kwds) != 0: - raise TypeError, "Fig() got unexpected keyword arguments %s" % kwds.keys() + raise TypeError ("Fig() got unexpected keyword arguments %s" % kwds.keys()) def SVG(self, trans=None): """Apply the transformation "trans" and return an SVG object. @@ -931,7 +939,7 @@ class Plot: self.text_attr = kwds["text_attr"]; del kwds["text_attr"] self.axis_attr = kwds["axis_attr"]; del kwds["axis_attr"] if len(kwds) != 0: - raise TypeError, "Plot() got unexpected keyword arguments %s" % kwds.keys() + raise TypeError ("Plot() got unexpected keyword arguments %s" % kwds.keys()) def SVG(self, trans=None): """Apply the transformation "trans" and return an SVG object.""" @@ -1039,7 +1047,7 @@ class Frame: self.axis_attr.update(kwds["axis_attr"]); del kwds["axis_attr"] if len(kwds) != 0: - raise TypeError, "Frame() got unexpected keyword arguments %s" % kwds.keys() + raise TypeError( "Frame() got unexpected keyword arguments %s" % kwds.keys()) def SVG(self): """Apply the window transformation and return an SVG object.""" @@ -1101,7 +1109,7 @@ class Frame: def pathtoPath(svg): """Converts SVG("path", d="...") into Path(d=[...]).""" if not isinstance(svg, SVG) or svg.t != "path": - raise TypeError, "Only SVG objects can be converted into Paths" + raise TypeError ("Only SVG objects can be converted into Paths") attr = dict(svg.attr) d = attr["d"] del attr["d"] @@ -1235,7 +1243,7 @@ class Path: errstring = "Path command \"%s\" requires a number at index %d" % (command, index) num1, index, pathdata = self.parse_number(index, pathdata) if num1 is None: - raise ValueError, errstring + raise ValueError ( errstring) while num1 is not None: output.append((command, 
num1)) @@ -1248,11 +1256,11 @@ class Path: num2, index, pathdata = self.parse_number(index, pathdata) if num1 is None: - raise ValueError, errstring + raise ValueError ( errstring) while num1 is not None: if num2 is None: - raise ValueError, errstring + raise ValueError ( errstring) output.append((command, num1, num2, False)) num1, index, pathdata = self.parse_number(index, pathdata) @@ -1267,11 +1275,11 @@ class Path: num4, index, pathdata = self.parse_number(index, pathdata) if num1 is None: - raise ValueError, errstring + raise ValueError ( errstring ) while num1 is not None: if num2 is None or num3 is None or num4 is None: - raise ValueError, errstring + raise ValueError (errstring) output.append((command, num1, num2, False, num3, num4, False)) num1, index, pathdata = self.parse_number(index, pathdata) @@ -1290,11 +1298,11 @@ class Path: num6, index, pathdata = self.parse_number(index, pathdata) if num1 is None: - raise ValueError, errstring + raise ValueError(errstring) while num1 is not None: if num2 is None or num3 is None or num4 is None or num5 is None or num6 is None: - raise ValueError, errstring + raise ValueError(errstring) output.append((command, num1, num2, False, num3, num4, False, num5, num6, False)) @@ -1317,11 +1325,11 @@ class Path: num7, index, pathdata = self.parse_number(index, pathdata) if num1 is None: - raise ValueError, errstring + raise ValueError(errstring) while num1 is not None: if num2 is None or num3 is None or num4 is None or num5 is None or num6 is None or num7 is None: - raise ValueError, errstring + raise ValueError(errstring) output.append((command, num1, num2, False, num3, num4, num5, num6, num7, False)) @@ -1344,7 +1352,7 @@ class Path: output = [] for datum in self.d: if not isinstance(datum, (tuple, list)): - raise TypeError, "pathdata elements must be tuples/lists" + raise TypeError("pathdata elements must be tuples/lists") command = datum[0] @@ -1722,7 +1730,7 @@ class Curve: try: # the best way to keep all the 
information while sampling is to make a linked list if not (self.low < self.high): - raise ValueError, "low must be less than high" + raise ValueError("low must be less than high") low, high = self.Sample(float(self.low)), self.Sample(float(self.high)) low.link(None, high) high.link(low, None) @@ -1913,10 +1921,10 @@ class Poly: vx[i], vy[i] = 0., 0. else: - raise ValueError, "mode must be \"lines\", \"bezier\", \"velocity\", \"foreback\", \"smooth\", or an abbreviation" + raise ValueError("mode must be \"lines\", \"bezier\", \"velocity\", \"foreback\", \"smooth\", or an abbreviation") d = [] - indexes = range(len(self.d)) + indexes = list(range(len(self.d))) if self.loop and len(self.d) > 0: indexes.append(0) @@ -2220,7 +2228,7 @@ class Line(Curve): defs.append(make_marker(self.arrow_start, "arrow_start")) line.attr["marker-start"] = "url(#%s)" % self.arrow_start else: - raise TypeError, "arrow_start must be False/None or an id string for the new marker" + raise TypeError("arrow_start must be False/None or an id string for the new marker") if self.arrow_end != False and self.arrow_end is not None: if isinstance(self.arrow_end, SVG): @@ -2230,7 +2238,7 @@ class Line(Curve): defs.append(make_marker(self.arrow_end, "arrow_end")) line.attr["marker-end"] = "url(#%s)" % self.arrow_end else: - raise TypeError, "arrow_end must be False/None or an id string for the new marker" + raise TypeError("arrow_end must be False/None or an id string for the new marker") return SVG("g", defs, line) @@ -2316,7 +2324,7 @@ class LineGlobal: defs.append(make_marker(self.arrow_start, "arrow_start")) line.attr["marker-start"] = "url(#%s)" % self.arrow_start else: - raise TypeError, "arrow_start must be False/None or an id string for the new marker" + raise TypeError("arrow_start must be False/None or an id string for the new marker") if self.arrow_end != False and self.arrow_end is not None: if isinstance(self.arrow_end, SVG): @@ -2326,7 +2334,7 @@ class LineGlobal: 
defs.append(make_marker(self.arrow_end, "arrow_end")) line.attr["marker-end"] = "url(#%s)" % self.arrow_end else: - raise TypeError, "arrow_end must be False/None or an id string for the new marker" + raise TypeError("arrow_end must be False/None or an id string for the new marker") return SVG("g", defs, line) @@ -2681,7 +2689,7 @@ class Ticks: elif isinstance(self.arrow_start, basestring): defs.append(make_marker(self.arrow_start, "arrow_start")) else: - raise TypeError, "arrow_start must be False/None or an id string for the new marker" + raise TypeError("arrow_start must be False/None or an id string for the new marker") if self.arrow_end != False and self.arrow_end is not None: if isinstance(self.arrow_end, SVG): @@ -2689,7 +2697,7 @@ class Ticks: elif isinstance(self.arrow_end, basestring): defs.append(make_marker(self.arrow_end, "arrow_end")) else: - raise TypeError, "arrow_end must be False/None or an id string for the new marker" + raise TypeError("arrow_end must be False/None or an id string for the new marker") output.append(defs) @@ -2757,7 +2765,7 @@ class Ticks: format = self.labels else: - raise TypeError, "labels must be None/False, True, a format string, or a number->string function" + raise TypeError("labels must be None/False, True, a format string, or a number->string function") # Now for the ticks ticks = self.ticks @@ -2793,7 +2801,7 @@ class Ticks: return ticks, [] else: - raise TypeError, "miniticks must be None/False, True, a number of desired miniticks, or a list of numbers" + raise TypeError("miniticks must be None/False, True, a number of desired miniticks, or a list of numbers") # Cases 3 & 4: ticks is iterable elif getattr(ticks, "__iter__", False): @@ -2830,10 +2838,10 @@ class Ticks: return ticks, [] else: - raise TypeError, "miniticks must be None/False, True, a number of desired miniticks, or a list of numbers" + raise TypeError("miniticks must be None/False, True, a number of desired miniticks, or a list of numbers") else: - raise 
TypeError, "ticks must be None/False, a number of desired ticks, a list of numbers, or a dictionary of explicit markers" + raise TypeError("ticks must be None/False, a number of desired ticks, a list of numbers, or a dictionary of explicit markers") def compute_ticks(self, N, format): """Return less than -N or exactly N optimal linear ticks. @@ -2841,9 +2849,9 @@ class Ticks: Normally only used internally. """ if self.low >= self.high: - raise ValueError, "low must be less than high" + raise ValueError("low must be less than high") if N == 1: - raise ValueError, "N can be 0 or >1 to specify the exact number of ticks or negative to specify a maximum" + raise ValueError("N can be 0 or >1 to specify the exact number of ticks or negative to specify a maximum") eps = _epsilon * (self.high - self.low) @@ -2948,7 +2956,7 @@ class Ticks: original_ticks.sort() if self.low > original_ticks[0] + _epsilon or self.high < original_ticks[-1] - _epsilon: - raise ValueError, "original_ticks {%g...%g} extend beyond [%g, %g]" % (original_ticks[0], original_ticks[-1], self.low, self.high) + raise ValueError("original_ticks {%g...%g} extend beyond [%g, %g]" % (original_ticks[0], original_ticks[-1], self.low, self.high)) granularities = [] for i in range(len(original_ticks)-1): @@ -2975,9 +2983,9 @@ class Ticks: Normally only used internally. """ if self.low >= self.high: - raise ValueError, "low must be less than high" + raise ValueError("low must be less than high") if N == 1: - raise ValueError, "N can be 0 or >1 to specify the exact number of ticks or negative to specify a maximum" + raise ValueError("N can be 0 or >1 to specify the exact number of ticks or negative to specify a maximum") eps = _epsilon * (self.high - self.low) @@ -3032,7 +3040,7 @@ class Ticks: Normally only used internally. 
""" if self.low >= self.high: - raise ValueError, "low must be less than high" + raise ValueError("low must be less than high") lowN = math.floor(math.log(self.low, base)) highN = math.ceil(math.log(self.high, base)) @@ -3166,7 +3174,7 @@ class LineAxis(Line, Ticks): def interpret(self): if self.exclude is not None and not (isinstance(self.exclude, (tuple, list)) and len(self.exclude) == 2 and isinstance(self.exclude[0], (int, long, float)) and isinstance(self.exclude[1], (int, long, float))): - raise TypeError, "exclude must either be None or (low, high)" + raise TypeError("exclude must either be None or (low, high)") ticks, miniticks = Ticks.interpret(self) if self.exclude is None: diff --git a/doc/py_tutorials/py_imgproc/py_houghlines/py_houghlines.markdown b/doc/py_tutorials/py_imgproc/py_houghlines/py_houghlines.markdown index 5b569ed074..f42d6ad226 100644 --- a/doc/py_tutorials/py_imgproc/py_houghlines/py_houghlines.markdown +++ b/doc/py_tutorials/py_imgproc/py_houghlines/py_houghlines.markdown @@ -5,36 +5,36 @@ Goal ---- In this chapter, - - We will understand the concept of Hough Tranform. - - We will see how to use it detect lines in an image. - - We will see following functions: **cv2.HoughLines()**, **cv2.HoughLinesP()** + - We will understand the concept of the Hough Transform. + - We will see how to use it to detect lines in an image. + - We will see the following functions: **cv2.HoughLines()**, **cv2.HoughLinesP()** Theory ------ -Hough Transform is a popular technique to detect any shape, if you can represent that shape in +The Hough Transform is a popular technique to detect any shape, if you can represent that shape in a mathematical form. It can detect the shape even if it is broken or distorted a little bit. We will see how it works for a line. 
-A line can be represented as \f$y = mx+c\f$ or in parametric form, as -\f$\rho = x \cos \theta + y \sin \theta\f$ where \f$\rho\f$ is the perpendicular distance from origin to the -line, and \f$\theta\f$ is the angle formed by this perpendicular line and horizontal axis measured in -counter-clockwise ( That direction varies on how you represent the coordinate system. This -representation is used in OpenCV). Check below image: +A line can be represented as \f$y = mx+c\f$ or in a parametric form, as +\f$\rho = x \cos \theta + y \sin \theta\f$ where \f$\rho\f$ is the perpendicular distance from the origin to the +line, and \f$\theta\f$ is the angle formed by this perpendicular line and the horizontal axis measured in +counter-clockwise (That direction varies on how you represent the coordinate system. This +representation is used in OpenCV). Check the image below: ![image](images/houghlines1.svg) -So if line is passing below the origin, it will have a positive rho and angle less than 180. If it -is going above the origin, instead of taking angle greater than 180, angle is taken less than 180, +So if the line is passing below the origin, it will have a positive rho and an angle less than 180. If it +is going above the origin, instead of taking an angle greater than 180, the angle is taken less than 180, and rho is taken negative. Any vertical line will have 0 degree and horizontal lines will have 90 degree. -Now let's see how Hough Transform works for lines. Any line can be represented in these two terms, -\f$(\rho, \theta)\f$. So first it creates a 2D array or accumulator (to hold values of two parameters) +Now let's see how the Hough Transform works for lines. Any line can be represented in these two terms, +\f$(\rho, \theta)\f$. So first it creates a 2D array or accumulator (to hold the values of the two parameters) and it is set to 0 initially. Let rows denote the \f$\rho\f$ and columns denote the \f$\theta\f$. Size of -array depends on the accuracy you need. 
Suppose you want the accuracy of angles to be 1 degree, you +array depends on the accuracy you need. Suppose you want the accuracy of angles to be 1 degree, you will need 180 columns. For \f$\rho\f$, the maximum distance possible is the diagonal length of the image. So -taking one pixel accuracy, number of rows can be diagonal length of the image. +taking one pixel accuracy, the number of rows can be the diagonal length of the image. Consider a 100x100 image with a horizontal line at the middle. Take the first point of the line. You know its (x,y) values. Now in the line equation, put the values \f$\theta = 0,1,2,....,180\f$ and check @@ -42,57 +42,34 @@ the \f$\rho\f$ you get. For every \f$(\rho, \theta)\f$ pair, you increment value in its corresponding \f$(\rho, \theta)\f$ cells. So now in accumulator, the cell (50,90) = 1 along with some other cells. -Now take the second point on the line. Do the same as above. Increment the the values in the cells +Now take the second point on the line. Do the same as above. Increment the values in the cells corresponding to `(rho, theta)` you got. This time, the cell (50,90) = 2. What you actually do is voting the \f$(\rho, \theta)\f$ values. You continue this process for every point on the line. At each point, the cell (50,90) will be incremented or voted up, while other cells may or may not be voted up. This way, at the end, the cell (50,90) will have maximum votes. So if you search the accumulator for maximum votes, you get the value (50,90) which says, there is a line in this image -at distance 50 from origin and at angle 90 degrees. It is well shown in below animation (Image +at a distance 50 from the origin and at angle 90 degrees. It is well shown in the below animation (Image Courtesy: [Amos Storkey](http://homepages.inf.ed.ac.uk/amos/hough.html) ) ![](images/houghlinesdemo.gif) -This is how hough transform for lines works. 
It is simple, and may be you can implement it using +This is how hough transform works for lines. It is simple, and may be you can implement it using Numpy on your own. Below is an image which shows the accumulator. Bright spots at some locations -denotes they are the parameters of possible lines in the image. (Image courtesy: [Wikipedia](http://en.wikipedia.org/wiki/Hough_transform)) +denote they are the parameters of possible lines in the image. (Image courtesy: [Wikipedia](http://en.wikipedia.org/wiki/Hough_transform) ) ![](images/houghlines2.jpg) Hough Transform in OpenCV ========================= -Everything explained above is encapsulated in the OpenCV function, \*\*cv2.HoughLines()\*\*. It simply returns an array of :math:(rho, +Everything explained above is encapsulated in the OpenCV function, **cv2.HoughLines()**. It simply returns an array of :math:(rho, theta)\` values. \f$\rho\f$ is measured in pixels and \f$\theta\f$ is measured in radians. First parameter, -Input image should be a binary image, so apply threshold or use canny edge detection before finding +Input image should be a binary image, so apply threshold or use canny edge detection before applying hough transform. Second and third parameters are \f$\rho\f$ and \f$\theta\f$ accuracies -respectively. Fourth argument is the threshold, which means minimum vote it should get for it to be -considered as a line. Remember, number of votes depend upon number of points on the line. So it +respectively. Fourth argument is the threshold, which means the minimum vote it should get to be +considered as a line. Remember, number of votes depends upon the number of points on the line. So it represents the minimum length of line that should be detected. 
-@code{.py} -import cv2 -import numpy as np - -img = cv2.imread('sudoku.png') -gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) -edges = cv2.Canny(gray,50,150,apertureSize = 3) - -lines = cv2.HoughLines(edges,1,np.pi/180,200) -for line in lines: - rho,theta = line[0] - a = np.cos(theta) - b = np.sin(theta) - x0 = a*rho - y0 = b*rho - x1 = int(x0 + 1000*(-b)) - y1 = int(y0 + 1000*(a)) - x2 = int(x0 - 1000*(-b)) - y2 = int(y0 - 1000*(a)) - - cv2.line(img,(x1,y1),(x2,y2),(0,0,255),2) - -cv2.imwrite('houghlines3.jpg',img) -@endcode +@include hough_line_transform.py Check the results below: ![image](images/houghlines3.jpg) @@ -101,36 +78,23 @@ Probabilistic Hough Transform ----------------------------- In the hough transform, you can see that even for a line with two arguments, it takes a lot of -computation. Probabilistic Hough Transform is an optimization of Hough Transform we saw. It doesn't -take all the points into consideration, instead take only a random subset of points and that is -sufficient for line detection. Just we have to decrease the threshold. See below image which compare -Hough Transform and Probabilistic Hough Transform in hough space. (Image Courtesy : [Franck -Bettinger's home page](http://phdfb1.free.fr/robot/mscthesis/node14.html) +computation. Probabilistic Hough Transform is an optimization of the Hough Transform we saw. It doesn't +take all the points into consideration. Instead, it takes only a random subset of points which is +sufficient for line detection. Just we have to decrease the threshold. See image below which compares +Hough Transform and Probabilistic Hough Transform in Hough space. (Image Courtesy : +[Franck Bettinger's home page](http://phdfb1.free.fr/robot/mscthesis/node14.html) ) ![image](images/houghlines4.png) OpenCV implementation is based on Robust Detection of Lines Using the Progressive Probabilistic -Hough Transform by Matas, J. and Galambos, C. and Kittler, J.V.. The function used is +Hough Transform by Matas, J. 
and Galambos, C. and Kittler, J.V. @cite Matas00. The function used is **cv2.HoughLinesP()**. It has two new arguments. - **minLineLength** - Minimum length of line. Line segments shorter than this are rejected. -- **maxLineGap** - Maximum allowed gap between line segments to treat them as single line. +- **maxLineGap** - Maximum allowed gap between line segments to treat them as a single line. Best thing is that, it directly returns the two endpoints of lines. In previous case, you got only the parameters of lines, and you had to find all the points. Here, everything is direct and simple. -@code{.py} -import cv2 -import numpy as np - -img = cv2.imread('sudoku.png') -gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) -edges = cv2.Canny(gray,50,150,apertureSize = 3) -lines = cv2.HoughLinesP(edges,1,np.pi/180,100,minLineLength=100,maxLineGap=10) -for line in lines: - x1,y1,x2,y2 = line[0] - cv2.line(img,(x1,y1),(x2,y2),(0,255,0),2) - -cv2.imwrite('houghlines5.jpg',img) -@endcode +@include probabilistic_hough_line_transform.py See the results below: ![image](images/houghlines5.jpg) diff --git a/doc/py_tutorials/py_setup/py_setup_in_fedora/py_setup_in_fedora.markdown b/doc/py_tutorials/py_setup/py_setup_in_fedora/py_setup_in_fedora.markdown index da65dd4772..a4ec9bbd36 100644 --- a/doc/py_tutorials/py_setup/py_setup_in_fedora/py_setup_in_fedora.markdown +++ b/doc/py_tutorials/py_setup/py_setup_in_fedora/py_setup_in_fedora.markdown @@ -102,13 +102,10 @@ yum install eigen3-devel @endcode If you want to build **documentation** ( *Yes, you can create offline version of OpenCV's complete official documentation in your system in HTML with full search facility so that you need not access -internet always if any question, and it is quite FAST!!!* ), you need to install **Sphinx** (a -documentation generation tool) and **pdflatex** (if you want to create a PDF version of it). ( Also -while configuring installation with CMake, don't forget to pass -D BUILD_DOCS=ON. 
More details -below.) +internet always if any question, and it is quite FAST!!!* ), you need to install **Doxygen** (a +documentation generation tool). @code{.sh} -yum install python-sphinx -yum install texlive +yum install doxygen @endcode ### Downloading OpenCV @@ -174,6 +171,7 @@ setup you got, make sure that following fields are filled (below is the some imp configuration I got). These fields should be filled appropriately in your system also. Otherwise some problem has happened. So check if you have correctly performed above steps. @code{.sh} +... -- GUI: -- GTK+ 2.x: YES (ver 2.24.19) -- GThread : YES (ver 2.36.3) @@ -205,15 +203,7 @@ some problem has happened. So check if you have correctly performed above steps. -- numpy: /usr/lib/python2.7/site-packages/numpy/core/include (ver 1.7.1) -- packages path: lib/python2.7/site-packages --- Documentation: --- Build Documentation: YES --- Sphinx: /usr/bin/sphinx-build (ver 1.1.3) --- PdfLaTeX compiler: /usr/bin/pdflatex --- --- Tests and samples: --- Tests: NO --- Performance tests: NO --- C/C++ Examples: NO +... @endcode Many other flags and settings are there. It is left for you for further exploration. 
diff --git a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown index bb0ffd8978..a035199ee9 100644 --- a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown +++ b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown @@ -10,6 +10,7 @@ In this tutorial you will learn how to: - Initialize a matrix with zeros - Learn what @ref cv::saturate_cast does and why it is useful - Get some cool info about pixel transformations +- Improve the brightness of an image on a practical example Theory ------ @@ -53,87 +54,29 @@ Code ---- - The following code performs the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ : -@code{.cpp} -#include -#include - -using namespace cv; - -double alpha; /*< Simple contrast control */ -int beta; /*< Simple brightness control */ - -int main( int argc, char** argv ) -{ - /// Read image given by user - Mat image = imread( argv[1] ); - Mat new_image = Mat::zeros( image.size(), image.type() ); - - /// Initialize values - std::cout<<" Basic Linear Transforms "<>alpha; - std::cout<<"* Enter the beta value [0-100]: "; std::cin>>beta; - - /// Do the operation new_image(i,j) = alpha*image(i,j) + beta - for( int y = 0; y < image.rows; y++ ) { - for( int x = 0; x < image.cols; x++ ) { - for( int c = 0; c < 3; c++ ) { - new_image.at(y,x)[c] = - saturate_cast( alpha*( image.at(y,x)[c] ) + beta ); - } - } - } - - /// Create Windows - namedWindow("Original Image", 1); - namedWindow("New Image", 1); - - /// Show stuff - imshow("Original Image", image); - imshow("New Image", new_image); - - /// Wait until user press some key - waitKey(); - return 0; -} -@endcode +@include BasicLinearTransforms.cpp Explanation ----------- -# We begin by creating parameters to save \f$\alpha\f$ and \f$\beta\f$ to be entered by the user: - @code{.cpp} - double alpha; - int beta; - @endcode + @snippet BasicLinearTransforms.cpp 
basic-linear-transform-parameters + -# We load an image using @ref cv::imread and save it in a Mat object: - @code{.cpp} - Mat image = imread( argv[1] ); - @endcode + @snippet BasicLinearTransforms.cpp basic-linear-transform-load -# Now, since we will make some transformations to this image, we need a new Mat object to store it. Also, we want this to have the following features: - Initial pixel values equal to zero - Same size and type as the original image - @code{.cpp} - Mat new_image = Mat::zeros( image.size(), image.type() ); - @endcode + @snippet BasicLinearTransforms.cpp basic-linear-transform-output We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer based on *image.size()* and *image.type()* -# Now, to perform the operation \f$g(i,j) = \alpha \cdot f(i,j) + \beta\f$ we will access to each pixel in image. Since we are operating with BGR images, we will have three values per pixel (B, G and R), so we will also access them separately. Here is the piece of code: - @code{.cpp} - for( int y = 0; y < image.rows; y++ ) { - for( int x = 0; x < image.cols; x++ ) { - for( int c = 0; c < 3; c++ ) { - new_image.at(y,x)[c] = - saturate_cast( alpha*( image.at(y,x)[c] ) + beta ); - } - } - } - @endcode + @snippet BasicLinearTransforms.cpp basic-linear-transform-operation Notice the following: - To access each pixel in the images we are using this syntax: *image.at\(y,x)[c]* where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2). @@ -142,15 +85,7 @@ Explanation values are valid. -# Finally, we create windows and show the images, the usual way. 
- @code{.cpp} - namedWindow("Original Image", 1); - namedWindow("New Image", 1); - - imshow("Original Image", image); - imshow("New Image", new_image); - - waitKey(0); - @endcode + @snippet BasicLinearTransforms.cpp basic-linear-transform-display @note Instead of using the **for** loops to access each pixel, we could have simply used this command: @@ -176,3 +111,89 @@ Result - We get this: ![](images/Basic_Linear_Transform_Tutorial_Result_big.jpg) + +Practical example +---- + +In this paragraph, we will put into practice what we have learned to correct an underexposed image by adjusting the brightness +and the contrast of the image. We will also see another technique to correct the brightness of an image called +gamma correction. + +### Brightness and contrast adjustments + +Increasing (/ decreasing) the \f$\beta\f$ value will add (/ subtract) a constant value to every pixel. Pixel values outside of the [0 ; 255] +range will be saturated (i.e. a pixel value higher (/ lesser) than 255 (/ 0) will be clamp to 255 (/ 0)). + +![In light gray, histogram of the original image, in dark gray when brightness = 80 in Gimp](images/Basic_Linear_Transform_Tutorial_hist_beta.png) + +The histogram represents for each color level the number of pixels with that color level. A dark image will have many pixels with +low color value and thus the histogram will present a peak in his left part. When adding a constant bias, the histogram is shifted to the +right as we have added a constant bias to all the pixels. + +The \f$\alpha\f$ parameter will modify how the levels spread. If \f$ \alpha < 1 \f$, the color levels will be compressed and the result +will be an image with less contrast. + +![In light gray, histogram of the original image, in dark gray when contrast < 0 in Gimp](images/Basic_Linear_Transform_Tutorial_hist_alpha.png) + +Note that these histograms have been obtained using the Brightness-Contrast tool in the Gimp software. 
The brightness tool should be +identical to the \f$\beta\f$ bias parameters but the contrast tool seems to differ to the \f$\alpha\f$ gain where the output range +seems to be centered with Gimp (as you can notice in the previous histogram). + +It can occur that playing with the \f$\beta\f$ bias will improve the brightness but in the same time the image will appear with a +slight veil as the contrast will be reduced. The \f$\alpha\f$ gain can be used to diminue this effect but due to the saturation, +we will lose some details in the original bright regions. + +### Gamma correction + +[Gamma correction](https://en.wikipedia.org/wiki/Gamma_correction) can be used to correct the brightness of an image by using a non +linear transformation between the input values and the mapped output values: + +\f[O = \left( \frac{I}{255} \right)^{\gamma} \times 255\f] + +As this relation is non linear, the effect will not be the same for all the pixels and will depend to their original value. + +![Plot for different values of gamma](images/Basic_Linear_Transform_Tutorial_gamma.png) + +When \f$ \gamma < 1 \f$, the original dark regions will be brighter and the histogram will be shifted to the right whereas it will +be the opposite with \f$ \gamma > 1 \f$. + +### Correct an underexposed image + +The following image has been corrected with: \f$ \alpha = 1.3 \f$ and \f$ \beta = 40 \f$. + +![By Visem (Own work) [CC BY-SA 3.0], via Wikimedia Commons](images/Basic_Linear_Transform_Tutorial_linear_transform_correction.jpg) + +The overall brightness has been improved but you can notice that the clouds are now greatly saturated due to the numerical saturation +of the implementation used ([highlight clipping](https://en.wikipedia.org/wiki/Clipping_(photography)) in photography). + +The following image has been corrected with: \f$ \gamma = 0.4 \f$. 
+ +![By Visem (Own work) [CC BY-SA 3.0], via Wikimedia Commons](images/Basic_Linear_Transform_Tutorial_gamma_correction.jpg) + +The gamma correction should tend to add less saturation effect as the mapping is non linear and there is no numerical saturation possible as in the previous method. + +![Left: histogram after alpha, beta correction ; Center: histogram of the original image ; Right: histogram after the gamma correction](images/Basic_Linear_Transform_Tutorial_histogram_compare.png) + +The previous figure compares the histograms for the three images (the y-ranges are not the same between the three histograms). +You can notice that most of the pixel values are in the lower part of the histogram for the original image. After \f$ \alpha \f$, +\f$ \beta \f$ correction, we can observe a big peak at 255 due to the saturation as well as a shift in the right. +After gamma correction, the histogram is shifted to the right but the pixels in the dark regions are more shifted +(see the gamma curves [figure](Basic_Linear_Transform_Tutorial_gamma.png)) than those in the bright regions. + +In this tutorial, you have seen two simple methods to adjust the contrast and the brightness of an image. **They are basic techniques +and are not intended to be used as a replacement of a raster graphics editor!** + +### Code + +Code for the tutorial is [here](https://github.com/opencv/opencv/blob/master/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp). +Code for the gamma correction: + +@snippet changing_contrast_brightness_image.cpp changing-contrast-brightness-gamma-correction + +A look-up table is used to improve the performance of the computation as only 256 values needs to be calculated once. 
+ +### Additional resources + +- [Gamma correction in graphics rendering](https://learnopengl.com/#!Advanced-Lighting/Gamma-Correction) +- [Gamma correction and images displayed on CRT monitors](http://www.graphics.cornell.edu/~westin/gamma/gamma.html) +- [Digital exposure techniques](http://www.cambridgeincolour.com/tutorials/digital-exposure-techniques.htm) diff --git a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma.png b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma.png new file mode 100644 index 0000000000..50cd5860ac Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma.png differ diff --git a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma_correction.jpg b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma_correction.jpg new file mode 100644 index 0000000000..7ade82d85e Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_gamma_correction.jpg differ diff --git a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_alpha.png b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_alpha.png new file mode 100644 index 0000000000..21185a16d3 Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_alpha.png differ diff --git a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_beta.png b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_beta.png new file mode 100644 index 0000000000..d8ef844691 Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_hist_beta.png differ diff --git 
a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_histogram_compare.png b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_histogram_compare.png new file mode 100644 index 0000000000..c2494dc22e Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_histogram_compare.png differ diff --git a/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_linear_transform_correction.jpg b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_linear_transform_correction.jpg new file mode 100644 index 0000000000..2cad6558ce Binary files /dev/null and b/doc/tutorials/core/basic_linear_transform/images/Basic_Linear_Transform_Tutorial_linear_transform_correction.jpg differ diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown new file mode 100644 index 0000000000..f2a511fc21 --- /dev/null +++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown @@ -0,0 +1,183 @@ +How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_use_OpenCV_parallel_for_} +================================================================== + +Goal +---- + +The goal of this tutorial is to show you how to use the OpenCV `parallel_for_` framework to easily +parallelize your code. To illustrate the concept, we will write a program to draw a Mandelbrot set +exploiting almost all the CPU load available. +The full tutorial code is [here](https://github.com/opencv/opencv/blob/master/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp). +If you want more information about multithreading, you will have to refer to a reference book or course as this tutorial is intended +to remain simple. 
+ +Precondition +---- + +The first precondition is to have OpenCV built with a parallel framework. +In OpenCV 3.2, the following parallel frameworks are available in that order: +1. Intel Threading Building Blocks (3rdparty library, should be explicitly enabled) +2. C= Parallel C/C++ Programming Language Extension (3rdparty library, should be explicitly enabled) +3. OpenMP (integrated to compiler, should be explicitly enabled) +4. APPLE GCD (system wide, used automatically (APPLE only)) +5. Windows RT concurrency (system wide, used automatically (Windows RT only)) +6. Windows concurrency (part of runtime, used automatically (Windows only - MSVC++ >= 10)) +7. Pthreads (if available) + +As you can see, several parallel frameworks can be used in the OpenCV library. Some parallel libraries +are third party libraries and have to be explicitly built and enabled in CMake (e.g. TBB, C=), others are +automatically available with the platform (e.g. APPLE GCD) but chances are that you should be able to +have access to a parallel framework either directly or by enabling the option in CMake and rebuild the library. + +The second (weak) precondition is more related to the task you want to achieve as not all computations +are suitable / can be adapted to be run in a parallel way. To remain simple, tasks that can be split +into multiple elementary operations with no memory dependency (no possible race condition) are easily +parallelizable. Computer vision processing is often easily parallelizable as most of the time the processing of +one pixel does not depend on the state of other pixels. + +Simple example: drawing a Mandelbrot set +---- + +We will use the example of drawing a Mandelbrot set to show how from a regular sequential code you can easily adapt +the code to parallelize the computation. + +Theory +----------- + +The Mandelbrot set definition has been named in tribute to the mathematician Benoit Mandelbrot by the mathematician +Adrien Douady. 
It has been famous outside of the mathematics field as the image representation is an example of a +class of fractals, a mathematical set that exhibits a repeating pattern displayed at every scale (even more, a +Mandelbrot set is self-similar as the whole shape can be repeatedly seen at different scale). For a more in-depth +introduction, you can look at the corresponding [Wikipedia article](https://en.wikipedia.org/wiki/Mandelbrot_set). +Here, we will just introduce the formula to draw the Mandelbrot set (from the mentioned Wikipedia article). + +> The Mandelbrot set is the set of values of \f$ c \f$ in the complex plane for which the orbit of 0 under iteration +> of the quadratic map +> \f[\begin{cases} z_0 = 0 \\ z_{n+1} = z_n^2 + c \end{cases}\f] +> remains bounded. +> That is, a complex number \f$ c \f$ is part of the Mandelbrot set if, when starting with \f$ z_0 = 0 \f$ and applying +> the iteration repeatedly, the absolute value of \f$ z_n \f$ remains bounded however large \f$ n \f$ gets. +> This can also be represented as +> \f[\limsup_{n\to\infty}|z_{n+1}|\leqslant2\f] + +Pseudocode +----------- + +A simple algorithm to generate a representation of the Mandelbrot set is called the +["escape time algorithm"](https://en.wikipedia.org/wiki/Mandelbrot_set#Escape_time_algorithm). +For each pixel in the rendered image, we test using the recurrence relation if the complex number is bounded or not +under a maximum number of iterations. Pixels that do not belong to the Mandelbrot set will escape quickly whereas +we assume that the pixel is in the set after a fixed maximum number of iterations. A high value of iterations will +produce a more detailed image but the computation time will increase accordingly. We use the number of iterations +needed to "escape" to depict the pixel value in the image. 
+ +``` +For each pixel (Px, Py) on the screen, do: +{ + x0 = scaled x coordinate of pixel (scaled to lie in the Mandelbrot X scale (-2, 1)) + y0 = scaled y coordinate of pixel (scaled to lie in the Mandelbrot Y scale (-1, 1)) + x = 0.0 + y = 0.0 + iteration = 0 + max_iteration = 1000 + while (x*x + y*y < 2*2 AND iteration < max_iteration) { + xtemp = x*x - y*y + x0 + y = 2*x*y + y0 + x = xtemp + iteration = iteration + 1 + } + color = palette[iteration] + plot(Px, Py, color) +} +``` + +To relate between the pseudocode and the theory, we have: +* \f$ z = x + iy \f$ +* \f$ z^2 = x^2 + i2xy - y^2 \f$ +* \f$ c = x_0 + iy_0 \f$ + +![](images/how_to_use_OpenCV_parallel_for_640px-Mandelset_hires.png) + +On this figure, we recall that the real part of a complex number is on the x-axis and the imaginary part on the y-axis. +You can see that the whole shape can be repeatedly visible if we zoom at particular locations. + +Implementation +----------- + +Escape time algorithm implementation +-------------------------- + +@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-escape-time-algorithm + +Here, we used the [`std::complex`](http://en.cppreference.com/w/cpp/numeric/complex) template class to represent a +complex number. This function performs the test to check if the pixel is in set or not and returns the "escaped" iteration. + +Sequential Mandelbrot implementation +-------------------------- + +@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-sequential + +In this implementation, we sequentially iterate over the pixels in the rendered image to perform the test to check if the +pixel is likely to belong to the Mandelbrot set or not. 
+ +Another thing to do is to transform the pixel coordinate into the Mandelbrot set space with: + +@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-transformation + +Finally, to assign the grayscale value to the pixels, we use the following rule: +* a pixel is black if it reaches the maximum number of iterations (pixel is assumed to be in the Mandelbrot set), +* otherwise we assign a grayscale value depending on the escaped iteration and scaled to fit the grayscale range. + +@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-grayscale-value + +Using a linear scale transformation is not enough to perceive the grayscale variation. To overcome this, we will boost +the perception by using a square root scale transformation (borrowed from Jeremy D. Frens in his +[blog post](http://www.programming-during-recess.net/2016/06/26/color-schemes-for-mandelbrot-sets/)): +\f$ f \left( x \right) = \sqrt{\frac{x}{\text{maxIter}}} \times 255 \f$ + +![](images/how_to_use_OpenCV_parallel_for_sqrt_scale_transformation.png) + +The green curve corresponds to a simple linear scale transformation, the blue one to a square root scale transformation +and you can observe how the lowest values will be boosted when looking at the slope at these positions. + +Parallel Mandelbrot implementation +-------------------------- + +When looking at the sequential implementation, we can notice that each pixel is computed independently. To optimize the +computation, we can perform multiple pixel calculations in parallel, by exploiting the multi-core architecture of modern +processor. To achieve this easily, we will use the OpenCV @ref cv::parallel_for_ framework. + +@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel + +The first thing is to declare a custom class that inherits from @ref cv::ParallelLoopBody and to override the +`virtual void operator ()(const cv::Range& range) const`. 
+
+The range in the `operator ()` represents the subset of pixels that will be treated by an individual thread.
+This splitting is done automatically to distribute the computation load equally. We have to convert the pixel index coordinate
+to a 2D `[row, col]` coordinate. Also note that we have to keep a reference on the mat image to be able to modify in-place
+the image.
+
+The parallel execution is called with:
+
+@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel-call
+
+Here, the range represents the total number of operations to be executed, so the total number of pixels in the image.
+To set the number of threads, you can use: @ref cv::setNumThreads. You can also specify the number of splits using the
+nstripes parameter in @ref cv::parallel_for_. For instance, if your processor has 4 threads, setting `cv::setNumThreads(2)`
+or setting `nstripes=2` should be the same as by default it will use all the processor threads available but will split the
+workload only on two threads.
+
+Results
+-----------
+
+You can find the full tutorial code [here](https://github.com/opencv/opencv/blob/master/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp).
+The performance of the parallel implementation depends on the type of CPU you have. For instance, on 4 cores / 8 threads
+CPU, you can expect a speed-up of around 6.9X. There are many factors to explain why we do not achieve a speed-up of almost 8X.
+Main reasons should be mostly due to:
+* the overhead to create and manage the threads,
+* background processes running in parallel,
+* the difference between 4 hardware cores with 2 logical threads for each core and 8 hardware cores. 
+ +The resulting image produced by the tutorial code (you can modify the code to use more iterations and assign a pixel color +depending on the escaped iteration and using a color palette to get more aesthetic images): +![Mandelbrot set with xMin=-2.1, xMax=0.6, yMin=-1.2, yMax=1.2, maxIterations=500](images/how_to_use_OpenCV_parallel_for_Mandelbrot.png) diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_640px-Mandelset_hires.png b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_640px-Mandelset_hires.png new file mode 100644 index 0000000000..2b63916d77 Binary files /dev/null and b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_640px-Mandelset_hires.png differ diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_Mandelbrot.png b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_Mandelbrot.png new file mode 100644 index 0000000000..40eb579e78 Binary files /dev/null and b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_Mandelbrot.png differ diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_sqrt_scale_transformation.png b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_sqrt_scale_transformation.png new file mode 100644 index 0000000000..00b727f866 Binary files /dev/null and b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/images/how_to_use_OpenCV_parallel_for_sqrt_scale_transformation.png differ diff --git a/doc/tutorials/core/table_of_content_core.markdown b/doc/tutorials/core/table_of_content_core.markdown index 70d9c81a1b..2b9afb8b19 100644 --- a/doc/tutorials/core/table_of_content_core.markdown +++ b/doc/tutorials/core/table_of_content_core.markdown @@ -106,3 +106,10 @@ understanding how to manipulate the 
images on a pixel level. *Author:* Elena Gvozdeva You will see how to use the IPP Async with OpenCV. + + +- @subpage tutorial_how_to_use_OpenCV_parallel_for_ + + *Compatibility:* \>= OpenCV 2.4.3 + + You will see how to use the OpenCV parallel_for_ to easily parallelize your code. diff --git a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown index 64f763bd59..0332808cbb 100644 --- a/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown +++ b/doc/tutorials/gpu/gpu-thrust-interop/gpu_thrust_interop.markdown @@ -1,4 +1,4 @@ -Using a cv::cuda::GpuMat with thrust +Using a cv::cuda::GpuMat with thrust {#tutorial_gpu_thrust_interop} =========================================== Goal @@ -67,4 +67,4 @@ Next we will determine how many values are greater than 0 by using thrust::count @snippet samples/cpp/tutorial_code/gpu/gpu-thrust-interop/main.cu pred_greater We will use those results to create an output buffer for storing the copied values, we will then use copy_if with the same predicate to populate the output buffer. -Lastly we will download the values into a CPU mat for viewing. \ No newline at end of file +Lastly we will download the values into a CPU mat for viewing. diff --git a/doc/tutorials/gpu/table_of_content_gpu.markdown b/doc/tutorials/gpu/table_of_content_gpu.markdown index fe4e2c8801..163f5e3b3f 100644 --- a/doc/tutorials/gpu/table_of_content_gpu.markdown +++ b/doc/tutorials/gpu/table_of_content_gpu.markdown @@ -13,3 +13,10 @@ run the OpenCV algorithms. This will give a good grasp on how to approach coding on the GPU module, once you already know how to handle the other modules. As a test case it will port the similarity methods from the tutorial @ref tutorial_video_input_psnr_ssim to the GPU. 
+ +- @subpage tutorial_gpu_thrust_interop + + *Compatibility:* \>= OpenCV 3.0 + + This tutorial will show you how to wrap a GpuMat into a thrust iterator in order to be able to + use the functions in the thrust library. diff --git a/doc/tutorials/introduction/windows_install/images/MiktexInstall.png b/doc/tutorials/introduction/windows_install/images/MiktexInstall.png deleted file mode 100644 index 193a403991..0000000000 Binary files a/doc/tutorials/introduction/windows_install/images/MiktexInstall.png and /dev/null differ diff --git a/doc/tutorials/introduction/windows_install/images/Sphinx_Install.png b/doc/tutorials/introduction/windows_install/images/Sphinx_Install.png deleted file mode 100644 index da2e06c0bf..0000000000 Binary files a/doc/tutorials/introduction/windows_install/images/Sphinx_Install.png and /dev/null differ diff --git a/doc/tutorials/introduction/windows_install/images/WindowsBuildDoc.png b/doc/tutorials/introduction/windows_install/images/WindowsBuildDoc.png deleted file mode 100644 index 18cad032c3..0000000000 Binary files a/doc/tutorials/introduction/windows_install/images/WindowsBuildDoc.png and /dev/null differ diff --git a/doc/tutorials/introduction/windows_install/images/cmsdstartwindows.jpg b/doc/tutorials/introduction/windows_install/images/cmsdstartwindows.jpg deleted file mode 100644 index e0d9530d5c..0000000000 Binary files a/doc/tutorials/introduction/windows_install/images/cmsdstartwindows.jpg and /dev/null differ diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown index e4e969f432..a5fed69629 100644 --- a/doc/tutorials/introduction/windows_install/windows_install.markdown +++ b/doc/tutorials/introduction/windows_install/windows_install.markdown @@ -85,11 +85,8 @@ of them, you need to download and install them on your system. image file format. 
- The OpenNI Framework contains a set of open source APIs that provide support for natural interaction with devices via methods such as voice command recognition, hand gestures, and body motion tracking. Prebuilt binaries can be found [here](http://structure.io/openni). The source code of [OpenNI](https://github.com/OpenNI/OpenNI) and [OpenNI2](https://github.com/OpenNI/OpenNI2) are also available on Github. -- [Miktex]( http://miktex.org/2.9/setup) is the best [TEX](https://secure.wikimedia.org/wikipedia/en/wiki/TeX) implementation on - the Windows OS. It is required to build the *OpenCV documentation*. -- [Sphinx](http://sphinx.pocoo.org/) is a python documentation generator and is the tool that will actually create the - *OpenCV documentation*. This on its own requires a couple of tools installed, We will cover this - in depth at the @ref tutorial_windows_install_sphinx "How to Install Sphinx" section. +- [Doxygen](http://www.stack.nl/~dimitri/doxygen/) is a documentation generator and is the tool that will actually create the + *OpenCV documentation*. Now we will describe the steps to follow for a full build (using all the above frameworks, tools and libraries). If you do not need the support for some of these you can just freely skip this section. @@ -122,36 +119,10 @@ libraries). If you do not need the support for some of these you can just freely couple other python extensions. Luckily installing all these may be automated by a nice tool called [Setuptools](http://pypi.python.org/pypi/setuptools#downloads). Download and install again. - @anchor tutorial_windows_install_sphinx - -# Installing Sphinx is easy once you have installed *Setuptools*. This contains a little - application that will automatically connect to the python databases and download the latest - version of many python scripts. Start up a command window (enter *cmd* into the windows - start menu and press enter) and use the *CD* command to navigate to your Python folders - Script sub-folder. 
Here just pass to the *easy_install.exe* as argument the name of the - program you want to install. Add the *sphinx* argument. - - ![](images/cmsdstartwindows.jpg) - - ![](images/Sphinx_Install.png) - - @note - The *CD* navigation command works only inside a drive. For example if you are somewhere in the - *C:* drive you cannot use it this to go to another drive (like for example *D:*). To do so you - first need to change drives letters. For this simply enter the command *D:*. Then you can use - the *CD* to navigate to specific folder inside the drive. Bonus tip: you can clear the screen by - using the *CLS* command. - - This will also install its prerequisites [Jinja2](http://jinja.pocoo.org/docs/) and - [Pygments](http://pygments.org/). -# The easiest way to install Numpy is to just download its binaries from the [sourceforge page](http://sourceforge.net/projects/numpy/files/NumPy/). Make sure your download and install exactly the binary for your python version (so for version `2.7`). - -# Download the [Miktex](http://miktex.org/2.9/setup) and install it. Again just follow the wizard. At the fourth step make - sure you select for the *"Install missing packages on-the-fly"* the *Yes* option, as you can - see on the image below. Again this will take quite some time so be patient. - - ![](images/MiktexInstall.png) -# For the [Intel Threading Building Blocks (*TBB*)](http://threadingbuildingblocks.org/file.php?fid=77) download the source files and extract @@ -291,12 +262,9 @@ libraries). If you do not need the support for some of these you can just freely ![](images/OpenCVBuildResultWindows.jpg) - For the documentation, you need to explicitly issue the build commands on the *doc* project for - the PDF files and on the *doc_html* for the HTML ones. Each of these will call *Sphinx* to do - all the hard work. You can find the generated documentation inside the `Build/Doc/_html` for the - HTML pages and within the `Build/Doc` the PDF manuals. 
- - ![](images/WindowsBuildDoc.png) + For the documentation, you need to explicitly issue the build commands on the *doxygen* project for + the HTML docuementation. It will call *Doxygen* to do + all the hard work. You can find the generated documentation inside the `build/doc/doxygen/html`. To collect the header and the binary files, that you will use during your own projects, into a separate directory (simillary to how the pre-built binaries ship) you need to explicitly build diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index 362e81f8bf..5a0e020d31 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -979,8 +979,8 @@ This means that, given ( \f$R_1\f$,\f$T_1\f$ ), it should be possible to compute need to know the position and orientation of the second camera relative to the first camera. This is what the described function does. It computes ( \f$R\f$,\f$T\f$ ) so that: -\f[R_2=R*R_1 -T_2=R*T_1 + T,\f] +\f[R_2=R*R_1\f] +\f[T_2=R*T_1 + T,\f] Optionally, it computes the essential matrix E: diff --git a/modules/calib3d/misc/java/test/Calib3dTest.java b/modules/calib3d/misc/java/test/Calib3dTest.java index add668f190..67193d9586 100644 --- a/modules/calib3d/misc/java/test/Calib3dTest.java +++ b/modules/calib3d/misc/java/test/Calib3dTest.java @@ -499,7 +499,7 @@ public class Calib3dTest extends OpenCVTestCase { } public void testSolvePnPListOfPoint3ListOfPointMatMatMatMat() { - Mat intrinsics = Mat.eye(3, 3, CvType.CV_32F); + Mat intrinsics = Mat.eye(3, 3, CvType.CV_64F); intrinsics.put(0, 0, 400); intrinsics.put(1, 1, 400); intrinsics.put(0, 2, 640 / 2); diff --git a/modules/calib3d/src/compat_ptsetreg.cpp b/modules/calib3d/src/compat_ptsetreg.cpp index 774129e421..6e67000b3b 100644 --- a/modules/calib3d/src/compat_ptsetreg.cpp +++ b/modules/calib3d/src/compat_ptsetreg.cpp @@ -313,11 +313,7 @@ void CvLevMarq::step() if( !err ) completeSymm( _JtJN, 
completeSymmFlag ); -#if 1 _JtJN.diag() *= 1. + lambda; -#else - _JtJN.diag() += lambda; -#endif solve(_JtJN, _JtErr, nonzero_param, solveMethod); int j = 0; diff --git a/modules/calib3d/src/fisheye.cpp b/modules/calib3d/src/fisheye.cpp index df68cf1905..f371a615aa 100644 --- a/modules/calib3d/src/fisheye.cpp +++ b/modules/calib3d/src/fisheye.cpp @@ -548,19 +548,6 @@ void cv::fisheye::estimateNewCameraMatrixForUndistortRectify(InputArray K, Input pptr[6] = Vec2d(0, h); pptr[7] = Vec2d(0, h/2); -#if 0 - const int N = 10; - cv::Mat points(1, N * 4, CV_64FC2); - Vec2d* pptr = points.ptr(); - for(int i = 0, k = 0; i < 10; ++i) - { - pptr[k++] = Vec2d(w/2, 0) - Vec2d(w/8, 0) + Vec2d(w/4/N*i, 0); - pptr[k++] = Vec2d(w/2, h-1) - Vec2d(w/8, h-1) + Vec2d(w/4/N*i, h-1); - pptr[k++] = Vec2d(0, h/2) - Vec2d(0, h/8) + Vec2d(0, h/4/N*i); - pptr[k++] = Vec2d(w-1, h/2) - Vec2d(w-1, h/8) + Vec2d(w-1, h/4/N*i); - } -#endif - fisheye::undistortPoints(points, points, K, D, R); cv::Scalar center_mass = mean(points); cv::Vec2d cn(center_mass.val); @@ -586,17 +573,6 @@ void cv::fisheye::estimateNewCameraMatrixForUndistortRectify(InputArray K, Input maxx = std::max(maxx, std::abs(pptr[i][0]-cn[0])); } -#if 0 - double minx = -DBL_MAX, miny = -DBL_MAX, maxx = DBL_MAX, maxy = DBL_MAX; - for(size_t i = 0; i < points.total(); ++i) - { - if (i % 4 == 0) miny = std::max(miny, pptr[i][1]); - if (i % 4 == 1) maxy = std::min(maxy, pptr[i][1]); - if (i % 4 == 2) minx = std::max(minx, pptr[i][0]); - if (i % 4 == 3) maxx = std::min(maxx, pptr[i][0]); - } -#endif - double f1 = w * 0.5/(minx); double f2 = w * 0.5/(maxx); double f3 = h * 0.5 * aspect_ratio/(miny); diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp index e205580921..1e9b8ec6e4 100644 --- a/modules/calib3d/src/solvepnp.cpp +++ b/modules/calib3d/src/solvepnp.cpp @@ -77,8 +77,14 @@ bool solvePnP( InputArray _opoints, InputArray _ipoints, } else { - _rvec.create(3, 1, CV_64F); - _tvec.create(3, 1, CV_64F); + int 
mtype = CV_64F; + // use CV_32F if all PnP inputs are CV_32F and outputs are empty + if (_ipoints.depth() == _cameraMatrix.depth() && _ipoints.depth() == _opoints.depth() && + _rvec.empty() && _tvec.empty()) + mtype = _opoints.depth(); + + _rvec.create(3, 1, mtype); + _tvec.create(3, 1, mtype); } rvec = _rvec.getMat(); tvec = _tvec.getMat(); diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index cd861310b9..eb075d8c73 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -49,6 +49,7 @@ #include #include #include "opencl_kernels_calib3d.hpp" +#include "opencv2/core/hal/intrin.hpp" namespace cv { @@ -203,8 +204,8 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero); uchar val0 = tab[0 + OFS]; -#if CV_SSE2 - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); #endif for( y = 0; y < size.height-1; y += 2 ) @@ -219,71 +220,34 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0; x = 1; -#if CV_NEON - int16x8_t ftz = vdupq_n_s16 ((short) ftzero); - uint8x8_t ftz2 = vdup_n_u8 (cv::saturate_cast(ftzero*2)); - - for(; x <=size.width-9; x += 8 ) - { - uint8x8_t c0 = vld1_u8 (srow0 + x - 1); - uint8x8_t c1 = vld1_u8 (srow1 + x - 1); - uint8x8_t d0 = vld1_u8 (srow0 + x + 1); - uint8x8_t d1 = vld1_u8 (srow1 + x + 1); - - int16x8_t t0 = vreinterpretq_s16_u16 (vsubl_u8 (d0, c0)); - int16x8_t t1 = vreinterpretq_s16_u16 (vsubl_u8 (d1, c1)); - - uint8x8_t c2 = vld1_u8 (srow2 + x - 1); - uint8x8_t c3 = vld1_u8 (srow3 + x - 1); - uint8x8_t d2 = vld1_u8 (srow2 + x + 1); - uint8x8_t d3 = vld1_u8 (srow3 + x + 1); - - int16x8_t t2 = vreinterpretq_s16_u16 (vsubl_u8 (d2, c2)); - int16x8_t t3 = vreinterpretq_s16_u16 (vsubl_u8 (d3, c3)); - - int16x8_t v0 = vaddq_s16 (vaddq_s16 (t2, t0), vaddq_s16 (t1, 
t1)); - int16x8_t v1 = vaddq_s16 (vaddq_s16 (t3, t1), vaddq_s16 (t2, t2)); - - - uint8x8_t v0_u8 = vqmovun_s16 (vaddq_s16 (v0, ftz)); - uint8x8_t v1_u8 = vqmovun_s16 (vaddq_s16 (v1, ftz)); - v0_u8 = vmin_u8 (v0_u8, ftz2); - v1_u8 = vmin_u8 (v1_u8, ftz2); - vqmovun_s16 (vaddq_s16 (v1, ftz)); - - vst1_u8 (dptr0 + x, v0_u8); - vst1_u8 (dptr1 + x, v1_u8); - } -#elif CV_SSE2 +#if CV_SIMD128 if( useSIMD ) { - __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero), - ftz2 = _mm_set1_epi8(cv::saturate_cast(ftzero*2)); - for( ; x <= size.width-9; x += 8 ) + v_int16x8 ftz = v_setall_s16((short) ftzero); + v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2)); + v_int16x8 z = v_setzero_s16(); + + for(; x <= size.width-8; x += 8 ) { - __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z); - __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z); - __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z); - __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z); + v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1)); + v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1)); + v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1)); + v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1)); + v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1)); + v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1)); + v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1)); + v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1)); - d0 = _mm_sub_epi16(d0, c0); - d1 = _mm_sub_epi16(d1, c1); + v_int16x8 d0 = s00 - s01; + v_int16x8 d1 = s10 - s11; + v_int16x8 d2 = s20 - s21; + v_int16x8 d3 = s30 - s31; - __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); - __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z); - __m128i d2 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); - __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z); + v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z)); + v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z)); - d2 = _mm_sub_epi16(d2, c2); - d3 = _mm_sub_epi16(d3, c3); - - __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1))); - __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2))); - v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz)); - v0 = _mm_min_epu8(v0, ftz2); - - _mm_storel_epi64((__m128i*)(dptr0 + x), v0); - _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0)); + v_pack_store(dptr0 + x, v0); + v_pack_store(dptr1 + x, v1); } } #endif @@ -299,18 +263,18 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) } } -#if CV_NEON - uint8x16_t val0_16 = vdupq_n_u8 (val0); -#endif - for( ; y < size.height; y++ ) { uchar* dptr = dst.ptr(y); x = 0; - #if CV_NEON - for(; x <= size.width-16; x+=16 ) - vst1q_u8 (dptr + x, val0_16); - #endif +#if CV_SIMD128 + if( useSIMD ) + { + v_uint8x16 val0_16 = v_setall_u8(val0); + for(; x <= size.width-16; x+=16 ) + v_store(dptr + x, val0_16); + } +#endif for(; x < size.width; x++ ) dptr[x] = val0; } @@ -320,8 +284,8 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) static const int DISPARITY_SHIFT_16S = 4; static const int DISPARITY_SHIFT_32S = 8; -#if CV_SSE2 -static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, +#if CV_SIMD128 +static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, Mat& disp, Mat& cost, StereoBMParams& state, uchar* buf, int _dy0, int _dy1 ) { @@ -354,7 +318,7 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, int coststep = cost.data ? 
(int)(cost.step/sizeof(costbuf)) : 0; const int TABSZ = 256; uchar tab[TABSZ]; - const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8); + const v_int16x8 d0_8 = v_int16x8(0,1,2,3,4,5,6,7), dd_8 = v_setall_s16(8); sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN); hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); @@ -377,18 +341,20 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) { int lval = lptr[0]; - __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128(); + v_uint8x16 lv = v_setall_u8((uchar)lval); for( d = 0; d < ndisp; d += 16 ) { - __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d)); - __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d)); - __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8)); - __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv)); - _mm_store_si128((__m128i*)(cbuf + d), diff); - hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z)); - hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z)); - _mm_store_si128((__m128i*)(hsad + d), hsad_l); - _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h); + v_uint8x16 rv = v_load(rptr + d); + v_uint16x8 hsad_l = v_load(hsad + d); + v_uint16x8 hsad_h = v_load(hsad + d + 8); + v_uint8x16 diff = v_absdiff(lv, rv); + v_store(cbuf + d, diff); + v_uint16x8 diff0, diff1; + v_expand(diff, diff0, diff1); + hsad_l += diff0; + hsad_h += diff1; + v_store(hsad + d, hsad_l); + v_store(hsad + d + 8, hsad_h); } htext[y] += tab[lval]; } @@ -419,21 +385,24 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) { int lval = lptr[0]; - __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128(); + v_uint8x16 lv = v_setall_u8((uchar)lval); for( d = 0; d < ndisp; d += 16 ) { - __m128i rv = _mm_loadu_si128((const 
__m128i*)(rptr + d)); - __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d)); - __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8)); - __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d)); - __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv)); - __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z)); - _mm_store_si128((__m128i*)(cbuf + d), diff); - diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z)); - hsad_h = _mm_add_epi16(hsad_h, diff_h); - hsad_l = _mm_add_epi16(hsad_l, diff); - _mm_store_si128((__m128i*)(hsad + d), hsad_l); - _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h); + v_uint8x16 rv = v_load(rptr + d); + v_uint16x8 hsad_l = v_load(hsad + d); + v_uint16x8 hsad_h = v_load(hsad + d + 8); + v_uint8x16 cbs = v_load(cbuf_sub + d); + v_uint8x16 diff = v_absdiff(lv, rv); + v_int16x8 diff_l, diff_h, cbs_l, cbs_h; + v_store(cbuf + d, diff); + v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h); + v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h); + diff_l -= cbs_l; + diff_h -= cbs_h; + hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h); + hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l); + v_store(hsad + d, hsad_l); + v_store(hsad + d + 8, hsad_h); } htext[y] += tab[lval] - tab[lptr_sub[0]]; } @@ -450,16 +419,16 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, hsad = hsad0 + (1 - dy0)*ndisp; for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) - for( d = 0; d < ndisp; d += 16 ) + for( d = 0; d <= ndisp-16; d += 16 ) { - __m128i s0 = _mm_load_si128((__m128i*)(sad + d)); - __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8)); - __m128i t0 = _mm_load_si128((__m128i*)(hsad + d)); - __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8)); - s0 = _mm_add_epi16(s0, t0); - s1 = _mm_add_epi16(s1, t1); - _mm_store_si128((__m128i*)(sad + d), s0); - _mm_store_si128((__m128i*)(sad + d + 8), s1); + v_uint16x8 s0 = 
v_load(sad + d); + v_uint16x8 s1 = v_load(sad + d + 8); + v_uint16x8 t0 = v_load(hsad + d); + v_uint16x8 t1 = v_load(hsad + d + 8); + s0 = s0 + t0; + s1 = s1 + t1; + v_store(sad + d, s0); + v_store(sad + d + 8, s1); } int tsum = 0; for( y = -wsz2-1; y < wsz2; y++ ) @@ -471,38 +440,38 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, int minsad = INT_MAX, mind = -1; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; - __m128i minsad8 = _mm_set1_epi16(SHRT_MAX); - __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask; + v_int16x8 minsad8 = v_setall_s16(SHRT_MAX); + v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8; for( d = 0; d < ndisp; d += 16 ) { - __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d)); - __m128i u1 = _mm_load_si128((__m128i*)(hsad + d)); + v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d)); + v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d)); - __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8)); - __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8)); + v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8)); + v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8)); - __m128i usad8 = _mm_load_si128((__m128i*)(sad + d)); - __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8)); + v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d)); + v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8)); - u1 = _mm_sub_epi16(u1, u0); - v1 = _mm_sub_epi16(v1, v0); - usad8 = _mm_add_epi16(usad8, u1); - vsad8 = _mm_add_epi16(vsad8, v1); + u1 -= u0; + v1 -= v0; + usad8 += u1; + vsad8 += v1; - mask = _mm_cmpgt_epi16(minsad8, usad8); - minsad8 = _mm_min_epi16(minsad8, usad8); - mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8)); + v_int16x8 mask = minsad8 > usad8; + minsad8 = v_min(minsad8, usad8); + mind8 = v_max(mind8, (mask& d8)); - _mm_store_si128((__m128i*)(sad + d), usad8); - _mm_store_si128((__m128i*)(sad + d + 8), vsad8); + v_store(sad + d, 
v_reinterpret_as_u16(usad8)); + v_store(sad + d + 8, v_reinterpret_as_u16(vsad8)); - mask = _mm_cmpgt_epi16(minsad8, vsad8); - minsad8 = _mm_min_epi16(minsad8, vsad8); + mask = minsad8 > vsad8; + minsad8 = v_min(minsad8, vsad8); - d8 = _mm_add_epi16(d8, dd_8); - mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8)); - d8 = _mm_add_epi16(d8, dd_8); + d8 = d8 + dd_8; + mind8 = v_max(mind8, (mask & d8)); + d8 = d8 + dd_8; } tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; @@ -513,8 +482,8 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, } ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8]; - _mm_store_si128((__m128i*)minsad_buf, minsad8); - _mm_store_si128((__m128i*)mind_buf, mind8); + v_store(minsad_buf, v_reinterpret_as_u16(minsad8)); + v_store(mind_buf, v_reinterpret_as_u16(mind8)); for( d = 0; d < 8; d++ ) if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d])) { @@ -525,27 +494,27 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, if( uniquenessRatio > 0 ) { int thresh = minsad + (minsad * uniquenessRatio/100); - __m128i thresh4 = _mm_set1_epi32(thresh + 1); - __m128i d1 = _mm_set1_epi32(mind-1), d2 = _mm_set1_epi32(mind+1); - __m128i dd_4 = _mm_set1_epi32(4); - __m128i d4 = _mm_set_epi32(3,2,1,0); - __m128i z = _mm_setzero_si128(); + v_int32x4 thresh4 = v_setall_s32(thresh + 1); + v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1); + v_int32x4 dd_4 = v_setall_s32(4); + v_int32x4 d4 = v_int32x4(0,1,2,3); + v_int32x4 mask4; for( d = 0; d < ndisp; d += 8 ) { - __m128i usad4 = _mm_loadu_si128((__m128i*)(sad + d)); - __m128i vsad4 = _mm_unpackhi_epi16(usad4, z); - usad4 = _mm_unpacklo_epi16(usad4, z); - mask = _mm_cmpgt_epi32( thresh4, usad4); - mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi32(d1,d4), _mm_cmpgt_epi32(d4,d2))); - if( _mm_movemask_epi8(mask) ) + v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d)); + v_int32x4 sad4_l, sad4_h; + 
v_expand(sad8, sad4_l, sad4_h); + mask4 = thresh4 > sad4_l; + mask4 = mask4 & ((d1 > d4) | (d4 > d2)); + if( v_signmask(mask4) ) break; - d4 = _mm_add_epi16(d4, dd_4); - mask = _mm_cmpgt_epi32( thresh4, vsad4); - mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi32(d1,d4), _mm_cmpgt_epi32(d4,d2))); - if( _mm_movemask_epi8(mask) ) + d4 += dd_4; + mask4 = thresh4 > sad4_h; + mask4 = mask4 & ((d1 > d4) | (d4 > d2)); + if( v_signmask(mask4) ) break; - d4 = _mm_add_epi16(d4, dd_4); + d4 += dd_4; } if( d < ndisp ) { @@ -570,7 +539,7 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right, template static void -findStereoCorrespondenceBM_( const Mat& left, const Mat& right, +findStereoCorrespondenceBM( const Mat& left, const Mat& right, Mat& disp, Mat& cost, const StereoBMParams& state, uchar* buf, int _dy0, int _dy1, const int disp_shift ) { @@ -590,13 +559,12 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, int uniquenessRatio = state.uniquenessRatio; mType FILTERED = (mType)((mindisp - 1) << disp_shift); -#if CV_NEON - CV_Assert (ndisp % 8 == 0); - int32_t d0_4_temp [4]; - for (int i = 0; i < 4; i ++) - d0_4_temp[i] = i; - int32x4_t d0_4 = vld1q_s32 (d0_4_temp); - int32x4_t dd_4 = vdupq_n_s32 (4); +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); + if( useSIMD ) + { + CV_Assert (ndisp % 8 == 0); + } #endif int *sad, *hsad0, *hsad, *hsad_sub, *htext; @@ -633,29 +601,46 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) { int lval = lptr[0]; - #if CV_NEON - int16x8_t lv = vdupq_n_s16 ((int16_t)lval); - - for( d = 0; d < ndisp; d += 8 ) + d = 0; +#if CV_SIMD128 + if( useSIMD ) { - int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d))); - int32x4_t hsad_l = vld1q_s32 (hsad + d); - int32x4_t hsad_h = vld1q_s32 (hsad + d + 4); - int16x8_t diff = vabdq_s16 (lv, rv); - vst1_u8 (cbuf + d, 
vmovn_u16(vreinterpretq_u16_s16(diff))); - hsad_l = vaddq_s32 (hsad_l, vmovl_s16(vget_low_s16 (diff))); - hsad_h = vaddq_s32 (hsad_h, vmovl_s16(vget_high_s16 (diff))); - vst1q_s32 ((hsad + d), hsad_l); - vst1q_s32 ((hsad + d + 4), hsad_h); + v_uint8x16 lv = v_setall_u8((uchar)lval); + + for( ; d <= ndisp - 16; d += 16 ) + { + v_uint8x16 rv = v_load(rptr + d); + v_int32x4 hsad_0 = v_load(hsad + d); + v_int32x4 hsad_1 = v_load(hsad + d + 4); + v_int32x4 hsad_2 = v_load(hsad + d + 8); + v_int32x4 hsad_3 = v_load(hsad + d + 12); + v_uint8x16 diff = v_absdiff(lv, rv); + v_store(cbuf + d, diff); + + v_uint16x8 diff0, diff1; + v_uint32x4 diff00, diff01, diff10, diff11; + v_expand(diff, diff0, diff1); + v_expand(diff0, diff00, diff01); + v_expand(diff1, diff10, diff11); + + hsad_0 += v_reinterpret_as_s32(diff00); + hsad_1 += v_reinterpret_as_s32(diff01); + hsad_2 += v_reinterpret_as_s32(diff10); + hsad_3 += v_reinterpret_as_s32(diff11); + + v_store(hsad + d, hsad_0); + v_store(hsad + d + 4, hsad_1); + v_store(hsad + d + 8, hsad_2); + v_store(hsad + d + 12, hsad_3); + } } - #else - for( d = 0; d < ndisp; d++ ) +#endif + for( ; d < ndisp; d++ ) { int diff = std::abs(lval - rptr[d]); cbuf[d] = (uchar)diff; hsad[d] = (int)(hsad[d] + diff); } - #endif htext[y] += tab[lval]; } } @@ -685,31 +670,53 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) { int lval = lptr[0]; - #if CV_NEON - int16x8_t lv = vdupq_n_s16 ((int16_t)lval); - for( d = 0; d < ndisp; d += 8 ) + d = 0; +#if CV_SIMD128 + if( useSIMD ) { - int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d))); - int32x4_t hsad_l = vld1q_s32 (hsad + d); - int32x4_t hsad_h = vld1q_s32 (hsad + d + 4); - int16x8_t cbs = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (cbuf_sub + d))); - int16x8_t diff = vabdq_s16 (lv, rv); - int32x4_t diff_h = vsubl_s16 (vget_high_s16 (diff), vget_high_s16 (cbs)); - int32x4_t diff_l = vsubl_s16 (vget_low_s16 
(diff), vget_low_s16 (cbs)); - vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff))); - hsad_h = vaddq_s32 (hsad_h, diff_h); - hsad_l = vaddq_s32 (hsad_l, diff_l); - vst1q_s32 ((hsad + d), hsad_l); - vst1q_s32 ((hsad + d + 4), hsad_h); + v_uint8x16 lv = v_setall_u8((uchar)lval); + for( ; d <= ndisp - 16; d += 16 ) + { + v_uint8x16 rv = v_load(rptr + d); + v_int32x4 hsad_0 = v_load(hsad + d); + v_int32x4 hsad_1 = v_load(hsad + d + 4); + v_int32x4 hsad_2 = v_load(hsad + d + 8); + v_int32x4 hsad_3 = v_load(hsad + d + 12); + v_uint8x16 cbs = v_load(cbuf_sub + d); + v_uint8x16 diff = v_absdiff(lv, rv); + v_store(cbuf + d, diff); + + v_uint16x8 diff0, diff1, cbs0, cbs1; + v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11; + v_expand(diff, diff0, diff1); + v_expand(cbs, cbs0, cbs1); + v_expand(v_reinterpret_as_s16(diff0), diff00, diff01); + v_expand(v_reinterpret_as_s16(diff1), diff10, diff11); + v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01); + v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11); + + v_int32x4 diff_0 = diff00 - cbs00; + v_int32x4 diff_1 = diff01 - cbs01; + v_int32x4 diff_2 = diff10 - cbs10; + v_int32x4 diff_3 = diff11 - cbs11; + hsad_0 += diff_0; + hsad_1 += diff_1; + hsad_2 += diff_2; + hsad_3 += diff_3; + + v_store(hsad + d, hsad_0); + v_store(hsad + d + 4, hsad_1); + v_store(hsad + d + 8, hsad_2); + v_store(hsad + d + 12, hsad_3); + } } - #else - for( d = 0; d < ndisp; d++ ) +#endif + for( ; d < ndisp; d++ ) { int diff = std::abs(lval - rptr[d]); cbuf[d] = (uchar)diff; hsad[d] = hsad[d] + diff - cbuf_sub[d]; } - #endif htext[y] += tab[lval] - tab[lptr_sub[0]]; } @@ -726,22 +733,25 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, hsad = hsad0 + (1 - dy0)*ndisp; for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) { - #if CV_NEON - for( d = 0; d <= ndisp-8; d += 8 ) + d = 0; +#if CV_SIMD128 + if( useSIMD ) { - int32x4_t s0 = vld1q_s32 (sad + d); - int32x4_t s1 = vld1q_s32 (sad + d + 4); - int32x4_t t0 = vld1q_s32 
(hsad + d); - int32x4_t t1 = vld1q_s32 (hsad + d + 4); - s0 = vaddq_s32 (s0, t0); - s1 = vaddq_s32 (s1, t1); - vst1q_s32 (sad + d, s0); - vst1q_s32 (sad + d + 4, s1); + for( d = 0; d <= ndisp-8; d += 8 ) + { + v_int32x4 s0 = v_load(sad + d); + v_int32x4 s1 = v_load(sad + d + 4); + v_int32x4 t0 = v_load(hsad + d); + v_int32x4 t1 = v_load(hsad + d + 4); + s0 += t0; + s1 += t1; + v_store(sad + d, s0); + v_store(sad + d + 4, s1); + } } - #else - for( d = 0; d < ndisp; d++ ) +#endif + for( ; d < ndisp; d++ ) sad[d] = (int)(sad[d] + hsad[d]); - #endif } int tsum = 0; for( y = -wsz2-1; y < wsz2; y++ ) @@ -753,62 +763,55 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, int minsad = INT_MAX, mind = -1; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; - #if CV_NEON - int32x4_t minsad4 = vdupq_n_s32 (INT_MAX); - int32x4_t mind4 = vdupq_n_s32(0), d4 = d0_4; - - for( d = 0; d <= ndisp-8; d += 8 ) + d = 0; +#if CV_SIMD128 + if( useSIMD ) { - int32x4_t u0 = vld1q_s32 (hsad_sub + d); - int32x4_t u1 = vld1q_s32 (hsad + d); + v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3); + v_int32x4 dd_4 = v_setall_s32(4); + v_int32x4 minsad4 = v_setall_s32(INT_MAX); + v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4; - int32x4_t v0 = vld1q_s32 (hsad_sub + d + 4); - int32x4_t v1 = vld1q_s32 (hsad + d + 4); + for( ; d <= ndisp - 8; d += 8 ) + { + v_int32x4 u0 = v_load(hsad_sub + d); + v_int32x4 u1 = v_load(hsad + d); - int32x4_t usad4 = vld1q_s32(sad + d); - int32x4_t vsad4 = vld1q_s32(sad + d + 4); + v_int32x4 v0 = v_load(hsad_sub + d + 4); + v_int32x4 v1 = v_load(hsad + d + 4); - u1 = vsubq_s32 (u1, u0); - v1 = vsubq_s32 (v1, v0); - usad4 = vaddq_s32 (usad4, u1); - vsad4 = vaddq_s32 (vsad4, v1); + v_int32x4 usad4 = v_load(sad + d); + v_int32x4 vsad4 = v_load(sad + d + 4); - uint32x4_t mask = vcgtq_s32 (minsad4, usad4); - minsad4 = vminq_s32 (minsad4, usad4); - mind4 = vbslq_s32(mask, d4, mind4); + u1 -= u0; + v1 -= v0; + usad4 += u1; + 
vsad4 += v1; - vst1q_s32 (sad + d, usad4); - vst1q_s32 (sad + d + 4, vsad4); - d4 = vaddq_s32 (d4, dd_4); + v_store(sad + d, usad4); + v_store(sad + d + 4, vsad4); - mask = vcgtq_s32 (minsad4, vsad4); - minsad4 = vminq_s32 (minsad4, vsad4); - mind4 = vbslq_s32(mask, d4, mind4); + v_int32x4 mask = minsad4 > usad4; + minsad4 = v_min(minsad4, usad4); + mind4 = v_select(mask, d4, mind4); + d4 += dd_4; - d4 = vaddq_s32 (d4, dd_4); + mask = minsad4 > vsad4; + minsad4 = v_min(minsad4, vsad4); + mind4 = v_select(mask, d4, mind4); + d4 += dd_4; + } + int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4]; + v_store(minsad_buf, minsad4); + v_store(mind_buf, mind4); + if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; } + if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; } + if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; } + if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; } } - int32x2_t mind4_h = vget_high_s32 (mind4); - int32x2_t mind4_l = vget_low_s32 (mind4); - int32x2_t minsad4_h = vget_high_s32 (minsad4); - int32x2_t minsad4_l = vget_low_s32 (minsad4); - - uint32x2_t mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l))); - mind4_h = vbsl_s32 (mask, mind4_h, mind4_l); - minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l); - - mind4_l = vext_s32 (mind4_h,mind4_h,1); - minsad4_l = vext_s32 (minsad4_h,minsad4_h,1); - - mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l))); - mind4_h = vbsl_s32 (mask, mind4_h, mind4_l); - minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l); - - mind = (int) vget_lane_s32 (mind4_h, 0); - minsad = sad[mind]; - - #else - 
for( d = 0; d < ndisp; d++ ) +#endif + for( ; d < ndisp; d++ ) { int currsad = sad[d] + hsad[d] - hsad_sub[d]; sad[d] = currsad; @@ -818,7 +821,6 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, mind = d; } } - #endif tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; if( tsum < textureThreshold ) @@ -855,19 +857,6 @@ findStereoCorrespondenceBM_( const Mat& left, const Mat& right, } } -static void -findStereoCorrespondenceBM( const Mat& left, const Mat& right, - Mat& disp, Mat& cost, const StereoBMParams& state, - uchar* buf, int _dy0, int _dy1 ) -{ - if(disp.type() == CV_16S) - findStereoCorrespondenceBM_(left, right, disp, cost, state, - buf, _dy0, _dy1, DISPARITY_SHIFT_16S ); - else - findStereoCorrespondenceBM_(left, right, disp, cost, state, - buf, _dy0, _dy1, DISPARITY_SHIFT_32S ); -} - #ifdef HAVE_OPENCL static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state) { @@ -972,6 +961,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody bool _useShorts, Rect _validDisparityRect, Mat& _slidingSumBuf, Mat& _cost ) { + CV_Assert( _disp.type() == CV_16S || _disp.type() == CV_32S ); left = &_left; right = &_right; disp = &_disp; state = _state; nstripes = _nstripes; stripeBufSize = _stripeBufSize; @@ -979,6 +969,9 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody validDisparityRect = _validDisparityRect; slidingSumBuf = &_slidingSumBuf; cost = &_cost; +#if CV_SIMD128 + useSIMD = hasSIMD128(); +#endif } void operator()( const Range& range ) const @@ -1012,12 +1005,19 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody Mat disp_i = disp->rowRange(row0, row1); Mat cost_i = state->disp12MaxDiff >= 0 ? 
cost->rowRange(row0, row1) : Mat(); -#if CV_SSE2 - if( useShorts ) - findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 ); +#if CV_SIMD128 + if( useSIMD && useShorts ) + { + findStereoCorrespondenceBM_SIMD( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 ); + } else #endif - findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 ); + { + if( disp_i.type() == CV_16S ) + findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1, DISPARITY_SHIFT_16S ); + else + findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1, DISPARITY_SHIFT_32S ); + } if( state->disp12MaxDiff >= 0 ) validateDisparity( disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff ); @@ -1043,6 +1043,7 @@ protected: size_t stripeBufSize; bool useShorts; Rect validDisparityRect; + bool useSIMD; }; class StereoBMImpl : public StereoBM @@ -1168,12 +1169,7 @@ public: if( params.speckleRange >= 0 && params.speckleWindowSize > 0 ) bufSize2 = width*height*(sizeof(Point_) + sizeof(int) + sizeof(uchar)); -#if CV_SSE2 - bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2); -#else - const bool useShorts = false; -#endif - + bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21; const double SAD_overhead_coeff = 10.0; double N0 = 8000000 / (useShorts ? 
1 : 4); // approx tbb's min number instructions reasonable for one thread double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height); diff --git a/modules/calib3d/src/triangulate.cpp b/modules/calib3d/src/triangulate.cpp index 33c7fd774d..5bdf10e4b8 100644 --- a/modules/calib3d/src/triangulate.cpp +++ b/modules/calib3d/src/triangulate.cpp @@ -114,52 +114,6 @@ cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvMa cvmSet(points4D,2,i,matrV(3,2));/* Z */ cvmSet(points4D,3,i,matrV(3,3));/* W */ } - -#if 0 - double err = 0; - /* Points was reconstructed. Try to reproject points */ - /* We can compute reprojection error if need */ - { - int i; - CvMat point3D; - double point3D_dat[4]; - point3D = cvMat(4,1,CV_64F,point3D_dat); - - CvMat point2D; - double point2D_dat[3]; - point2D = cvMat(3,1,CV_64F,point2D_dat); - - for( i = 0; i < numPoints; i++ ) - { - double W = cvmGet(points4D,3,i); - - point3D_dat[0] = cvmGet(points4D,0,i)/W; - point3D_dat[1] = cvmGet(points4D,1,i)/W; - point3D_dat[2] = cvmGet(points4D,2,i)/W; - point3D_dat[3] = 1; - - /* !!! 
Project this point for each camera */ - for( int currCamera = 0; currCamera < 2; currCamera++ ) - { - cvMatMul(projMatrs[currCamera], &point3D, &point2D); - - float x,y; - float xr,yr,wr; - x = (float)cvmGet(projPoints[currCamera],0,i); - y = (float)cvmGet(projPoints[currCamera],1,i); - - wr = (float)point2D_dat[2]; - xr = (float)(point2D_dat[0]/wr); - yr = (float)(point2D_dat[1]/wr); - - float deltaX,deltaY; - deltaX = (float)fabs(x-xr); - deltaY = (float)fabs(y-yr); - err += deltaX*deltaX + deltaY*deltaY; - } - } - } -#endif } diff --git a/modules/calib3d/test/test_chesscorners.cpp b/modules/calib3d/test/test_chesscorners.cpp index 781eec2ffb..64a2d69c37 100644 --- a/modules/calib3d/test/test_chesscorners.cpp +++ b/modules/calib3d/test/test_chesscorners.cpp @@ -268,14 +268,6 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename ) #ifndef WRITE_POINTS double err = calcError(v, expected); -#if 0 - if( err > rough_success_error_level ) - { - ts.printf( cvtest::TS::LOG, "bad accuracy of corner guesses\n" ); - ts.set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); - continue; - } -#endif max_rough_error = MAX( max_rough_error, err ); #endif if( pattern == CHESSBOARD ) @@ -287,14 +279,12 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename ) err = calcError(v, expected); sum_error += err; count++; -#if 1 if( err > precise_success_error_level ) { ts->printf( cvtest::TS::LOG, "Image %s: bad accuracy of adjusted corners %f\n", img_file.c_str(), err ); ts->set_failed_test_info( cvtest::TS::FAIL_BAD_ACCURACY ); return; } -#endif ts->printf(cvtest::TS::LOG, "Error on %s is %f\n", img_file.c_str(), err); max_precise_error = MAX( max_precise_error, err ); #endif diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 0485a08ad3..41da825457 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,7 +1,7 @@ set(the_description "The Core Functionality") ocv_add_module(core "${OPENCV_HAL_LINKER_LIBS}" - 
PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}" + PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}" "${LAPACK_LIBRARIES}" OPTIONAL opencv_cudev WRAP java python) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 699b1667b4..9e2b1ed7fd 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -188,8 +188,16 @@ enum CpuFeatures { # if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500) # ifdef _MSC_VER # include +# if defined(_M_X64) +# define CV_POPCNT_U64 _mm_popcnt_u64 +# endif +# define CV_POPCNT_U32 _mm_popcnt_u32 # else # include +# if defined(__x86_64__) +# define CV_POPCNT_U64 __builtin_popcountll +# endif +# define CV_POPCNT_U32 __builtin_popcount # endif # define CV_POPCNT 1 # endif @@ -361,6 +369,16 @@ Cv64suf; # define CV_EXPORTS #endif +#ifndef CV_DEPRECATED +# if defined(__GNUC__) +# define CV_DEPRECATED __attribute__ ((deprecated)) +# elif defined(_MSC_VER) +# define CV_DEPRECATED __declspec(deprecated) +# else +# define CV_DEPRECATED +# endif +#endif + #ifndef CV_EXTERN_C # ifdef __cplusplus # define CV_EXTERN_C extern "C" diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 93ca397817..e15c97d528 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -149,7 +149,7 @@ Element-wise binary and unary operations. Most of these operations return only one value. -- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum +- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select ### Other math @@ -574,6 +574,49 @@ Scheme: For 32-bit integer and 32-bit floating point types. 
*/ OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) +static const unsigned char popCountTable[] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +/** @brief Count the 1 bits in the vector and return 4 values + +Scheme: +@code +{A1 A2 A3 ...} => popcount(A1) +@endcode +Any types but result will be in v_uint32x4*/ +template inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a) +{ + v_uint8x16 b; + b = v_reinterpret_as_u8(a); + for( int i = 0; i < v_uint8x16::nlanes; i++ ) + { + b.s[i] = popCountTable[b.s[i]]; + } + v_uint32x4 c; + for( int i = 0; i < v_uint32x4::nlanes; i++ ) + { + c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3]; + } + return c; +} + + //! 
@cond IGNORED template inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index b000733a5f..2bcff2bc15 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -813,6 +813,22 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32) +#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + uint8x16_t t = vcntq_u8(cast(a.val)); \ + uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \ + uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \ + return v_uint32x4(t1); \ +} + +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16) +OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32) + inline int v_signmask(const v_uint8x16& a) { int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index fc81dac35d..9ff10c9b47 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1121,6 +1121,28 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) +#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ 
+{ \ + __m128i m1 = _mm_set1_epi32(0x55555555); \ + __m128i m2 = _mm_set1_epi32(0x33333333); \ + __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \ + __m128i p = a.val; \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \ + p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \ + p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \ + p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \ + return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \ +} + +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8) +OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4) + #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ inline int v_signmask(const _Tpvec& a) \ { \ diff --git a/modules/core/misc/java/src/java/core+MatOfRect2d.java b/modules/core/misc/java/src/java/core+MatOfRect2d.java new file mode 100644 index 0000000000..71c4b1aef6 --- /dev/null +++ b/modules/core/misc/java/src/java/core+MatOfRect2d.java @@ -0,0 +1,81 @@ +package org.opencv.core; + +import java.util.Arrays; +import java.util.List; + + +public class MatOfRect2d extends Mat { + // 64FC4 + private static final int _depth = CvType.CV_64F; + private static final int _channels = 4; + + public MatOfRect2d() { + super(); + } + + protected MatOfRect2d(long addr) { + super(addr); + if( !empty() && checkVector(_channels, _depth) < 0 ) + throw new IllegalArgumentException("Incompatible Mat"); + //FIXME: do we need release() here? 
+ } + + public static MatOfRect2d fromNativeAddr(long addr) { + return new MatOfRect2d(addr); + } + + public MatOfRect2d(Mat m) { + super(m, Range.all()); + if( !empty() && checkVector(_channels, _depth) < 0 ) + throw new IllegalArgumentException("Incompatible Mat"); + //FIXME: do we need release() here? + } + + public MatOfRect2d(Rect2d...a) { + super(); + fromArray(a); + } + + public void alloc(int elemNumber) { + if(elemNumber>0) + super.create(elemNumber, 1, CvType.makeType(_depth, _channels)); + } + + public void fromArray(Rect2d...a) { + if(a==null || a.length==0) + return; + int num = a.length; + alloc(num); + double buff[] = new double[num * _channels]; + for(int i=0; i lr) { + Rect2d ap[] = lr.toArray(new Rect2d[0]); + fromArray(ap); + } + + public List toList() { + Rect2d[] ar = toArray(); + return Arrays.asList(ar); + } +} diff --git a/modules/core/misc/java/src/java/core+Rect2d.java b/modules/core/misc/java/src/java/core+Rect2d.java new file mode 100644 index 0000000000..cb83a97727 --- /dev/null +++ b/modules/core/misc/java/src/java/core+Rect2d.java @@ -0,0 +1,100 @@ +package org.opencv.core; + +//javadoc:Rect2d_ +public class Rect2d { + + public double x, y, width, height; + + public Rect2d(double x, double y, double width, double height) { + this.x = x; + this.y = y; + this.width = width; + this.height = height; + } + + public Rect2d() { + this(0, 0, 0, 0); + } + + public Rect2d(Point p1, Point p2) { + x = (double) (p1.x < p2.x ? p1.x : p2.x); + y = (double) (p1.y < p2.y ? p1.y : p2.y); + width = (double) (p1.x > p2.x ? p1.x : p2.x) - x; + height = (double) (p1.y > p2.y ? p1.y : p2.y) - y; + } + + public Rect2d(Point p, Size s) { + this((double) p.x, (double) p.y, (double) s.width, (double) s.height); + } + + public Rect2d(double[] vals) { + set(vals); + } + + public void set(double[] vals) { + if (vals != null) { + x = vals.length > 0 ? (double) vals[0] : 0; + y = vals.length > 1 ? (double) vals[1] : 0; + width = vals.length > 2 ? 
(double) vals[2] : 0; + height = vals.length > 3 ? (double) vals[3] : 0; + } else { + x = 0; + y = 0; + width = 0; + height = 0; + } + } + + public Rect2d clone() { + return new Rect2d(x, y, width, height); + } + + public Point tl() { + return new Point(x, y); + } + + public Point br() { + return new Point(x + width, y + height); + } + + public Size size() { + return new Size(width, height); + } + + public double area() { + return width * height; + } + + public boolean contains(Point p) { + return x <= p.x && p.x < x + width && y <= p.y && p.y < y + height; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + long temp; + temp = Double.doubleToLongBits(height); + result = prime * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(width); + result = prime * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(x); + result = prime * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(y); + result = prime * result + (int) (temp ^ (temp >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof Rect2d)) return false; + Rect2d it = (Rect2d) obj; + return x == it.x && y == it.y && width == it.width && height == it.height; + } + + @Override + public String toString() { + return "{" + x + ", " + y + ", " + width + "x" + height + "}"; + } +} diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp index b2b6dc3626..345ca42dc6 100644 --- a/modules/core/src/hal_internal.cpp +++ b/modules/core/src/hal_internal.cpp @@ -98,7 +98,7 @@ set_value(fptype *dst, size_t dst_ld, fptype value, size_t m, size_t n) template static inline int lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* info) { - int lda = a_step / sizeof(fptype), sign = 0; + int lda = (int)(a_step / sizeof(fptype)), sign = 0; int* piv = new int[m]; transpose_square_inplace(a, lda, m); @@ 
-114,7 +114,7 @@ lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* } else { - int ldb = b_step / sizeof(fptype); + int ldb = (int)(b_step / sizeof(fptype)); fptype* tmpB = new fptype[m*n]; transpose(b, ldb, tmpB, m, m, n); @@ -153,7 +153,7 @@ template static inline int lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, bool* info) { int lapackStatus = 0; - int lda = a_step / sizeof(fptype); + int lda = (int)(a_step / sizeof(fptype)); char L[] = {'L', '\0'}; if(b) @@ -167,7 +167,7 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n } else { - int ldb = b_step / sizeof(fptype); + int ldb = (int)(b_step / sizeof(fptype)); fptype* tmpB = new fptype[m*n]; transpose(b, ldb, tmpB, m, m, n); @@ -197,9 +197,9 @@ lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n template static inline int lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype* vt, size_t v_step, int m, int n, int flags, int* info) { - int lda = a_step / sizeof(fptype); - int ldv = v_step / sizeof(fptype); - int ldu = u_step / sizeof(fptype); + int lda = (int)(a_step / sizeof(fptype)); + int ldv = (int)(v_step / sizeof(fptype)); + int ldu = (int)(u_step / sizeof(fptype)); int lwork = -1; int* iworkBuf = new int[8*std::min(m, n)]; fptype work1 = 0; @@ -256,7 +256,7 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype template static inline int lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_step, fptype* dst, int* info) { - int lda = a_step / sizeof(fptype); + int lda = (int)(a_step / sizeof(fptype)); char mode[] = { 'N', '\0' }; if(m < n) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -303,7 +303,7 @@ lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_ste { std::vector tmpBMemHolder(m*k); fptype* tmpB = &tmpBMemHolder.front(); - int ldb = b_step / sizeof(fptype); + int ldb = 
(int)(b_step / sizeof(fptype)); transpose(b, ldb, tmpB, m, m, k); if (typeid(fptype) == typeid(float)) @@ -357,10 +357,10 @@ template static inline int lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) { - int ldsrc1 = src1_step / sizeof(fptype); - int ldsrc2 = src2_step / sizeof(fptype); - int ldsrc3 = src3_step / sizeof(fptype); - int lddst = dst_step / sizeof(fptype); + int ldsrc1 = (int)(src1_step / sizeof(fptype)); + int ldsrc2 = (int)(src2_step / sizeof(fptype)); + int ldsrc3 = (int)(src3_step / sizeof(fptype)); + int lddst = (int)(dst_step / sizeof(fptype)); int c_m, c_n, d_m; CBLAS_TRANSPOSE transA, transB; @@ -434,10 +434,10 @@ template static inline int lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) { - int ldsrc1 = src1_step / sizeof(std::complex); - int ldsrc2 = src2_step / sizeof(std::complex); - int ldsrc3 = src3_step / sizeof(std::complex); - int lddst = dst_step / sizeof(std::complex); + int ldsrc1 = (int)(src1_step / sizeof(std::complex)); + int ldsrc2 = (int)(src2_step / sizeof(std::complex)); + int ldsrc3 = (int)(src3_step / sizeof(std::complex)); + int lddst = (int)(dst_step / sizeof(std::complex)); int c_m, c_n, d_m; CBLAS_TRANSPOSE transA, transB; std::complex cAlpha(alpha, 0.0); diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp index bf16c2f398..946875f5b1 100644 --- a/modules/core/src/persistence.cpp +++ b/modules/core/src/persistence.cpp @@ -1475,6 +1475,26 @@ icvYMLParseValue( CvFileStorage* fs, char* ptr, CvFileNode* node, ptr++; value_type |= CV_NODE_USER; } + if ( d == '<') //support of full type heading from YAML 1.2 + { + const char* yamlTypeHeading = "' && 
(size_t)(typeEndPtr - ptr) > headingLenght ) + { + if ( memcmp(ptr, yamlTypeHeading, headingLenght) == 0 ) + { + value_type |= CV_NODE_USER; + *typeEndPtr = ' '; + ptr += headingLenght - 1; + } + } + } endptr = ptr++; do d = *++endptr; diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 5cdae20871..39e0fa7648 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -44,6 +44,7 @@ #include "precomp.hpp" #include #include +#include "opencv2/core/hal/intrin.hpp" #include "opencl_kernels_core.hpp" @@ -4238,21 +4239,8 @@ int normHamming(const uchar* a, int n) { int i = 0; int result = 0; -#if CV_NEON - { - uint32x4_t bits = vmovq_n_u32(0); - for (; i <= n - 16; i += 16) { - uint8x16_t A_vec = vld1q_u8 (a + i); - uint8x16_t bitsSet = vcntq_u8 (A_vec); - uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); - uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - bits = vaddq_u32(bits, bitSet4); - } - uint64x2_t bitSet2 = vpaddlq_u32 (bits); - result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); - result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); - } -#elif CV_AVX2 +#if CV_AVX2 + if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -4273,12 +4261,45 @@ int normHamming(const uchar* a, int n) _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } -#endif - for( ; i <= n - 4; i += 4 ) - result += popCountTable[a[i]] + popCountTable[a[i+1]] + - popCountTable[a[i+2]] + popCountTable[a[i+3]]; - for( ; i < n; i++ ) +#endif // CV_AVX2 + +#if CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i)); + } + } +#endif // CV_POPCNT + +#if CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = v_setzero_u32(); + for(; 
i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += v_popcount(v_load(a + i)); + } + result += v_reduce_sum(t); + } +#endif // CV_SIMD128 + + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i]] + popCountTable[a[i+1]] + + popCountTable[a[i+2]] + popCountTable[a[i+3]]; + } + for(; i < n; i++) + { result += popCountTable[a[i]]; + } return result; } @@ -4286,23 +4307,8 @@ int normHamming(const uchar* a, const uchar* b, int n) { int i = 0; int result = 0; -#if CV_NEON - { - uint32x4_t bits = vmovq_n_u32(0); - for (; i <= n - 16; i += 16) { - uint8x16_t A_vec = vld1q_u8 (a + i); - uint8x16_t B_vec = vld1q_u8 (b + i); - uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); - uint8x16_t bitsSet = vcntq_u8 (AxorB); - uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); - uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); - bits = vaddq_u32(bits, bitSet4); - } - uint64x2_t bitSet2 = vpaddlq_u32 (bits); - result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0); - result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2); - } -#elif CV_AVX2 +#if CV_AVX2 + if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -4326,12 +4332,45 @@ int normHamming(const uchar* a, const uchar* b, int n) _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); } -#endif - for( ; i <= n - 4; i += 4 ) - result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + - popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; - for( ; i < n; i++ ) +#endif // CV_AVX2 + +#if CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); + } + } +#endif // CV_POPCNT + +#if CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = 
v_setzero_u32(); + for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += v_popcount(v_load(a + i) ^ v_load(b + i)); + } + result += v_reduce_sum(t); + } +#endif // CV_SIMD128 + + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; + } + for(; i < n; i++) + { result += popCountTable[a[i] ^ b[i]]; + } return result; } diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 66b2083a37..0ec24ef1fb 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -404,6 +404,18 @@ template struct TheTest return *this; } + TheTest & test_popcount() + { + static unsigned popcountTable[] = {0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33}; + Data dataA; + R a = dataA; + + unsigned resB = (unsigned)v_reduce_sum(v_popcount(a)); + EXPECT_EQ(popcountTable[R::nlanes], resB); + + return *this; + } + TheTest & test_absdiff() { typedef typename V_RegTrait128::u_reg Ru; @@ -798,6 +810,7 @@ TEST(hal_intrin, uint8x16) { .test_min_max() .test_absdiff() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() .test_unpack() @@ -819,6 +832,7 @@ TEST(hal_intrin, int8x16) { .test_absdiff() .test_abs() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() @@ -844,6 +858,7 @@ TEST(hal_intrin, uint16x8) { .test_absdiff() .test_reduce() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() .test_unpack() @@ -870,6 +885,7 @@ TEST(hal_intrin, int16x8) { .test_abs() .test_reduce() .test_mask() + .test_popcount() 
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() @@ -894,6 +910,7 @@ TEST(hal_intrin, uint32x4) { .test_absdiff() .test_reduce() .test_mask() + .test_popcount() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_unpack() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() @@ -910,6 +927,7 @@ TEST(hal_intrin, int32x4) { .test_mul() .test_abs() .test_cmp() + .test_popcount() .test_shift<1>().test_shift<8>() .test_logic() .test_min_max() diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp index eb02a7b23d..880d5cb1d8 100644 --- a/modules/core/test/test_io.cpp +++ b/modules/core/test/test_io.cpp @@ -996,3 +996,20 @@ TEST(Core_InputOutput, filestorage_vec_vec_io) remove((fileName + formats[i]).c_str()); } } + +TEST(Core_InputOutput, filestorage_yaml_advanvced_type_heading) +{ + String content = "%YAML:1.0\n cameraMatrix: !\n" + " rows: 1\n" + " cols: 1\n" + " dt: d\n" + " data: [ 1. ]"; + + cv::FileStorage fs(content, cv::FileStorage::READ | cv::FileStorage::MEMORY); + + cv::Mat inputMatrix; + cv::Mat actualMatrix = cv::Mat::eye(1, 1, CV_64F); + fs["cameraMatrix"] >> inputMatrix; + + ASSERT_EQ(cv::norm(inputMatrix, actualMatrix, NORM_INF), 0.); +} diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index 21efde0103..1afd9cd763 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -1068,6 +1068,8 @@ namespace private: int windowSize; int partitions; + GpuMat devHist; + GpuMat devCoarseHist; }; MedianFilter::MedianFilter(int srcType, int _windowSize, int _partitions) : @@ -1099,9 +1101,8 @@ namespace // Note - these are hardcoded in the actual GPU kernel. Do not change these values. 
int histSize=256, histCoarseSize=8; - BufferPool pool(_stream); - GpuMat devHist = pool.getBuffer(1, src.cols*histSize*partitions,CV_32SC1); - GpuMat devCoarseHist = pool.getBuffer(1,src.cols*histCoarseSize*partitions,CV_32SC1); + devHist.create(1, src.cols*histSize*partitions, CV_32SC1); + devCoarseHist.create(1, src.cols*histCoarseSize*partitions, CV_32SC1); devHist.setTo(0, _stream); devCoarseHist.setTo(0, _stream); diff --git a/modules/cudaoptflow/src/cuda/pyrlk.cu b/modules/cudaoptflow/src/cuda/pyrlk.cu index 2f2865057f..e3cca57f3a 100644 --- a/modules/cudaoptflow/src/cuda/pyrlk.cu +++ b/modules/cudaoptflow/src/cuda/pyrlk.cu @@ -51,6 +51,8 @@ #include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" +#include + using namespace cv::cuda; using namespace cv::cuda::device; @@ -923,15 +925,15 @@ namespace pyrlk float x = xBase - c_halfWin_x + j + 0.5f; float y = yBase - c_halfWin_y + i + 0.5f; - I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y); + I_patch[i * patchWidth + j] = tex2D(tex_If, x, y); // Sharr Deriv - dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) - - (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1)); + dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) - + (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1)); - dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) - - (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1)); + dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) - + (3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1)); } } @@ -943,6 +945,7 @@ namespace pyrlk if (x >= cols || y >= rows) 
return; + int A11i = 0; int A12i = 0; int A22i = 0; @@ -970,7 +973,6 @@ namespace pyrlk { if (calcErr) err(y, x) = numeric_limits::max(); - return; } @@ -1014,6 +1016,7 @@ namespace pyrlk } } + float2 delta; delta.x = A12 * b2 - A22 * b1; delta.y = A12 * b1 - A11 * b2; @@ -1083,11 +1086,11 @@ namespace pyrlk funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount, level, block, stream); } - static void dense(PtrStepSzb I, PtrStepSz J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream) + static void dense(PtrStepSz I, PtrStepSz J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream) { dim3 block(16, 16); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); - Tex_I<1, uchar>::bindTexture_(I); + Tex_I<1, T>::bindTexture_(I); Tex_J<1, T>::bindTexture_(J); int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); diff --git a/modules/cudaoptflow/src/pyrlk.cpp b/modules/cudaoptflow/src/pyrlk.cpp index c7f706087b..d1704473c7 100644 --- a/modules/cudaoptflow/src/pyrlk.cpp +++ b/modules/cudaoptflow/src/pyrlk.cpp @@ -61,7 +61,7 @@ namespace pyrlk static void sparse(PtrStepSz::vec_type> I, PtrStepSz::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, int level, dim3 block, dim3 patch, cudaStream_t stream); - static void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, + static void dense(PtrStepSzf I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream); }; @@ -236,7 +236,9 @@ namespace prevPyr_.resize(maxLevel_ + 1); nextPyr_.resize(maxLevel_ + 1); - prevPyr_[0] = prevImg; + //prevPyr_[0] = prevImg; + + prevImg.convertTo(prevPyr_[0], CV_32F, stream); nextImg.convertTo(nextPyr_[0], CV_32F, stream); for (int level = 1; level <= 
maxLevel_; ++level) diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index e731dedca7..98d62a3b2c 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -44,6 +44,7 @@ The references are: #include "precomp.hpp" #include "fast_score.hpp" #include "opencl_kernels_features2d.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" #if defined _MSC_VER @@ -58,9 +59,10 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; -#if CV_SSE2 +#if CV_SIMD128 const int quarterPatternSize = patternSize/4; - (void)quarterPatternSize; + v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K); + bool hasSimd = hasSIMD128(); #endif int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); @@ -69,12 +71,6 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo threshold = std::min(std::max(threshold, 0), 255); -#if CV_SSE2 - __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K); - (void)K16; - (void)delta; - (void)t; -#endif uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 
2 : 0); @@ -99,66 +95,76 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo if( i < img.rows - 3 ) { j = 3; - #if CV_SSE2 - if( patternSize == 16 ) +#if CV_SIMD128 + if( hasSimd ) { - for(; j < img.cols - 16 - 3; j += 16, ptr += 16) + if( patternSize == 16 ) { - __m128i m0, m1; - __m128i v0 = _mm_loadu_si128((const __m128i*)ptr); - __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta); - v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta); - - __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta); - __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta); - __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta); - __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta); - m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0)); - m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1)); - m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0))); - m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2))); - m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0))); - m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3))); - m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0))); - m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0))); - m0 = _mm_or_si128(m0, m1); - int mask = _mm_movemask_epi8(m0); - if( mask == 0 ) - continue; - if( (mask & 255) == 0 ) + for(; j < img.cols - 16 - 3; j += 16, ptr += 16) { - j -= 8; - ptr -= 8; - continue; - } + v_uint8x16 v = v_load(ptr); + v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta); + v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta); - __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0; - for( k = 0; k < N; k++ ) - { - __m128i x 
= _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta); - m0 = _mm_cmpgt_epi8(x, v0); - m1 = _mm_cmpgt_epi8(v1, x); + v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta)); + v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta)); + v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta)); + v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta)); - c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0); - c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1); + v_int8x16 m0, m1; + m0 = (v0 < x0) & (v0 < x1); + m1 = (x0 < v1) & (x1 < v1); + m0 = m0 | ((v0 < x1) & (v0 < x2)); + m1 = m1 | ((x1 < v1) & (x2 < v1)); + m0 = m0 | ((v0 < x2) & (v0 < x3)); + m1 = m1 | ((x2 < v1) & (x3 < v1)); + m0 = m0 | ((v0 < x3) & (v0 < x0)); + m1 = m1 | ((x3 < v1) & (x0 < v1)); + m0 = m0 | m1; - max0 = _mm_max_epu8(max0, c0); - max1 = _mm_max_epu8(max1, c1); - } - - max0 = _mm_max_epu8(max0, max1); - int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16)); - - for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) - if(m & 1) + int mask = v_signmask(m0); + if( mask == 0 ) + continue; + if( (mask & 255) == 0 ) { - cornerpos[ncorners++] = j+k; - if(nonmax_suppression) - curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold); + j -= 8; + ptr -= 8; + continue; } + + v_int8x16 c0 = v_setzero_s8(); + v_int8x16 c1 = v_setzero_s8(); + v_uint8x16 max0 = v_setzero_u8(); + v_uint8x16 max1 = v_setzero_u8(); + for( k = 0; k < N; k++ ) + { + v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta); + m0 = v0 < x; + m1 = x < v1; + + c0 = v_sub_wrap(c0, m0) & m0; + c1 = v_sub_wrap(c1, m1) & m1; + + max0 = v_max(max0, v_reinterpret_as_u8(c0)); + max1 = v_max(max1, v_reinterpret_as_u8(c1)); + } + + max0 = v_max(max0, max1); + int m = v_signmask(K16 < max0); + + for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) + { + if(m & 1) + { + cornerpos[ncorners++] = j+k; + 
if(nonmax_suppression) + curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold); + } + } + } } } - #endif +#endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; diff --git a/modules/flann/include/opencv2/flann.hpp b/modules/flann/include/opencv2/flann.hpp index 19a98f19e5..22c6ffcf19 100644 --- a/modules/flann/include/opencv2/flann.hpp +++ b/modules/flann/include/opencv2/flann.hpp @@ -59,7 +59,7 @@ can be found in @cite Muja2009 . namespace cvflann { CV_EXPORTS flann_distance_t flann_distance_type(); - FLANN_DEPRECATED CV_EXPORTS void set_distance_type(flann_distance_t distance_type, int order); + CV_DEPRECATED CV_EXPORTS void set_distance_type(flann_distance_t distance_type, int order); } @@ -230,7 +230,7 @@ public: ::cvflann::IndexParams getParameters() { return nnIndex->getParameters(); } - FLANN_DEPRECATED const ::cvflann::IndexParams* getIndexParameters() { return nnIndex->getIndexParameters(); } + CV_DEPRECATED const ::cvflann::IndexParams* getIndexParameters() { return nnIndex->getIndexParameters(); } private: ::cvflann::Index* nnIndex; @@ -344,7 +344,7 @@ public: typedef typename L2::ElementType ElementType; typedef typename L2::ResultType DistanceType; - FLANN_DEPRECATED Index_(const Mat& dataset, const ::cvflann::IndexParams& params) + CV_DEPRECATED Index_(const Mat& dataset, const ::cvflann::IndexParams& params) { printf("[WARNING] The cv::flann::Index_ class is deperecated, use cv::flann::GenericIndex instead\n"); @@ -368,13 +368,13 @@ public: if (nnIndex_L1) nnIndex_L1->buildIndex(); if (nnIndex_L2) nnIndex_L2->buildIndex(); } - FLANN_DEPRECATED ~Index_() + CV_DEPRECATED ~Index_() { if (nnIndex_L1) delete nnIndex_L1; if (nnIndex_L2) delete nnIndex_L2; } - FLANN_DEPRECATED void knnSearch(const std::vector& query, std::vector& indices, std::vector& dists, int knn, const ::cvflann::SearchParams& searchParams) + CV_DEPRECATED void knnSearch(const std::vector& query, std::vector& indices, std::vector& dists, int knn, const ::cvflann::SearchParams& 
searchParams) { ::cvflann::Matrix m_query((ElementType*)&query[0], 1, query.size()); ::cvflann::Matrix m_indices(&indices[0], 1, indices.size()); @@ -383,7 +383,7 @@ public: if (nnIndex_L1) nnIndex_L1->knnSearch(m_query,m_indices,m_dists,knn,searchParams); if (nnIndex_L2) nnIndex_L2->knnSearch(m_query,m_indices,m_dists,knn,searchParams); } - FLANN_DEPRECATED void knnSearch(const Mat& queries, Mat& indices, Mat& dists, int knn, const ::cvflann::SearchParams& searchParams) + CV_DEPRECATED void knnSearch(const Mat& queries, Mat& indices, Mat& dists, int knn, const ::cvflann::SearchParams& searchParams) { CV_Assert(queries.type() == CvType::type()); CV_Assert(queries.isContinuous()); @@ -401,7 +401,7 @@ public: if (nnIndex_L2) nnIndex_L2->knnSearch(m_queries,m_indices,m_dists,knn, searchParams); } - FLANN_DEPRECATED int radiusSearch(const std::vector& query, std::vector& indices, std::vector& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams) + CV_DEPRECATED int radiusSearch(const std::vector& query, std::vector& indices, std::vector& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams) { ::cvflann::Matrix m_query((ElementType*)&query[0], 1, query.size()); ::cvflann::Matrix m_indices(&indices[0], 1, indices.size()); @@ -411,7 +411,7 @@ public: if (nnIndex_L2) return nnIndex_L2->radiusSearch(m_query,m_indices,m_dists,radius,searchParams); } - FLANN_DEPRECATED int radiusSearch(const Mat& query, Mat& indices, Mat& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams) + CV_DEPRECATED int radiusSearch(const Mat& query, Mat& indices, Mat& dists, DistanceType radius, const ::cvflann::SearchParams& searchParams) { CV_Assert(query.type() == CvType::type()); CV_Assert(query.isContinuous()); @@ -429,32 +429,32 @@ public: if (nnIndex_L2) return nnIndex_L2->radiusSearch(m_query,m_indices,m_dists,radius,searchParams); } - FLANN_DEPRECATED void save(String filename) + CV_DEPRECATED void save(String filename) { if 
(nnIndex_L1) nnIndex_L1->save(filename); if (nnIndex_L2) nnIndex_L2->save(filename); } - FLANN_DEPRECATED int veclen() const + CV_DEPRECATED int veclen() const { if (nnIndex_L1) return nnIndex_L1->veclen(); if (nnIndex_L2) return nnIndex_L2->veclen(); } - FLANN_DEPRECATED int size() const + CV_DEPRECATED int size() const { if (nnIndex_L1) return nnIndex_L1->size(); if (nnIndex_L2) return nnIndex_L2->size(); } - FLANN_DEPRECATED ::cvflann::IndexParams getParameters() + CV_DEPRECATED ::cvflann::IndexParams getParameters() { if (nnIndex_L1) return nnIndex_L1->getParameters(); if (nnIndex_L2) return nnIndex_L2->getParameters(); } - FLANN_DEPRECATED const ::cvflann::IndexParams* getIndexParameters() + CV_DEPRECATED const ::cvflann::IndexParams* getIndexParameters() { if (nnIndex_L1) return nnIndex_L1->getIndexParameters(); if (nnIndex_L2) return nnIndex_L2->getIndexParameters(); @@ -505,7 +505,7 @@ int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::K /** @deprecated */ template -FLANN_DEPRECATED int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::KMeansIndexParams& params) +CV_DEPRECATED int hierarchicalClustering(const Mat& features, Mat& centers, const ::cvflann::KMeansIndexParams& params) { printf("[WARNING] cv::flann::hierarchicalClustering is deprecated, use " "cv::flann::hierarchicalClustering instead\n"); diff --git a/modules/flann/include/opencv2/flann/defines.h b/modules/flann/include/opencv2/flann/defines.h index f0264f74e3..cab6ea9c0a 100644 --- a/modules/flann/include/opencv2/flann/defines.h +++ b/modules/flann/include/opencv2/flann/defines.h @@ -50,19 +50,6 @@ #endif -#ifdef FLANN_DEPRECATED -#undef FLANN_DEPRECATED -#endif -#ifdef __GNUC__ -#define FLANN_DEPRECATED __attribute__ ((deprecated)) -#elif defined(_MSC_VER) -#define FLANN_DEPRECATED __declspec(deprecated) -#else -#pragma message("WARNING: You need to implement FLANN_DEPRECATED for this compiler") -#define FLANN_DEPRECATED -#endif - - #undef 
FLANN_PLATFORM_32_BIT #undef FLANN_PLATFORM_64_BIT #if defined __amd64__ || defined __x86_64__ || defined _WIN64 || defined _M_X64 diff --git a/modules/flann/include/opencv2/flann/flann_base.hpp b/modules/flann/include/opencv2/flann/flann_base.hpp index 98c33cf6c0..98901afec1 100644 --- a/modules/flann/include/opencv2/flann/flann_base.hpp +++ b/modules/flann/include/opencv2/flann/flann_base.hpp @@ -241,7 +241,7 @@ public: /** * \brief Returns actual index */ - FLANN_DEPRECATED NNIndex* getIndex() + CV_DEPRECATED NNIndex* getIndex() { return nnIndex_; } @@ -250,7 +250,7 @@ public: * \brief Returns index parameters. * \deprecated use getParameters() instead. */ - FLANN_DEPRECATED const IndexParams* getIndexParameters() + CV_DEPRECATED const IndexParams* getIndexParameters() { return &index_params_; } diff --git a/modules/flann/include/opencv2/flann/matrix.h b/modules/flann/include/opencv2/flann/matrix.h index 51b6c6352c..f6092d134e 100644 --- a/modules/flann/include/opencv2/flann/matrix.h +++ b/modules/flann/include/opencv2/flann/matrix.h @@ -66,7 +66,7 @@ public: /** * Convenience function for deallocating the storage data. */ - FLANN_DEPRECATED void free() + CV_DEPRECATED void free() { fprintf(stderr, "The cvflann::Matrix::free() method is deprecated " "and it does not do any memory deallocation any more. You are" diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp index 9306f8ef5e..35ce8ca265 100644 --- a/modules/highgui/src/window.cpp +++ b/modules/highgui/src/window.cpp @@ -218,7 +218,7 @@ int cv::waitKey(int delay) if (use_legacy > 0) return code; #endif - return code & 0xff; + return (code != -1) ? 
(code & 0xff) : -1; } int cv::createTrackbar(const String& trackbarName, const String& winName, diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp index 6359de6e51..79805b2ed3 100644 --- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp +++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp @@ -85,7 +85,7 @@ enum ImwriteFlags { IMWRITE_JPEG_RST_INTERVAL = 4, //!< JPEG restart interval, 0 - 65535, default is 0 - no restart. IMWRITE_JPEG_LUMA_QUALITY = 5, //!< Separate luma quality level, 0 - 100, default is 0 - don't use. IMWRITE_JPEG_CHROMA_QUALITY = 6, //!< Separate chroma quality level, 0 - 100, default is 0 - don't use. - IMWRITE_PNG_COMPRESSION = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. Default value is 3. Also strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). + IMWRITE_PNG_COMPRESSION = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. If specified, strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). Default value is 1 (best speed setting). IMWRITE_PNG_STRATEGY = 17, //!< One of cv::ImwritePNGFlags, default is IMWRITE_PNG_STRATEGY_DEFAULT. IMWRITE_PNG_BILEVEL = 18, //!< Binary level PNG, 0 or 1, default is 0. IMWRITE_PXM_BINARY = 32, //!< For PPM, PGM, or PBM, it can be a binary format flag, 0 or 1. Default value is 1. diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index dff1c3d477..54e134f568 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -3744,6 +3744,7 @@ CV_EXPORTS_W int connectedComponentsWithStats(InputArray image, OutputArray labe The function retrieves contours from the binary image using the algorithm @cite Suzuki85 . 
The contours are a useful tool for shape analysis and object detection and recognition. See squares.cpp in the OpenCV sample directory. +@note Since opencv 3.2 source image is not modified by this function. @param image Source, an 8-bit single-channel image. Non-zero pixels are treated as 1's. Zero pixels remain 0's, so the image is treated as binary . You can use cv::compare, cv::inRange, cv::threshold , diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 7f3bad5012..da2e08ddb6 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -43,16 +43,98 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" namespace cv { +#if CV_AVX +// load three 8-packed float vector and deinterleave +// probably it's better to write down somewhere else +static inline void load_deinterleave(const float* ptr, __m256& a, __m256& b, __m256& c) +{ + __m256 s0 = _mm256_loadu_ps(ptr); // a0, b0, c0, a1, b1, c1, a2, b2, + __m256 s1 = _mm256_loadu_ps(ptr + 8); // c2, a3, b3, c3, a4, b4, c4, a5, + __m256 s2 = _mm256_loadu_ps(ptr + 16); // b5, c5, a6, b6, c6, a7, b7, c7, + __m256 s3 = _mm256_permute2f128_ps(s1, s2, 0x21); // a4, b4, c4, a5, b5, c5, a6, b6, + __m256 s4 = _mm256_permute2f128_ps(s2, s2, 0x33); // c6, a7, b7, c7, c6, a7, b7, c7, + + __m256 v00 = _mm256_unpacklo_ps(s0, s3); // a0, a4, b0, b4, b1, b5, c1, c5, + __m256 v01 = _mm256_unpackhi_ps(s0, s3); // c0, c4, a1, a5, a2, a6, b2, b6, + __m256 v02 = _mm256_unpacklo_ps(s1, s4); // c2, c6, a3, a7, x, x, x, x, + __m256 v03 = _mm256_unpackhi_ps(s1, s4); // b3, b7, c3, c7, x, x, x, x, + __m256 v04 = _mm256_permute2f128_ps(v02, v03, 0x20); // c2, c6, a3, a7, b3, b7, c3, c7, + __m256 v05 = _mm256_permute2f128_ps(v01, v03, 0x21); // a2, a6, b2, b6, b3, b7, c3, c7, + + __m256 v10 = _mm256_unpacklo_ps(v00, v05); // a0, a2, a4, a6, b1, b3, b5, b7, + __m256 v11 = _mm256_unpackhi_ps(v00, v05); // b0, b2, b4, b6, c1, c3, c5, c7, + __m256 
v12 = _mm256_unpacklo_ps(v01, v04); // c0, c2, c4, c6, x, x, x, x, + __m256 v13 = _mm256_unpackhi_ps(v01, v04); // a1, a3, a5, a7, x, x, x, x, + __m256 v14 = _mm256_permute2f128_ps(v11, v12, 0x20); // b0, b2, b4, b6, c0, c2, c4, c6, + __m256 v15 = _mm256_permute2f128_ps(v10, v11, 0x31); // b1, b3, b5, b7, c1, c3, c5, c7, + + __m256 v20 = _mm256_unpacklo_ps(v14, v15); // b0, b1, b2, b3, c0, c1, c2, c3, + __m256 v21 = _mm256_unpackhi_ps(v14, v15); // b4, b5, b6, b7, c4, c5, c6, c7, + __m256 v22 = _mm256_unpacklo_ps(v10, v13); // a0, a1, a2, a3, x, x, x, x, + __m256 v23 = _mm256_unpackhi_ps(v10, v13); // a4, a5, a6, a7, x, x, x, x, + + a = _mm256_permute2f128_ps(v22, v23, 0x20); // a0, a1, a2, a3, a4, a5, a6, a7, + b = _mm256_permute2f128_ps(v20, v21, 0x20); // b0, b1, b2, b3, b4, b5, b6, b7, + c = _mm256_permute2f128_ps(v20, v21, 0x31); // c0, c1, c2, c3, c4, c5, c6, c7, +} + +// realign four 3-packed vector to three 4-packed vector +static inline void v_pack4x3to3x4(const __m128i& s0, const __m128i& s1, const __m128i& s2, const __m128i& s3, __m128i& d0, __m128i& d1, __m128i& d2) +{ + d0 = _mm_or_si128(s0, _mm_slli_si128(s1, 12)); + d1 = _mm_or_si128(_mm_srli_si128(s1, 4), _mm_slli_si128(s2, 8)); + d2 = _mm_or_si128(_mm_srli_si128(s2, 8), _mm_slli_si128(s3, 4)); +} + +// separate high and low 128 bit and cast to __m128i +static inline void v_separate_lo_hi(const __m256& src, __m128i& lo, __m128i& hi) +{ + lo = _mm_castps_si128(_mm256_castps256_ps128(src)); + hi = _mm_castps_si128(_mm256_extractf128_ps(src, 1)); +} + +// interleave three 8-float vector and store +static inline void store_interleave(float* ptr, const __m256& a, const __m256& b, const __m256& c) +{ + __m128i a0, a1, b0, b1, c0, c1; + v_separate_lo_hi(a, a0, a1); + v_separate_lo_hi(b, b0, b1); + v_separate_lo_hi(c, c0, c1); + + v_uint32x4 z = v_setzero_u32(); + v_uint32x4 u0, u1, u2, u3; + v_transpose4x4(v_uint32x4(a0), v_uint32x4(b0), v_uint32x4(c0), z, u0, u1, u2, u3); + v_pack4x3to3x4(u0.val, u1.val, 
u2.val, u3.val, a0, b0, c0); + v_transpose4x4(v_uint32x4(a1), v_uint32x4(b1), v_uint32x4(c1), z, u0, u1, u2, u3); + v_pack4x3to3x4(u0.val, u1.val, u2.val, u3.val, a1, b1, c1); + +#if !defined(__GNUC__) || defined(__INTEL_COMPILER) + _mm256_storeu_ps(ptr, _mm256_setr_m128(_mm_castsi128_ps(a0), _mm_castsi128_ps(b0))); + _mm256_storeu_ps(ptr + 8, _mm256_setr_m128(_mm_castsi128_ps(c0), _mm_castsi128_ps(a1))); + _mm256_storeu_ps(ptr + 16, _mm256_setr_m128(_mm_castsi128_ps(b1), _mm_castsi128_ps(c1))); +#else + // GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()" + _mm256_storeu_ps(ptr, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(a0)), _mm_castsi128_ps(b0), 1)); + _mm256_storeu_ps(ptr + 8, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(c0)), _mm_castsi128_ps(a1), 1)); + _mm256_storeu_ps(ptr + 16, _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(b1)), _mm_castsi128_ps(c1), 1)); +#endif +} +#endif // CV_AVX + static void calcMinEigenVal( const Mat& _cov, Mat& _dst ) { int i, j; Size size = _cov.size(); -#if CV_SSE - volatile bool simd = checkHardwareSupport(CV_CPU_SSE); +#if CV_AVX + bool haveAvx = checkHardwareSupport(CV_CPU_AVX); +#endif +#if CV_SIMD128 + bool haveSimd = hasSIMD128(); #endif if( _cov.isContinuous() && _dst.isContinuous() ) @@ -66,45 +148,40 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst ) const float* cov = _cov.ptr(i); float* dst = _dst.ptr(i); j = 0; - #if CV_SSE - if( simd ) +#if CV_AVX + if( haveAvx ) { - __m128 half = _mm_set1_ps(0.5f); - for( ; j <= size.width - 4; j += 4 ) + __m256 half = _mm256_set1_ps(0.5f); + for( ; j <= size.width - 8; j += 8 ) { - __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x - __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x - __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x - __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x - __m128 a, b, c, t; - t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1 - c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x - 
b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3 - c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3 - a = _mm_movelh_ps(t, b); - b = _mm_movehl_ps(b, t); - a = _mm_mul_ps(a, half); - c = _mm_mul_ps(c, half); - t = _mm_sub_ps(a, c); - t = _mm_add_ps(_mm_mul_ps(t, t), _mm_mul_ps(b,b)); - a = _mm_sub_ps(_mm_add_ps(a, c), _mm_sqrt_ps(t)); - _mm_storeu_ps(dst + j, a); + __m256 v_a, v_b, v_c, v_t; + load_deinterleave(cov + j*3, v_a, v_b, v_c); + v_a = _mm256_mul_ps(v_a, half); + v_c = _mm256_mul_ps(v_c, half); + v_t = _mm256_sub_ps(v_a, v_c); + v_t = _mm256_add_ps(_mm256_mul_ps(v_b, v_b), _mm256_mul_ps(v_t, v_t)); + _mm256_storeu_ps(dst + j, _mm256_sub_ps(_mm256_add_ps(v_a, v_c), _mm256_sqrt_ps(v_t))); } } - #elif CV_NEON - float32x4_t v_half = vdupq_n_f32(0.5f); - for( ; j <= size.width - 4; j += 4 ) - { - float32x4x3_t v_src = vld3q_f32(cov + j * 3); - float32x4_t v_a = vmulq_f32(v_src.val[0], v_half); - float32x4_t v_b = v_src.val[1]; - float32x4_t v_c = vmulq_f32(v_src.val[2], v_half); +#endif // CV_AVX - float32x4_t v_t = vsubq_f32(v_a, v_c); - v_t = vmlaq_f32(vmulq_f32(v_t, v_t), v_b, v_b); - vst1q_f32(dst + j, vsubq_f32(vaddq_f32(v_a, v_c), cv_vsqrtq_f32(v_t))); +#if CV_SIMD128 + if( haveSimd ) + { + v_float32x4 half = v_setall_f32(0.5f); + for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + { + v_float32x4 v_a, v_b, v_c, v_t; + v_load_deinterleave(cov + j*3, v_a, v_b, v_c); + v_a *= half; + v_c *= half; + v_t = v_a - v_c; + v_t = v_muladd(v_b, v_b, (v_t * v_t)); + v_store(dst + j, (v_a + v_c) - v_sqrt(v_t)); + } } - #endif +#endif // CV_SIMD128 + for( ; j < size.width; j++ ) { float a = cov[j*3]*0.5f; @@ -120,8 +197,11 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) { int i, j; Size size = _cov.size(); -#if CV_SSE - volatile bool simd = checkHardwareSupport(CV_CPU_SSE); +#if CV_AVX + bool haveAvx = checkHardwareSupport(CV_CPU_AVX); +#endif +#if CV_SIMD128 + bool haveSimd = hasSIMD128(); #endif if( 
_cov.isContinuous() && _dst.isContinuous() ) @@ -136,42 +216,41 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) float* dst = _dst.ptr(i); j = 0; - #if CV_SSE - if( simd ) +#if CV_AVX + if( haveAvx ) { - __m128 k4 = _mm_set1_ps((float)k); - for( ; j <= size.width - 4; j += 4 ) + __m256 v_k = _mm256_set1_ps((float)k); + + for( ; j <= size.width - 8; j += 8 ) { - __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x - __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x - __m128 t2 = _mm_loadu_ps(cov + j*3 + 6); // a2 b2 c2 x - __m128 t3 = _mm_loadu_ps(cov + j*3 + 9); // a3 b3 c3 x - __m128 a, b, c, t; - t = _mm_unpacklo_ps(t0, t1); // a0 a1 b0 b1 - c = _mm_unpackhi_ps(t0, t1); // c0 c1 x x - b = _mm_unpacklo_ps(t2, t3); // a2 a3 b2 b3 - c = _mm_movelh_ps(c, _mm_unpackhi_ps(t2, t3)); // c0 c1 c2 c3 - a = _mm_movelh_ps(t, b); - b = _mm_movehl_ps(b, t); - t = _mm_add_ps(a, c); - a = _mm_sub_ps(_mm_mul_ps(a, c), _mm_mul_ps(b, b)); - t = _mm_mul_ps(_mm_mul_ps(k4, t), t); - a = _mm_sub_ps(a, t); - _mm_storeu_ps(dst + j, a); + __m256 v_a, v_b, v_c; + load_deinterleave(cov + j * 3, v_a, v_b, v_c); + + __m256 v_ac_bb = _mm256_sub_ps(_mm256_mul_ps(v_a, v_c), _mm256_mul_ps(v_b, v_b)); + __m256 v_ac = _mm256_add_ps(v_a, v_c); + __m256 v_dst = _mm256_sub_ps(v_ac_bb, _mm256_mul_ps(v_k, _mm256_mul_ps(v_ac, v_ac))); + _mm256_storeu_ps(dst + j, v_dst); } } - #elif CV_NEON - float32x4_t v_k = vdupq_n_f32((float)k); +#endif // CV_AVX - for( ; j <= size.width - 4; j += 4 ) +#if CV_SIMD128 + if( haveSimd ) { - float32x4x3_t v_src = vld3q_f32(cov + j * 3); - float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; - float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b); - float32x4_t v_ac = vaddq_f32(v_a, v_c); - vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac))); + v_float32x4 v_k = v_setall_f32((float)k); + + for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + { + v_float32x4 v_a, v_b, v_c; + 
v_load_deinterleave(cov + j * 3, v_a, v_b, v_c); + + v_float32x4 v_ac_bb = v_a * v_c - v_b * v_b; + v_float32x4 v_ac = v_a + v_c; + v_float32x4 v_dst = v_ac_bb - v_k * v_ac * v_ac; + v_store(dst + j, v_dst); + } } - #endif +#endif // CV_SIMD128 for( ; j < size.width; j++ ) { @@ -272,8 +351,11 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, if (tegra::useTegra() && tegra::cornerEigenValsVecs(src, eigenv, block_size, aperture_size, op_type, k, borderType)) return; #endif -#if CV_SSE2 - bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#if CV_AVX + bool haveAvx = checkHardwareSupport(CV_CPU_AVX); +#endif +#if CV_SIMD128 + bool haveSimd = hasSIMD128(); #endif int depth = src.depth(); @@ -309,47 +391,41 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size, const float* dydata = Dy.ptr(i); j = 0; - #if CV_NEON - for( ; j <= size.width - 4; j += 4 ) - { - float32x4_t v_dx = vld1q_f32(dxdata + j); - float32x4_t v_dy = vld1q_f32(dydata + j); - - float32x4x3_t v_dst; - v_dst.val[0] = vmulq_f32(v_dx, v_dx); - v_dst.val[1] = vmulq_f32(v_dx, v_dy); - v_dst.val[2] = vmulq_f32(v_dy, v_dy); - - vst3q_f32(cov_data + j * 3, v_dst); - } - #elif CV_SSE2 - if (haveSSE2) +#if CV_AVX + if( haveAvx ) { for( ; j <= size.width - 8; j += 8 ) { - __m128 v_dx_0 = _mm_loadu_ps(dxdata + j); - __m128 v_dx_1 = _mm_loadu_ps(dxdata + j + 4); - __m128 v_dy_0 = _mm_loadu_ps(dydata + j); - __m128 v_dy_1 = _mm_loadu_ps(dydata + j + 4); + __m256 v_dx = _mm256_loadu_ps(dxdata + j); + __m256 v_dy = _mm256_loadu_ps(dydata + j); - __m128 v_dx2_0 = _mm_mul_ps(v_dx_0, v_dx_0); - __m128 v_dxy_0 = _mm_mul_ps(v_dx_0, v_dy_0); - __m128 v_dy2_0 = _mm_mul_ps(v_dy_0, v_dy_0); - __m128 v_dx2_1 = _mm_mul_ps(v_dx_1, v_dx_1); - __m128 v_dxy_1 = _mm_mul_ps(v_dx_1, v_dy_1); - __m128 v_dy2_1 = _mm_mul_ps(v_dy_1, v_dy_1); + __m256 v_dst0, v_dst1, v_dst2; + v_dst0 = _mm256_mul_ps(v_dx, v_dx); + v_dst1 = _mm256_mul_ps(v_dx, v_dy); + v_dst2 = _mm256_mul_ps(v_dy, v_dy); - 
_mm_interleave_ps(v_dx2_0, v_dx2_1, v_dxy_0, v_dxy_1, v_dy2_0, v_dy2_1); - - _mm_storeu_ps(cov_data + j * 3, v_dx2_0); - _mm_storeu_ps(cov_data + j * 3 + 4, v_dx2_1); - _mm_storeu_ps(cov_data + j * 3 + 8, v_dxy_0); - _mm_storeu_ps(cov_data + j * 3 + 12, v_dxy_1); - _mm_storeu_ps(cov_data + j * 3 + 16, v_dy2_0); - _mm_storeu_ps(cov_data + j * 3 + 20, v_dy2_1); + store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2); } } - #endif +#endif // CV_AVX + +#if CV_SIMD128 + if( haveSimd ) + { + for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) + { + v_float32x4 v_dx = v_load(dxdata + j); + v_float32x4 v_dy = v_load(dydata + j); + + v_float32x4 v_dst0, v_dst1, v_dst2; + v_dst0 = v_dx * v_dx; + v_dst1 = v_dx * v_dy; + v_dst2 = v_dy * v_dy; + + v_store_interleave(cov_data + j * 3, v_dst0, v_dst1, v_dst2); + } + } +#endif // CV_SIMD128 for( ; j < size.width; j++ ) { @@ -751,13 +827,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); -#if CV_NEON || CV_SSE2 +#if CV_SIMD128 float factor_f = (float)factor; -#endif - -#if CV_SSE2 - volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); - __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f); + bool haveSimd = hasSIMD128(); + v_float32x4 v_factor = v_setall_f32(factor_f), v_m2 = v_setall_f32(-2.0f); #endif Size size = src.size(); @@ -773,30 +846,21 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord j = 0; -#if CV_SSE2 - if (haveSSE2) +#if CV_SIMD128 + if (haveSimd) { - for( ; j <= size.width - 4; j += 4 ) + for( ; j <= size.width - v_float32x4::nlanes; j += v_float32x4::nlanes ) { - __m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j)); - __m128 v_dy = _mm_loadu_ps((const float *)(dydata + j)); + v_float32x4 v_dx = v_load(dxdata + j); + v_float32x4 v_dy = v_load(dydata + j); - __m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), 
_mm_loadu_ps((const float *)(d2ydata + j))); - __m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j))); - __m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j))); - v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2)))); - _mm_storeu_ps(dstdata + j, v_s1); + v_float32x4 v_s1 = (v_dx * v_dx) * v_load(d2ydata + j); + v_float32x4 v_s2 = v_muladd((v_dy * v_dy), v_load(d2xdata + j), v_s1); + v_float32x4 v_s3 = v_muladd((v_dy * v_dx) * v_load(dxydata + j), v_m2, v_s2); + + v_store(dstdata + j, v_s3 * v_factor); } } -#elif CV_NEON - for( ; j <= size.width - 4; j += 4 ) - { - float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j); - float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); - v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy)); - v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2)); - vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f)); - } #endif for( ; j < size.width; j++ ) diff --git a/modules/imgproc/src/matchcontours.cpp b/modules/imgproc/src/matchcontours.cpp index 1a371677bf..2a0c5df330 100644 --- a/modules/imgproc/src/matchcontours.cpp +++ b/modules/imgproc/src/matchcontours.cpp @@ -50,6 +50,7 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou double eps = 1.e-5; double mmm; double result = 0; + bool anyA = false, anyB = false; HuMoments( moments(contour1), ma ); HuMoments( moments(contour2), mb ); @@ -62,6 +63,11 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou double ama = fabs( ma[i] ); double amb = fabs( mb[i] ); + if (ama > 0) + anyA = true; + if (amb > 0) + anyB = true; + if( ma[i] > 0 ) sma = 1; else if( ma[i] < 0 ) @@ -90,6 +96,11 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou double ama = fabs( ma[i] ); double amb = fabs( mb[i] ); + if (ama > 0) + anyA 
= true; + if (amb > 0) + anyB = true; + if( ma[i] > 0 ) sma = 1; else if( ma[i] < 0 ) @@ -118,6 +129,11 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou double ama = fabs( ma[i] ); double amb = fabs( mb[i] ); + if (ama > 0) + anyA = true; + if (amb > 0) + anyB = true; + if( ma[i] > 0 ) sma = 1; else if( ma[i] < 0 ) @@ -145,6 +161,12 @@ double cv::matchShapes(InputArray contour1, InputArray contour2, int method, dou CV_Error( CV_StsBadArg, "Unknown comparison method" ); } + //If anyA and anyB are both true, the result is correct. + //If anyA and anyB are both false, the distance is 0, perfect match. + //If only one is true, then it's a false 0 and return large error. + if (anyA != anyB) + result = DBL_MAX; + return result; } diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl index 187933ce0c..ba69085634 100644 --- a/modules/imgproc/src/opencl/clahe.cl +++ b/modules/imgproc/src/opencl/clahe.cl @@ -186,21 +186,13 @@ __kernel void calcLut(__global __const uchar * src, const int srcStep, #else clipped = smem[0]; #endif - - // broadcast evaluated value - - __local int totalClipped; - - if (tid == 0) - totalClipped = clipped; barrier(CLK_LOCAL_MEM_FENCE); // redistribute clipped samples evenly - - int redistBatch = totalClipped / 256; + int redistBatch = clipped / 256; tHistVal += redistBatch; - int residual = totalClipped - redistBatch * 256; + int residual = clipped - redistBatch * 256; int rStep = 256 / residual; if (rStep < 1) rStep = 1; diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index 9217558181..a84bd704eb 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -129,7 +129,7 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, int i_start = 0; int j_start = 0; -#if CV_SIMD128 && CV_SSE2 +#if CV_SIMD128 if(hasSIMD128()) { uchar *m_src; @@ -160,18 +160,13 @@ void spatialGradient( 
InputArray _src, OutputArray _dx, OutputArray _dy, n_dx = dx.ptr(i+1); n_dy = dy.ptr(i+1); - v_uint8x16 v_select_m = v_uint8x16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0xFF); - // Process rest of columns 16-column chunks at a time for ( j = 1; j < W - 16; j += 16 ) { // Load top row for 3x3 Sobel filter v_uint8x16 v_um = v_load(&p_src[j-1]); + v_uint8x16 v_un = v_load(&p_src[j]); v_uint8x16 v_up = v_load(&p_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_uint8x16 v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); @@ -185,10 +180,8 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, // Load second row for 3x3 Sobel filter v_um = v_load(&c_src[j-1]); + v_un = v_load(&c_src[j]); v_up = v_load(&c_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); @@ -201,10 +194,8 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, // Load third row for 3x3 Sobel filter v_um = v_load(&n_src[j-1]); + v_un = v_load(&n_src[j]); v_up = v_load(&n_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); @@ -236,10 +227,8 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, // Load fourth row for 3x3 Sobel filter v_um = v_load(&m_src[j-1]); + v_un = v_load(&m_src[j]); v_up = v_load(&m_src[j+1]); - // TODO: Replace _mm_slli_si128 with hal method - v_un = v_select(v_select_m, v_uint8x16(_mm_slli_si128(v_up.val, 
1)), - v_uint8x16(_mm_srli_si128(v_um.val, 1))); v_expand(v_um, v_um1, v_um2); v_expand(v_un, v_un1, v_un2); v_expand(v_up, v_up1, v_up2); diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index fc8cc1440b..3c8da0b450 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -207,6 +207,7 @@ type_dict = { "vector_KeyPoint" : { "j_type" : "MatOfKeyPoint", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, "vector_DMatch" : { "j_type" : "MatOfDMatch", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, "vector_Rect" : { "j_type" : "MatOfRect", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, + "vector_Rect2d" : { "j_type" : "MatOfRect2d", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, "vector_uchar" : { "j_type" : "MatOfByte", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, "vector_char" : { "j_type" : "MatOfByte", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, "vector_int" : { "j_type" : "MatOfInt", "jn_type" : "long", "jni_type" : "jlong", "jni_var" : "std::vector %(n)s", "suffix" : "J" }, @@ -261,6 +262,9 @@ type_dict = { "Rect" : { "j_type" : "Rect", "jn_args" : (("int", ".x"), ("int", ".y"), ("int", ".width"), ("int", ".height")), "jni_var" : "Rect %(n)s(%(n)s_x, %(n)s_y, %(n)s_width, %(n)s_height)", "jni_type" : "jdoubleArray", "suffix" : "IIII"}, + "Rect2d" : { "j_type" : "Rect2d", "jn_args" : (("double", ".x"), ("double", ".y"), ("double", ".width"), ("double", ".height")), + "jni_var" : "Rect %(n)s(%(n)s_x, %(n)s_y, %(n)s_width, %(n)s_height)", "jni_type" : "jdoubleArray", + "suffix" : "DDDD"}, "Size" : { "j_type" : "Size", "jn_args" : (("double", ".width"), ("double", ".height")), "jni_var" : "Size %(n)s((int)%(n)s_width, 
(int)%(n)s_height)", "jni_type" : "jdoubleArray", "suffix" : "DD"}, @@ -825,7 +829,7 @@ class ClassInfo(GeneralInfo): j_type = type_dict[ctype]['j_type'] elif ctype in ("Algorithm"): j_type = ctype - if j_type in ( "CvType", "Mat", "Point", "Point3", "Range", "Rect", "RotatedRect", "Scalar", "Size", "TermCriteria", "Algorithm" ): + if j_type in ( "CvType", "Mat", "Point", "Point3", "Range", "Rect", "Rect2d", "RotatedRect", "Scalar", "Size", "TermCriteria", "Algorithm" ): self.imports.add("org.opencv.core." + j_type) if j_type == 'String': self.imports.add("java.lang.String") diff --git a/modules/java/generator/src/cpp/converters.cpp b/modules/java/generator/src/cpp/converters.cpp index 3c771ce734..7ce7456525 100644 --- a/modules/java/generator/src/cpp/converters.cpp +++ b/modules/java/generator/src/cpp/converters.cpp @@ -92,6 +92,19 @@ void vector_Rect_to_Mat(std::vector& v_rect, Mat& mat) mat = Mat(v_rect, true); } +//vector_Rect2d + +void Mat_to_vector_Rect2d(Mat& mat, std::vector& v_rect) +{ + v_rect.clear(); + CHECK_MAT(mat.type()==CV_64FC4 && mat.cols==1); + v_rect = (std::vector) mat; +} + +void vector_Rect2d_to_Mat(std::vector& v_rect, Mat& mat) +{ + mat = Mat(v_rect, true); +} //vector_Point void Mat_to_vector_Point(Mat& mat, std::vector& v_point) diff --git a/modules/java/generator/src/cpp/converters.h b/modules/java/generator/src/cpp/converters.h index 257f9449e3..25077b10c2 100644 --- a/modules/java/generator/src/cpp/converters.h +++ b/modules/java/generator/src/cpp/converters.h @@ -19,6 +19,9 @@ void vector_char_to_Mat(std::vector& v_char, cv::Mat& mat); void Mat_to_vector_Rect(cv::Mat& mat, std::vector& v_rect); void vector_Rect_to_Mat(std::vector& v_rect, cv::Mat& mat); +void Mat_to_vector_Rect2d(cv::Mat& mat, std::vector& v_rect); +void vector_Rect2d_to_Mat(std::vector& v_rect, cv::Mat& mat); + void Mat_to_vector_Point(cv::Mat& mat, std::vector& v_point); void Mat_to_vector_Point2f(cv::Mat& mat, std::vector& v_point); diff --git 
a/modules/java/generator/src/java/utils+Converters.java b/modules/java/generator/src/java/utils+Converters.java index bd3bb64927..c0575a6665 100644 --- a/modules/java/generator/src/java/utils+Converters.java +++ b/modules/java/generator/src/java/utils+Converters.java @@ -14,6 +14,7 @@ import org.opencv.core.MatOfPoint3f; import org.opencv.core.Point; import org.opencv.core.Point3; import org.opencv.core.Rect; +import org.opencv.core.Rect2d; import org.opencv.core.DMatch; import org.opencv.core.KeyPoint; @@ -435,6 +436,42 @@ public class Converters { } } + public static Mat vector_Rect2d_to_Mat(List rs) { + Mat res; + int count = (rs != null) ? rs.size() : 0; + if (count > 0) { + res = new Mat(count, 1, CvType.CV_64FC4); + double[] buff = new double[4 * count]; + for (int i = 0; i < count; i++) { + Rect2d r = rs.get(i); + buff[4 * i] = r.x; + buff[4 * i + 1] = r.y; + buff[4 * i + 2] = r.width; + buff[4 * i + 3] = r.height; + } + res.put(0, 0, buff); + } else { + res = new Mat(); + } + return res; + } + + public static void Mat_to_vector_Rect2d(Mat m, List rs) { + if (rs == null) + throw new java.lang.IllegalArgumentException("rs == null"); + int count = m.rows(); + if (CvType.CV_64FC4 != m.type() || m.cols() != 1) + throw new java.lang.IllegalArgumentException( + "CvType.CV_64FC4 != m.type() || m.rows()!=1\n" + m); + + rs.clear(); + double[] buff = new double[4 * count]; + m.get(0, 0, buff); + for (int i = 0; i < count; i++) { + rs.add(new Rect2d(buff[4 * i], buff[4 * i + 1], buff[4 * i + 2], buff[4 * i + 3])); + } + } + public static Mat vector_KeyPoint_to_Mat(List kps) { Mat res; int count = (kps != null) ? kps.size() : 0; diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp index 99f5883d6e..3614a91298 100644 --- a/modules/ml/include/opencv2/ml.hpp +++ b/modules/ml/include/opencv2/ml.hpp @@ -1169,6 +1169,17 @@ public: Algorithm::load to load the pre-trained model. 
*/ CV_WRAP static Ptr create(); + + /** @brief Loads and creates a serialized RTree from a file + * + * Use RTree::save to serialize and store an RTree to disk. + * Load the RTree from this file again, by calling this function with the path to the file. + * Optionally specify the node for the file containing the classifier + * + * @param filepath path to serialized RTree + * @param nodeName name of node containing the classifier + */ + CV_WRAP static Ptr load(const String& filepath , const String& nodeName = String()); }; /****************************************************************************************\ diff --git a/modules/ml/src/lr.cpp b/modules/ml/src/lr.cpp index 57fcac7ef0..c3c314228b 100644 --- a/modules/ml/src/lr.cpp +++ b/modules/ml/src/lr.cpp @@ -579,7 +579,7 @@ void LogisticRegressionImpl::write(FileStorage& fs) const CV_Error(CV_StsBadArg,"file can't open. Check file path"); } writeFormat(fs); - string desc = "Logisitic Regression Classifier"; + string desc = "Logistic Regression Classifier"; fs<<"classifier"<params.alpha; fs<<"iterations"<params.num_iters; diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp index cab33ab33b..65fe6827a7 100644 --- a/modules/ml/src/rtrees.cpp +++ b/modules/ml/src/rtrees.cpp @@ -41,7 +41,6 @@ //M*/ #include "precomp.hpp" - namespace cv { namespace ml { @@ -422,6 +421,12 @@ Ptr RTrees::create() return makePtr(); } +//Function needed for Python and Java wrappers +Ptr RTrees::load(const String& filepath, const String& nodeName) +{ + return Algorithm::load(filepath, nodeName); +} + }} // End of file. 
diff --git a/modules/python/CMakeLists.txt b/modules/python/CMakeLists.txt index 1da5e329d6..d85a516791 100644 --- a/modules/python/CMakeLists.txt +++ b/modules/python/CMakeLists.txt @@ -2,16 +2,26 @@ # CMake file for python support # ---------------------------------------------------------------------------- -if((WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug") - OR BUILD_opencv_world - ) - ocv_module_disable(python2) - ocv_module_disable(python3) +if(ANDROID OR APPLE_FRAMEWORK OR WINRT) + set(__disable_python2 ON) + set(__disable_python3 ON) +elseif(BUILD_opencv_world OR (WIN32 AND CMAKE_BUILD_TYPE STREQUAL "Debug")) + if(NOT DEFINED BUILD_opencv_python2) + set(__disable_python2 ON) + endif() + if(NOT DEFINED BUILD_opencv_python3) + set(__disable_python3 ON) + endif() endif() -if(ANDROID OR APPLE_FRAMEWORK OR WINRT) - ocv_module_disable(python2) - ocv_module_disable(python3) +if(__disable_python2) + ocv_module_disable_(python2) +endif() +if(__disable_python3) + ocv_module_disable_(python3) +endif() +if(__disable_python2 AND __disable_python3) + return() endif() add_subdirectory(python2) diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index e69e933375..66e3733cec 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -417,34 +417,74 @@ typedef struct { UMat* um; } cv2_UMatWrapperObject; -// UMatWrapper init - takes one optional argument, that converts to Mat, that converts to UMat and stored inside. -// If no argument given - empty UMat created. +static bool PyObject_IsUMat(PyObject *o); + +// UMatWrapper init - try to map arguments from python to UMat constructors static int UMatWrapper_init(cv2_UMatWrapperObject *self, PyObject *args, PyObject *kwds) { - self->um = new UMat(); - - PyObject *np_mat = NULL; - - static char *kwlist[] = {new char[3], NULL}; - strcpy(kwlist[0], "mat"); - - if (! 
PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &np_mat)) - return -1; - - if (np_mat) { - Mat m; - if (!pyopencv_to(np_mat, m, ArgInfo("UMatWrapper.np_mat", 0))) - return -1; - - m.copyTo(*self->um); + self->um = NULL; + { + // constructor () + const char *kwlist[] = {NULL}; + if (PyArg_ParseTupleAndKeywords(args, kwds, "", (char**) kwlist)) { + self->um = new UMat(); + return 0; + } + PyErr_Clear(); } - - return 0; + { + // constructor (rows, cols, type) + const char *kwlist[] = {"rows", "cols", "type", NULL}; + int rows, cols, type; + if (PyArg_ParseTupleAndKeywords(args, kwds, "iii", (char**) kwlist, &rows, &cols, &type)) { + self->um = new UMat(rows, cols, type); + return 0; + } + PyErr_Clear(); + } + { + // constructor (m, rowRange, colRange) + const char *kwlist[] = {"m", "rowRange", "colRange", NULL}; + PyObject *obj = NULL; + int y0 = -1, y1 = -1, x0 = -1, x1 = -1; + if (PyArg_ParseTupleAndKeywords(args, kwds, "O(ii)|(ii)", (char**) kwlist, &obj, &y0, &y1, &x0, &x1) && PyObject_IsUMat(obj)) { + UMat *um_other = ((cv2_UMatWrapperObject *) obj)->um; + Range rowRange(y0, y1); + Range colRange = (x0 >= 0 && x1 >= 0) ? 
Range(x0, x1) : Range::all(); + self->um = new UMat(*um_other, rowRange, colRange); + return 0; + } + PyErr_Clear(); + } + { + // constructor (m) + const char *kwlist[] = {"m", NULL}; + PyObject *obj = NULL; + if (PyArg_ParseTupleAndKeywords(args, kwds, "O", (char**) kwlist, &obj)) { + // constructor (UMat m) + if (PyObject_IsUMat(obj)) { + UMat *um_other = ((cv2_UMatWrapperObject *) obj)->um; + self->um = new UMat(*um_other); + return 0; + } + // python specific constructor from array like object + Mat m; + if (pyopencv_to(obj, m, ArgInfo("UMatWrapper.np_mat", 0))) { + self->um = new UMat(); + m.copyTo(*self->um); + return 0; + } + } + PyErr_Clear(); + } + PyErr_SetString(PyExc_TypeError, "no matching UMat constructor found/supported"); + return -1; } static void UMatWrapper_dealloc(cv2_UMatWrapperObject* self) { - delete self->um; + if (self->um) + delete self->um; #if PY_MAJOR_VERSION >= 3 Py_TYPE(self)->tp_free((PyObject*)self); #else @@ -529,8 +569,12 @@ static PyTypeObject cv2_UMatWrapperType = { #endif }; +static bool PyObject_IsUMat(PyObject *o) { + return (o != NULL) && PyObject_TypeCheck(o, &cv2_UMatWrapperType); +} + static bool pyopencv_to(PyObject* o, UMat& um, const ArgInfo info) { - if (o != NULL && PyObject_TypeCheck(o, &cv2_UMatWrapperType) ) { + if (PyObject_IsUMat(o)) { um = *((cv2_UMatWrapperObject *) o)->um; return true; } diff --git a/modules/python/test/test.py b/modules/python/test/test.py index 5a66769a49..f4585471e8 100755 --- a/modules/python/test/test.py +++ b/modules/python/test/test.py @@ -123,6 +123,17 @@ class Hackathon244Tests(NewOpenCVTests): boost.getMaxDepth() # from ml::DTrees boost.isClassifier() # from ml::StatModel + def test_umat_construct(self): + data = np.random.random([512, 512]) + # UMat constructors + data_um = cv2.UMat(data) # from ndarray + data_sub_um = cv2.UMat(data_um, [0, 256], [0, 256]) # from UMat + data_dst_um = cv2.UMat(256, 256, cv2.CV_64F) # from size/type + + # simple test + cv2.multiply(data_sub_um, 2., 
dst=data_dst_um) + assert np.allclose(2. * data[:256, :256], data_dst_um.get()) + def test_umat_matching(self): img1 = self.get_sample("samples/data/right01.jpg") img2 = self.get_sample("samples/data/right02.jpg") diff --git a/modules/python/test/test_shape.py b/modules/python/test/test_shape.py new file mode 100644 index 0000000000..ad0f0be5d5 --- /dev/null +++ b/modules/python/test/test_shape.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +import cv2 + +from tests_common import NewOpenCVTests + +class shape_test(NewOpenCVTests): + + def test_computeDistance(self): + + a = self.get_sample('samples/data/shape_sample/1.png', cv2.IMREAD_GRAYSCALE); + b = self.get_sample('samples/data/shape_sample/2.png', cv2.IMREAD_GRAYSCALE); + + _, ca, _ = cv2.findContours(a, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_KCOS) + _, cb, _ = cv2.findContours(b, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_KCOS) + + hd = cv2.createHausdorffDistanceExtractor() + sd = cv2.createShapeContextDistanceExtractor() + + d1 = hd.computeDistance(ca[0], cb[0]) + d2 = sd.computeDistance(ca[0], cb[0]) + + self.assertAlmostEqual(d1, 26.4196891785, 3, "HausdorffDistanceExtractor") + self.assertAlmostEqual(d2, 0.25804194808, 3, "ShapeContextDistanceExtractor") diff --git a/modules/shape/src/haus_dis.cpp b/modules/shape/src/haus_dis.cpp index 6f372c416d..732f288946 100644 --- a/modules/shape/src/haus_dis.cpp +++ b/modules/shape/src/haus_dis.cpp @@ -138,6 +138,13 @@ float HausdorffDistanceExtractorImpl::computeDistance(InputArray contour1, Input set2.convertTo(set2, CV_32F); CV_Assert((set1.channels()==2) && (set1.cols>0)); CV_Assert((set2.channels()==2) && (set2.cols>0)); + + // Force vectors column-based + if (set1.dims > 1) + set1 = set1.reshape(2, 1); + if (set2.dims > 1) + set2 = set2.reshape(2, 1); + return std::max( _apply(set1, set2, distanceFlag, rankProportion), _apply(set2, set1, distanceFlag, rankProportion) ); } diff --git a/modules/shape/src/precomp.cpp b/modules/shape/src/precomp.cpp deleted file mode 
100644 index 730edbb63d..0000000000 --- a/modules/shape/src/precomp.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" - -/* End of file. */ diff --git a/modules/shape/src/sc_dis.cpp b/modules/shape/src/sc_dis.cpp index 89c6d91255..bbda11cff0 100644 --- a/modules/shape/src/sc_dis.cpp +++ b/modules/shape/src/sc_dis.cpp @@ -202,6 +202,13 @@ float ShapeContextDistanceExtractorImpl::computeDistance(InputArray contour1, In CV_Assert((set1.channels()==2) && (set1.cols>0)); CV_Assert((set2.channels()==2) && (set2.cols>0)); + + // Force vectors column-based + if (set1.dims > 1) + set1 = set1.reshape(2, 1); + if (set2.dims > 1) + set2 = set2.reshape(2, 1); + if (imageAppearanceWeight!=0) { CV_Assert((!image1.empty()) && (!image2.empty())); diff --git a/modules/shape/test/test_shape.cpp b/modules/shape/test/test_shape.cpp index 0601594f08..97a621e4b5 100644 --- a/modules/shape/test/test_shape.cpp +++ b/modules/shape/test/test_shape.cpp @@ -299,3 +299,22 @@ TEST(Hauss, regression) ShapeBaseTest test(NSN_val, NP_val, CURRENT_MAX_ACCUR_val); test.safe_run(); } + +TEST(computeDistance, regression_4976) +{ + Mat a = imread(cvtest::findDataFile("shape/samples/1.png"), 0); + Mat b = imread(cvtest::findDataFile("shape/samples/2.png"), 0); + + vector > ca,cb; + findContours(a, ca, cv::RETR_CCOMP, cv::CHAIN_APPROX_TC89_KCOS); + findContours(b, cb, cv::RETR_CCOMP, cv::CHAIN_APPROX_TC89_KCOS); + + Ptr hd = createHausdorffDistanceExtractor(); + Ptr sd = createShapeContextDistanceExtractor(); + + double d1 = 
hd->computeDistance(ca[0],cb[0]); + double d2 = sd->computeDistance(ca[0],cb[0]); + + EXPECT_NEAR(d1, 26.4196891785, 1e-3) << "HausdorffDistanceExtractor"; + EXPECT_NEAR(d2, 0.25804194808, 1e-3) << "ShapeContextDistanceExtractor"; +} diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt index 76c7bc8489..0e4f39a99d 100644 --- a/modules/stitching/CMakeLists.txt +++ b/modules/stitching/CMakeLists.txt @@ -5,7 +5,7 @@ if(HAVE_CUDA) endif() set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d") -if(BUILD_SHARED_LIBS AND BUILD_opencv_world) +if(BUILD_SHARED_LIBS AND BUILD_opencv_world AND OPENCV_WORLD_EXCLUDE_EXTRA_MODULES) set(STITCHING_CONTRIB_DEPS "") endif() ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index c573905c6f..a1981926ff 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -604,12 +604,17 @@ void dumpOpenCLDevice(); void parseCustomOptions(int argc, char **argv); -#define CV_TEST_MAIN(resourcesubdir, ...) \ +#define CV_TEST_INIT0_NOOP (void)0 + +#define CV_TEST_MAIN(resourcesubdir, ...) CV_TEST_MAIN_EX(resourcesubdir, NOOP, __VA_ARGS__) + +#define CV_TEST_MAIN_EX(resourcesubdir, INIT0, ...) 
\ int main(int argc, char **argv) \ { \ using namespace cvtest; \ TS* ts = TS::ptr(); \ ts->init(resourcesubdir); \ + __CV_TEST_EXEC_ARGS(CV_TEST_INIT0_ ## INIT0) \ ::testing::InitGoogleTest(&argc, argv); \ cvtest::printVersionInfo(); \ TEST_DUMP_OCL_INFO \ diff --git a/modules/ts/include/opencv2/ts/cuda_test.hpp b/modules/ts/include/opencv2/ts/cuda_test.hpp index b459bb358f..2780927304 100644 --- a/modules/ts/include/opencv2/ts/cuda_test.hpp +++ b/modules/ts/include/opencv2/ts/cuda_test.hpp @@ -351,8 +351,10 @@ namespace cv { namespace cuda #ifdef HAVE_CUDA -#define CV_CUDA_TEST_MAIN(resourcesubdir) \ - CV_TEST_MAIN(resourcesubdir, cvtest::parseCudaDeviceOptions(argc, argv), cvtest::printCudaInfo(), cv::setUseOptimized(false)) +#define CV_TEST_INIT0_CUDA cvtest::parseCudaDeviceOptions(argc, argv), cvtest::printCudaInfo(), cv::setUseOptimized(false) + +#define CV_CUDA_TEST_MAIN(resourcesubdir, ...) \ + CV_TEST_MAIN_EX(resourcesubdir, CUDA, __VA_ARGS__) #else // HAVE_CUDA diff --git a/modules/ts/misc/run_utils.py b/modules/ts/misc/run_utils.py index 8740aa7855..2e9a66f8bd 100644 --- a/modules/ts/misc/run_utils.py +++ b/modules/ts/misc/run_utils.py @@ -193,7 +193,7 @@ class CMakeCache: self.tests_dir = os.path.normpath(path) def read(self, path, fname): - rx = re.compile(r'^opencv_(\w+)_SOURCE_DIR:STATIC=(.*)$') + rx = re.compile(r'^OPENCV_MODULE_opencv_(\w+)_LOCATION:INTERNAL=(.*)$') module_paths = {} # name -> path with open(fname, "rt") as cachefile: for l in cachefile.readlines(): diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt index fb64894899..8a5b9d7a36 100644 --- a/modules/videoio/CMakeLists.txt +++ b/modules/videoio/CMakeLists.txt @@ -208,7 +208,11 @@ if(IOS) ${CMAKE_CURRENT_LIST_DIR}/src/cap_ios_abstract_camera.mm ${CMAKE_CURRENT_LIST_DIR}/src/cap_ios_photo_camera.mm ${CMAKE_CURRENT_LIST_DIR}/src/cap_ios_video_camera.mm) + list(APPEND VIDEOIO_LIBRARIES "-framework Accelerate" "-framework AVFoundation" "-framework CoreGraphics" 
"-framework CoreImage" "-framework CoreMedia" "-framework CoreVideo" "-framework QuartzCore" "-framework AssetsLibrary") + if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + list(APPEND VIDEOIO_LIBRARIES "-framework UIKit") + endif() endif() if(WIN32) diff --git a/modules/videoio/include/opencv2/videoio/cap_ios.h b/modules/videoio/include/opencv2/videoio/cap_ios.h index c90ad2e73e..0691420cf7 100644 --- a/modules/videoio/include/opencv2/videoio/cap_ios.h +++ b/modules/videoio/include/opencv2/videoio/cap_ios.h @@ -39,7 +39,7 @@ @class CvAbstractCamera; -@interface CvAbstractCamera : NSObject +CV_EXPORTS @interface CvAbstractCamera : NSObject { UIDeviceOrientation currentDeviceOrientation; @@ -87,7 +87,7 @@ @class CvVideoCamera; -@protocol CvVideoCameraDelegate +CV_EXPORTS @protocol CvVideoCameraDelegate #ifdef __cplusplus // delegate method for processing image frames @@ -96,7 +96,7 @@ @end -@interface CvVideoCamera : CvAbstractCamera +CV_EXPORTS @interface CvVideoCamera : CvAbstractCamera { AVCaptureVideoDataOutput *videoDataOutput; @@ -129,14 +129,14 @@ @class CvPhotoCamera; -@protocol CvPhotoCameraDelegate +CV_EXPORTS @protocol CvPhotoCameraDelegate - (void)photoCamera:(CvPhotoCamera*)photoCamera capturedImage:(UIImage *)image; - (void)photoCameraCancel:(CvPhotoCamera*)photoCamera; @end -@interface CvPhotoCamera : CvAbstractCamera +CV_EXPORTS @interface CvPhotoCamera : CvAbstractCamera { AVCaptureStillImageOutput *stillImageOutput; } diff --git a/modules/viz/CMakeLists.txt b/modules/viz/CMakeLists.txt index 9fd0301635..a22f52fd87 100644 --- a/modules/viz/CMakeLists.txt +++ b/modules/viz/CMakeLists.txt @@ -2,12 +2,14 @@ if(NOT WITH_VTK OR NOT DEFINED HAVE_VTK OR NOT HAVE_VTK) ocv_module_disable(viz) endif() -include(${VTK_USE_FILE}) set(the_description "Viz") -ocv_define_module(viz opencv_core ${VTK_LIBRARIES} WRAP python) +ocv_define_module(viz opencv_core WRAP python) + +include(${VTK_USE_FILE}) +ocv_target_link_libraries(${the_module} ${VTK_LIBRARIES}) if(APPLE AND 
BUILD_opencv_viz) - ocv_target_link_libraries(opencv_viz "-framework Cocoa") + ocv_target_link_libraries(${the_module} "-framework Cocoa") endif() if(TARGET opencv_test_viz) diff --git a/modules/world/CMakeLists.txt b/modules/world/CMakeLists.txt index db8928d69c..dde793fa00 100644 --- a/modules/world/CMakeLists.txt +++ b/modules/world/CMakeLists.txt @@ -2,7 +2,7 @@ set(the_description "All the selected OpenCV modules in a single binary") set(OPENCV_MODULE_IS_PART_OF_WORLD FALSE) set(BUILD_opencv_world_INIT OFF) -if(APPLE_FRAMEWORK OR NOT BUILD_SHARED_LIBS) +if(NOT BUILD_SHARED_LIBS) set(OPENCV_MODULE_TYPE STATIC) set(OPENCV_WORLD_FLAGS_PROPERTY STATIC_LIBRARY_FLAGS) else() @@ -11,6 +11,11 @@ endif() function(include_one_module m) include("${OPENCV_MODULE_${m}_LOCATION}/CMakeLists.txt") + foreach(var + CMAKE_CXX_FLAGS CMAKE_C_FLAGS # Propagate warnings settings + ) + set(${var} "${${var}}" PARENT_SCOPE) + endforeach() endfunction() if(NOT OPENCV_INITIAL_PASS) @@ -35,12 +40,14 @@ ocv_add_module(world opencv_core) set(headers_list "HEADERS") set(sources_list "SOURCES") set(link_deps "") -foreach(m ${OPENCV_MODULE_${the_module}_DEPS}) +foreach(m ${OPENCV_MODULE_${the_module}_DEPS} opencv_world) if(OPENCV_MODULE_${m}_IS_PART_OF_WORLD) set(headers_list "${headers_list};${OPENCV_MODULE_${m}_HEADERS}") set(sources_list "${sources_list};${OPENCV_MODULE_${m}_SOURCES}") endif() - set(link_deps "${link_deps};${OPENCV_MODULE_${m}_LINK_DEPS}") + if(NOT " ${OPENCV_MODULE_${m}_LINK_DEPS}" STREQUAL " ") + list(APPEND link_deps ${OPENCV_MODULE_${m}_LINK_DEPS}) + endif() endforeach() ocv_glob_module_sources(${headers_list} ${sources_list}) diff --git a/platforms/ios/Info.Dynamic.plist.in b/platforms/ios/Info.Dynamic.plist.in new file mode 100644 index 0000000000..4ff68cca04 --- /dev/null +++ b/platforms/ios/Info.Dynamic.plist.in @@ -0,0 +1,28 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + opencv2 + CFBundleName + ${OPENCV_APPLE_BUNDLE_NAME} + CFBundleIdentifier + 
${OPENCV_APPLE_BUNDLE_ID} + CFBundleVersion + ${OPENCV_LIBVERSION} + CFBundleShortVersionString + ${OPENCV_LIBVERSION} + CFBundleSignature + ???? + CFBundlePackageType + FMWK + CFBundleSupportedPlatforms + + iPhoneOS + + MinimumOSVersion + 8.0 + + \ No newline at end of file diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py index f8886b6f93..2770a5a102 100644 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -23,6 +23,8 @@ Script will create , if it's missing, and a few its subdirectories: The script should handle minor OpenCV updates efficiently - it does not recompile the library from scratch each time. However, opencv2.framework directory is erased and recreated on each run. + +Adding --dynamic parameter will build opencv2.framework as App Store dynamic framework. Only iOS 8+ versions are supported. """ from __future__ import print_function @@ -43,7 +45,7 @@ def getXCodeMajor(): return 0 class Builder: - def __init__(self, opencv, contrib, exclude, targets): + def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, targets): self.opencv = os.path.abspath(opencv) self.contrib = None if contrib: @@ -52,11 +54,18 @@ class Builder: self.contrib = os.path.abspath(modpath) else: print("Note: contrib repository is bad - modules subfolder not found", file=sys.stderr) + self.dynamic = dynamic + self.bitcodedisabled = bitcodedisabled self.exclude = exclude self.targets = targets def getBD(self, parent, t): - res = os.path.join(parent, '%s-%s' % t) + + if len(t[0]) == 1: + res = os.path.join(parent, 'build-%s-%s' % (t[0][0].lower(), t[1].lower())) + else: + res = os.path.join(parent, 'build-%s' % t[1].lower()) + if not os.path.isdir(res): os.makedirs(res) return os.path.abspath(res) @@ -70,17 +79,32 @@ class Builder: xcode_ver = getXCodeMajor() - for t in self.targets: + if self.dynamic: + alltargets = self.targets + else: + # if we are building a static library, we must build each architecture 
separately + alltargets = [] + + for t in self.targets: + for at in t[0]: + current = ( [at], t[1] ) + + alltargets.append(current) + + for t in alltargets: mainBD = self.getBD(mainWD, t) dirs.append(mainBD) + cmake_flags = [] if self.contrib: cmake_flags.append("-DOPENCV_EXTRA_MODULES_PATH=%s" % self.contrib) - if xcode_ver >= 7 and t[1] == 'iPhoneOS': + if xcode_ver >= 7 and t[1] == 'iPhoneOS' and self.bitcodedisabled == False: cmake_flags.append("-DCMAKE_C_FLAGS=-fembed-bitcode") cmake_flags.append("-DCMAKE_CXX_FLAGS=-fembed-bitcode") self.buildOne(t[0], t[1], mainBD, cmake_flags) - self.mergeLibs(mainBD) + + if self.dynamic == False: + self.mergeLibs(mainBD) self.makeFramework(outdir, dirs) def build(self, outdir): @@ -97,13 +121,26 @@ class Builder: return None def getCMakeArgs(self, arch, target): - args = [ - "cmake", - "-GXcode", - "-DAPPLE_FRAMEWORK=ON", - "-DCMAKE_INSTALL_PREFIX=install", - "-DCMAKE_BUILD_TYPE=Release", - ] + + if self.dynamic: + args = [ + "cmake", + "-GXcode", + "-DAPPLE_FRAMEWORK=ON", + "-DCMAKE_INSTALL_PREFIX=install", + "-DCMAKE_BUILD_TYPE=Release", + "-DBUILD_SHARED_LIBS=ON", + "-DCMAKE_MACOSX_BUNDLE=ON", + "-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO", + ] + else: + args = [ + "cmake", + "-GXcode", + "-DAPPLE_FRAMEWORK=ON", + "-DCMAKE_INSTALL_PREFIX=install", + "-DCMAKE_BUILD_TYPE=Release", + ] if len(self.exclude) > 0: args += ["-DBUILD_opencv_world=OFF"] @@ -111,16 +148,38 @@ class Builder: return args - def getBuildCommand(self, arch, target): - buildcmd = [ - "xcodebuild", - "IPHONEOS_DEPLOYMENT_TARGET=6.0", - "ARCHS=%s" % arch, - "-sdk", target.lower(), - "-configuration", "Release", - "-parallelizeTargets", - "-jobs", "4" - ] + def getBuildCommand(self, archs, target): + + if self.dynamic: + buildcmd = [ + "xcodebuild", + "IPHONEOS_DEPLOYMENT_TARGET=8.0", + "ONLY_ACTIVE_ARCH=NO", + ] + + for arch in archs: + buildcmd.append("-arch") + buildcmd.append(arch.lower()) + + buildcmd += [ + "-sdk", target.lower(), + 
"-configuration", "Release", + "-parallelizeTargets", + "-jobs", "4", + "-target","ALL_BUILD", + ] + else: + arch = ";".join(archs) + buildcmd = [ + "xcodebuild", + "IPHONEOS_DEPLOYMENT_TARGET=6.0", + "ARCHS=%s" % arch, + "-sdk", target.lower(), + "-configuration", "Release", + "-parallelizeTargets", + "-jobs", "4" + ] + return buildcmd def getInfoPlist(self, builddirs): @@ -131,11 +190,12 @@ class Builder: toolchain = self.getToolchain(arch, target) cmakecmd = self.getCMakeArgs(arch, target) + \ (["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else []) - if arch.startswith("armv") or arch.startswith("arm64"): + if target.lower().startswith("iphoneos"): cmakecmd.append("-DENABLE_NEON=ON") cmakecmd.append(self.opencv) cmakecmd.extend(cmakeargs) execute(cmakecmd, cwd = builddir) + # Clean and build clean_dir = os.path.join(builddir, "install") if os.path.isdir(clean_dir): @@ -153,7 +213,6 @@ class Builder: def makeFramework(self, outdir, builddirs): name = "opencv2" - libname = "libopencv_merged.a" # set the current dir to the dst root framework_dir = os.path.join(outdir, "%s.framework" % name) @@ -161,7 +220,12 @@ class Builder: shutil.rmtree(framework_dir) os.makedirs(framework_dir) - dstdir = os.path.join(framework_dir, "Versions", "A") + if self.dynamic: + dstdir = framework_dir + libname = "opencv2.framework/opencv2" + else: + dstdir = os.path.join(framework_dir, "Versions", "A") + libname = "libopencv_merged.a" # copy headers from one of build folders shutil.copytree(os.path.join(builddirs[0], "install", "include", "opencv2"), os.path.join(dstdir, "Headers")) @@ -174,22 +238,27 @@ class Builder: print("Creating universal library from:\n\t%s" % "\n\t".join(libs), file=sys.stderr) execute(lipocmd) - # copy Info.plist - resdir = os.path.join(dstdir, "Resources") - os.makedirs(resdir) - shutil.copyfile(self.getInfoPlist(builddirs), os.path.join(resdir, "Info.plist")) + # dynamic framework has different structure, just copy the Plist directly + if 
self.dynamic: + resdir = dstdir + shutil.copyfile(self.getInfoPlist(builddirs), os.path.join(resdir, "Info.plist")) + else: + # copy Info.plist + resdir = os.path.join(dstdir, "Resources") + os.makedirs(resdir) + shutil.copyfile(self.getInfoPlist(builddirs), os.path.join(resdir, "Info.plist")) - # make symbolic links - links = [ - (["A"], ["Versions", "Current"]), - (["Versions", "Current", "Headers"], ["Headers"]), - (["Versions", "Current", "Resources"], ["Resources"]), - (["Versions", "Current", name], [name]) - ] - for l in links: - s = os.path.join(*l[0]) - d = os.path.join(framework_dir, *l[1]) - os.symlink(s, d) + # make symbolic links + links = [ + (["A"], ["Versions", "Current"]), + (["Versions", "Current", "Headers"], ["Headers"]), + (["Versions", "Current", "Resources"], ["Resources"]), + (["Versions", "Current", name], [name]) + ] + for l in links: + s = os.path.join(*l[0]) + d = os.path.join(framework_dir, *l[1]) + os.symlink(s, d) class iOSBuilder(Builder): @@ -198,6 +267,8 @@ class iOSBuilder(Builder): return toolchain def getCMakeArgs(self, arch, target): + arch = ";".join(arch) + args = Builder.getCMakeArgs(self, arch, target) args = args + [ '-DIOS_ARCH=%s' % arch @@ -212,18 +283,16 @@ if __name__ == "__main__": parser.add_argument('--opencv', metavar='DIR', default=folder, help='folder with opencv repository (default is "../.." 
relative to script location)') parser.add_argument('--contrib', metavar='DIR', default=None, help='folder with opencv_contrib repository (default is "None" - build only main framework)') parser.add_argument('--without', metavar='MODULE', default=[], action='append', help='OpenCV modules to exclude from the framework') + parser.add_argument('--dynamic', default=False, action='store_true', help='build dynamic framework (default is "False" - builds static framework)') + parser.add_argument('--disable-bitcode', default=False, dest='bitcodedisabled', action='store_true', help='disable bitcode (enabled by default)') args = parser.parse_args() - b = iOSBuilder(args.opencv, args.contrib, args.without, + b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, [ - ("armv7", "iPhoneOS"), - ("arm64", "iPhoneOS"), + (["armv7", "arm64"], "iPhoneOS"), ] if os.environ.get('BUILD_PRECOMMIT', None) else [ - ("armv7", "iPhoneOS"), - ("armv7s", "iPhoneOS"), - ("arm64", "iPhoneOS"), - ("i386", "iPhoneSimulator"), - ("x86_64", "iPhoneSimulator"), + (["armv7", "armv7s", "arm64"], "iPhoneOS"), + (["i386", "x86_64"], "iPhoneSimulator"), ]) b.build(args.out) diff --git a/platforms/ios/cmake/Modules/Platform/iOS.cmake b/platforms/ios/cmake/Modules/Platform/iOS.cmake index 63cf1d89e7..6915adfbc7 100644 --- a/platforms/ios/cmake/Modules/Platform/iOS.cmake +++ b/platforms/ios/cmake/Modules/Platform/iOS.cmake @@ -38,6 +38,14 @@ set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") +# Additional flags for dynamic framework +if (APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + set (CMAKE_MODULE_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") + set (CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") + set 
(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG 1) + set (CMAKE_INSTALL_NAME_DIR "@rpath") +endif() + # Hidden visibilty is required for cxx on iOS set (no_warn "-Wno-unused-function -Wno-overloaded-virtual") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${no_warn}") @@ -144,7 +152,6 @@ set (CMAKE_C_CREATE_MACOSX_FRAMEWORK set (CMAKE_CXX_CREATE_MACOSX_FRAMEWORK " -o -install_name ") - # Add the install directory of the running cmake to the search directories # CMAKE_ROOT is CMAKE_INSTALL_PREFIX/share/cmake, so we need to go two levels up get_filename_component (_CMAKE_INSTALL_DIR "${CMAKE_ROOT}" PATH) diff --git a/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake b/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake index 24dab91ff6..44ad57ce74 100644 --- a/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake +++ b/platforms/ios/cmake/Toolchains/common-ios-toolchain.cmake @@ -86,24 +86,52 @@ endif() set(CMAKE_MACOSX_BUNDLE YES) set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO") -set(CMAKE_OSX_ARCHITECTURES "${IOS_ARCH}" CACHE INTERNAL "Build architecture for iOS" FORCE) +if(APPLE_FRAMEWORK AND NOT BUILD_SHARED_LIBS) + set(CMAKE_OSX_ARCHITECTURES "${IOS_ARCH}" CACHE INTERNAL "Build architecture for iOS" FORCE) +endif() if(NOT __IN_TRY_COMPILE) set(_xcodebuild_wrapper "${CMAKE_BINARY_DIR}/xcodebuild_wrapper") if(NOT CMAKE_MAKE_PROGRAM STREQUAL _xcodebuild_wrapper) - set(_xcodebuild_wrapper_tmp "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/xcodebuild_wrapper") - file(WRITE "${_xcodebuild_wrapper_tmp}" "#!/bin/sh + if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + set(_xcodebuild_wrapper_tmp "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/xcodebuild_wrapper") + file(WRITE "${_xcodebuild_wrapper_tmp}" "#!/bin/sh +${CMAKE_MAKE_PROGRAM} IPHONEOS_DEPLOYMENT_TARGET=8.0 -sdk ${CMAKE_OSX_SYSROOT} \$*") + # Make executable + file(COPY "${_xcodebuild_wrapper_tmp}" DESTINATION ${CMAKE_BINARY_DIR} FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ 
WORLD_EXECUTE) + set(CMAKE_MAKE_PROGRAM "${_xcodebuild_wrapper}" CACHE INTERNAL "" FORCE) + else() + set(_xcodebuild_wrapper_tmp "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/xcodebuild_wrapper") + file(WRITE "${_xcodebuild_wrapper_tmp}" "#!/bin/sh ${CMAKE_MAKE_PROGRAM} IPHONEOS_DEPLOYMENT_TARGET=6.0 ARCHS=${IOS_ARCH} -sdk ${CMAKE_OSX_SYSROOT} \$*") - # Make executable - file(COPY "${_xcodebuild_wrapper_tmp}" DESTINATION ${CMAKE_BINARY_DIR} FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) - set(CMAKE_MAKE_PROGRAM "${_xcodebuild_wrapper}" CACHE INTERNAL "" FORCE) + # Make executable + file(COPY "${_xcodebuild_wrapper_tmp}" DESTINATION ${CMAKE_BINARY_DIR} FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) + set(CMAKE_MAKE_PROGRAM "${_xcodebuild_wrapper}" CACHE INTERNAL "" FORCE) + endif() endif() endif() # Standard settings set(CMAKE_SYSTEM_NAME iOS) -set(CMAKE_SYSTEM_VERSION 6.0) -set(CMAKE_SYSTEM_PROCESSOR "${IOS_ARCH}") + +# Apple Framework settings +if(APPLE_FRAMEWORK AND BUILD_SHARED_LIBS) + set(CMAKE_SYSTEM_VERSION 8.0) + set(CMAKE_C_SIZEOF_DATA_PTR 4) + set(CMAKE_CXX_SIZEOF_DATA_PTR 4) +else() + set(CMAKE_SYSTEM_VERSION 6.0) + set(CMAKE_SYSTEM_PROCESSOR "${IOS_ARCH}") + + if(AARCH64 OR X86_64) + set(CMAKE_C_SIZEOF_DATA_PTR 8) + set(CMAKE_CXX_SIZEOF_DATA_PTR 8) + else() + set(CMAKE_C_SIZEOF_DATA_PTR 4) + set(CMAKE_CXX_SIZEOF_DATA_PTR 4) + endif() +endif() + # Include extra modules for the iOS platform files set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/platforms/ios/cmake/Modules") @@ -112,13 +140,6 @@ include(CMakeForceCompiler) #CMAKE_FORCE_C_COMPILER (clang GNU) #CMAKE_FORCE_CXX_COMPILER (clang++ GNU) -if(AARCH64 OR X86_64) - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) -endif() set(CMAKE_C_HAS_ISYSROOT 1) 
set(CMAKE_CXX_HAS_ISYSROOT 1) set(CMAKE_C_COMPILER_ABI ELF) @@ -134,4 +155,4 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -toolchain_save_config(IOS_ARCH) +toolchain_save_config(IOS_ARCH) \ No newline at end of file diff --git a/platforms/osx/build_framework.py b/platforms/osx/build_framework.py index 2d39be5f9b..64e73c5490 100644 --- a/platforms/osx/build_framework.py +++ b/platforms/osx/build_framework.py @@ -15,10 +15,10 @@ class OSXBuilder(Builder): def getToolchain(self, arch, target): return None - def getBuildCommand(self, arch, target): + def getBuildCommand(self, archs, target): buildcmd = [ "xcodebuild", - "ARCHS=%s" % arch, + "ARCHS=%s" % archs[0], "-sdk", target.lower(), "-configuration", "Release", "-parallelizeTargets", @@ -39,8 +39,8 @@ if __name__ == "__main__": parser.add_argument('--without', metavar='MODULE', default=[], action='append', help='OpenCV modules to exclude from the framework') args = parser.parse_args() - b = OSXBuilder(args.opencv, args.contrib, args.without, + b = OSXBuilder(args.opencv, args.contrib, False, False, args.without, [ - ("x86_64", "MacOSX") + (["x86_64"], "MacOSX") ]) b.build(args.out) diff --git a/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp b/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp index b24451de35..2535782087 100644 --- a/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp +++ b/samples/cpp/tutorial_code/ImgProc/BasicLinearTransforms.cpp @@ -8,51 +8,60 @@ #include "opencv2/highgui.hpp" #include +using namespace std; using namespace cv; -double alpha; /**< Simple contrast control */ -int beta; /**< Simple brightness control */ - /** * @function main * @brief Main function */ int main( int, char** argv ) { - /// Read image given by user - Mat image = imread( argv[1] ); - Mat new_image = Mat::zeros( image.size(), image.type() ); + //! 
[basic-linear-transform-parameters] + double alpha = 1.0; /*< Simple contrast control */ + int beta = 0; /*< Simple brightness control */ + //! [basic-linear-transform-parameters] - /// Initialize values - std::cout<<" Basic Linear Transforms "<>alpha; - std::cout<<"* Enter the beta value [0-100]: "; std::cin>>beta; + /// Read image given by user + //! [basic-linear-transform-load] + Mat image = imread( argv[1] ); + //! [basic-linear-transform-load] + //! [basic-linear-transform-output] + Mat new_image = Mat::zeros( image.size(), image.type() ); + //! [basic-linear-transform-output] + /// Initialize values + cout << " Basic Linear Transforms " << endl; + cout << "-------------------------" << endl; + cout << "* Enter the alpha value [1.0-3.0]: "; cin >> alpha; + cout << "* Enter the beta value [0-100]: "; cin >> beta; - /// Do the operation new_image(i,j) = alpha*image(i,j) + beta - /// Instead of these 'for' loops we could have used simply: - /// image.convertTo(new_image, -1, alpha, beta); - /// but we wanted to show you how to access the pixels :) - for( int y = 0; y < image.rows; y++ ) - { for( int x = 0; x < image.cols; x++ ) - { for( int c = 0; c < 3; c++ ) - { - new_image.at(y,x)[c] = saturate_cast( alpha*( image.at(y,x)[c] ) + beta ); - } - } - } + /// Do the operation new_image(i,j) = alpha*image(i,j) + beta + /// Instead of these 'for' loops we could have used simply: + /// image.convertTo(new_image, -1, alpha, beta); + /// but we wanted to show you how to access the pixels :) + //! [basic-linear-transform-operation] + for( int y = 0; y < image.rows; y++ ) { + for( int x = 0; x < image.cols; x++ ) { + for( int c = 0; c < 3; c++ ) { + new_image.at(y,x)[c] = + saturate_cast( alpha*( image.at(y,x)[c] ) + beta ); + } + } + } + //! [basic-linear-transform-operation] - /// Create Windows - namedWindow("Original Image", 1); - namedWindow("New Image", 1); + //! 
[basic-linear-transform-display] + /// Create Windows + namedWindow("Original Image", WINDOW_AUTOSIZE); + namedWindow("New Image", WINDOW_AUTOSIZE); - /// Show stuff - imshow("Original Image", image); - imshow("New Image", new_image); + /// Show stuff + imshow("Original Image", image); + imshow("New Image", new_image); - - /// Wait until user press some key - waitKey(); - return 0; + /// Wait until user press some key + waitKey(); + //! [basic-linear-transform-display] + return 0; } diff --git a/samples/cpp/tutorial_code/ImgProc/HitMiss.cpp b/samples/cpp/tutorial_code/ImgProc/HitMiss.cpp index 0463aabe39..806eec489e 100644 --- a/samples/cpp/tutorial_code/ImgProc/HitMiss.cpp +++ b/samples/cpp/tutorial_code/ImgProc/HitMiss.cpp @@ -15,7 +15,7 @@ int main(){ 0, 255, 0, 255, 0, 0, 255, 0, 0, 255, 255, 255, 0, 0, 0, 0); - Mat kernel = (Mat_(3, 3) << + Mat kernel = (Mat_(3, 3) << 0, 1, 0, 1, -1, 1, 0, 1, 0); @@ -23,10 +23,15 @@ int main(){ Mat output_image; morphologyEx(input_image, output_image, MORPH_HITMISS, kernel); - namedWindow("Original", CV_WINDOW_NORMAL); + const int rate = 10; + kernel = (kernel + 1) * 127; + kernel.convertTo(kernel, CV_8U); + cv::resize(kernel, kernel, cv::Size(), rate, rate, INTER_NEAREST); + imshow("kernel", kernel); + cv::resize(input_image, input_image, cv::Size(), rate, rate, INTER_NEAREST); imshow("Original", input_image); - namedWindow("Hit or Miss", CV_WINDOW_NORMAL); + cv::resize(output_image, output_image, cv::Size(), rate, rate, INTER_NEAREST); imshow("Hit or Miss", output_image); waitKey(0); return 0; -} \ No newline at end of file +} diff --git a/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp b/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp new file mode 100644 index 0000000000..7fc28f03d1 --- /dev/null +++ 
b/samples/cpp/tutorial_code/ImgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.cpp @@ -0,0 +1,91 @@ +#include +#include "opencv2/imgcodecs.hpp" +#include "opencv2/highgui.hpp" + +using namespace std; +using namespace cv; + +namespace +{ +/** Global Variables */ +int alpha = 100; +int beta = 100; +int gamma_cor = 100; +Mat img_original, img_corrected, img_gamma_corrected; + +void basicLinearTransform(const Mat &img, const double alpha_, const int beta_) +{ + Mat res; + img.convertTo(res, -1, alpha_, beta_); + + hconcat(img, res, img_corrected); +} + +void gammaCorrection(const Mat &img, const double gamma_) +{ + CV_Assert(gamma_ >= 0); + //![changing-contrast-brightness-gamma-correction] + Mat lookUpTable(1, 256, CV_8U); + uchar* p = lookUpTable.ptr(); + for( int i = 0; i < 256; ++i) + p[i] = saturate_cast(pow(i / 255.0, gamma_) * 255.0); + + Mat res = img.clone(); + LUT(img, lookUpTable, res); + //![changing-contrast-brightness-gamma-correction] + + hconcat(img, res, img_gamma_corrected); +} + +void on_linear_transform_alpha_trackbar(int, void *) +{ + double alpha_value = alpha / 100.0; + int beta_value = beta - 100; + basicLinearTransform(img_original, alpha_value, beta_value); +} + +void on_linear_transform_beta_trackbar(int, void *) +{ + double alpha_value = alpha / 100.0; + int beta_value = beta - 100; + basicLinearTransform(img_original, alpha_value, beta_value); +} + +void on_gamma_correction_trackbar(int, void *) +{ + double gamma_value = gamma_cor / 100.0; + gammaCorrection(img_original, gamma_value); +} +} + +int main( int, char** argv ) +{ + img_original = imread( argv[1] ); + img_corrected = Mat(img_original.rows, img_original.cols*2, img_original.type()); + img_gamma_corrected = Mat(img_original.rows, img_original.cols*2, img_original.type()); + + hconcat(img_original, img_original, img_corrected); + hconcat(img_original, img_original, img_gamma_corrected); + + namedWindow("Brightness and contrast adjustments", 
WINDOW_AUTOSIZE); + namedWindow("Gamma correction", WINDOW_AUTOSIZE); + + createTrackbar("Alpha gain (contrast)", "Brightness and contrast adjustments", &alpha, 500, on_linear_transform_alpha_trackbar); + createTrackbar("Beta bias (brightness)", "Brightness and contrast adjustments", &beta, 200, on_linear_transform_beta_trackbar); + createTrackbar("Gamma correction", "Gamma correction", &gamma_cor, 200, on_gamma_correction_trackbar); + + while (true) + { + imshow("Brightness and contrast adjustments", img_corrected); + imshow("Gamma correction", img_gamma_corrected); + + int c = waitKey(30); + if (c == 27) + break; + } + + imwrite("linear_transform_correction.png", img_corrected); + imwrite("gamma_correction.png", img_gamma_corrected); + + return 0; +} diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp new file mode 100644 index 0000000000..c661b919b3 --- /dev/null +++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp @@ -0,0 +1,122 @@ +#include +#include +#include + +using namespace std; +using namespace cv; + +namespace +{ +//! [mandelbrot-escape-time-algorithm] +int mandelbrot(const complex &z0, const int max) +{ + complex z = z0; + for (int t = 0; t < max; t++) + { + if (z.real()*z.real() + z.imag()*z.imag() > 4.0f) return t; + z = z*z + z0; + } + + return max; +} +//! [mandelbrot-escape-time-algorithm] + +//! [mandelbrot-grayscale-value] +int mandelbrotFormula(const complex &z0, const int maxIter=500) { + int value = mandelbrot(z0, maxIter); + if(maxIter - value == 0) + { + return 0; + } + + return cvRound(sqrt(value / (float) maxIter) * 255); +} +//! [mandelbrot-grayscale-value] + +//! 
[mandelbrot-parallel] +class ParallelMandelbrot : public ParallelLoopBody +{ +public: + ParallelMandelbrot (Mat &img, const float x1, const float y1, const float scaleX, const float scaleY) + : m_img(img), m_x1(x1), m_y1(y1), m_scaleX(scaleX), m_scaleY(scaleY) + { + } + + virtual void operator ()(const Range& range) const + { + for (int r = range.start; r < range.end; r++) + { + int i = r / m_img.cols; + int j = r % m_img.cols; + + float x0 = j / m_scaleX + m_x1; + float y0 = i / m_scaleY + m_y1; + + complex z0(x0, y0); + uchar value = (uchar) mandelbrotFormula(z0); + m_img.ptr(i)[j] = value; + } + } + + ParallelMandelbrot& operator=(const ParallelMandelbrot &) { + return *this; + }; + +private: + Mat &m_img; + float m_x1; + float m_y1; + float m_scaleX; + float m_scaleY; +}; +//! [mandelbrot-parallel] + +//! [mandelbrot-sequential] +void sequentialMandelbrot(Mat &img, const float x1, const float y1, const float scaleX, const float scaleY) +{ + for (int i = 0; i < img.rows; i++) + { + for (int j = 0; j < img.cols; j++) + { + float x0 = j / scaleX + x1; + float y0 = i / scaleY + y1; + + complex z0(x0, y0); + uchar value = (uchar) mandelbrotFormula(z0); + img.ptr(i)[j] = value; + } + } +} +//! [mandelbrot-sequential] +} + +int main() +{ + //! [mandelbrot-transformation] + Mat mandelbrotImg(4800, 5400, CV_8U); + float x1 = -2.1f, x2 = 0.6f; + float y1 = -1.2f, y2 = 1.2f; + float scaleX = mandelbrotImg.cols / (x2 - x1); + float scaleY = mandelbrotImg.rows / (y2 - y1); + //! [mandelbrot-transformation] + + double t1 = (double) getTickCount(); + //! [mandelbrot-parallel-call] + ParallelMandelbrot parallelMandelbrot(mandelbrotImg, x1, y1, scaleX, scaleY); + parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), parallelMandelbrot); + //! 
[mandelbrot-parallel-call] + t1 = ((double) getTickCount() - t1) / getTickFrequency(); + cout << "Parallel Mandelbrot: " << t1 << " s" << endl; + + Mat mandelbrotImgSequential(4800, 5400, CV_8U); + double t2 = (double) getTickCount(); + sequentialMandelbrot(mandelbrotImgSequential, x1, y1, scaleX, scaleY); + t2 = ((double) getTickCount() - t2) / getTickFrequency(); + cout << "Sequential Mandelbrot: " << t2 << " s" << endl; + cout << "Speed-up: " << t2/t1 << " X" << endl; + + imwrite("Mandelbrot_parallel.png", mandelbrotImg); + imwrite("Mandelbrot_sequential.png", mandelbrotImgSequential); + + return EXIT_SUCCESS; +} diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt index 8c97ea865a..0f734677be 100644 --- a/samples/gpu/CMakeLists.txt +++ b/samples/gpu/CMakeLists.txt @@ -14,27 +14,19 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) project("${project}_samples") ocv_include_modules_recurse(${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS}) - ocv_include_directories( - "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia" - "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia/core" - ) if(HAVE_opencv_xfeatures2d) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/xfeatures2d/include") + ocv_include_modules_recurse(opencv_xfeatures2d) endif() if(HAVE_opencv_cudacodec) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/cudacodec/include") + ocv_include_modules_recurse(opencv_cudacodec) endif() if(HAVE_CUDA) ocv_include_directories(${CUDA_INCLUDE_DIRS}) endif() - if(HAVE_OPENCL) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/ocl/include") - endif() - if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") endif() diff --git a/samples/gpu/performance/CMakeLists.txt b/samples/gpu/performance/CMakeLists.txt index 9c9fb5b676..2b1bf0be6b 100644 --- a/samples/gpu/performance/CMakeLists.txt +++ b/samples/gpu/performance/CMakeLists.txt @@ -4,11 +4,11 @@ file(GLOB sources "performance/*.cpp") file(GLOB headers 
"performance/*.h") if(HAVE_opencv_xfeatures2d) - ocv_include_directories("${opencv_xfeatures2d_SOURCE_DIR}/include") + ocv_include_modules_recurse(opencv_xfeatures2d) endif() if(HAVE_opencv_bgsegm) - ocv_include_directories("${opencv_bgsegm_SOURCE_DIR}/include") + ocv_include_modules_recurse(opencv_bgsegm) endif() add_executable(${the_target} ${sources} ${headers}) diff --git a/samples/python/tutorial_code/imgProc/hough_line_transform/hough_line_transform.py b/samples/python/tutorial_code/imgProc/hough_line_transform/hough_line_transform.py new file mode 100644 index 0000000000..0bcf6c5e43 --- /dev/null +++ b/samples/python/tutorial_code/imgProc/hough_line_transform/hough_line_transform.py @@ -0,0 +1,22 @@ +import cv2 +import numpy as np + +img = cv2.imread('../data/sudoku.png') +gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) +edges = cv2.Canny(gray,50,150,apertureSize = 3) + +lines = cv2.HoughLines(edges,1,np.pi/180,200) +for line in lines: + rho,theta = line[0] + a = np.cos(theta) + b = np.sin(theta) + x0 = a*rho + y0 = b*rho + x1 = int(x0 + 1000*(-b)) + y1 = int(y0 + 1000*(a)) + x2 = int(x0 - 1000*(-b)) + y2 = int(y0 - 1000*(a)) + + cv2.line(img,(x1,y1),(x2,y2),(0,0,255),2) + +cv2.imwrite('houghlines3.jpg',img) diff --git a/samples/python/tutorial_code/imgProc/hough_line_transform/probabilistic_hough_line_transform.py b/samples/python/tutorial_code/imgProc/hough_line_transform/probabilistic_hough_line_transform.py new file mode 100644 index 0000000000..2d000a1226 --- /dev/null +++ b/samples/python/tutorial_code/imgProc/hough_line_transform/probabilistic_hough_line_transform.py @@ -0,0 +1,12 @@ +import cv2 +import numpy as np + +img = cv2.imread('../data/sudoku.png') +gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) +edges = cv2.Canny(gray,50,150,apertureSize = 3) +lines = cv2.HoughLinesP(edges,1,np.pi/180,100,minLineLength=100,maxLineGap=10) +for line in lines: + x1,y1,x2,y2 = line[0] + cv2.line(img,(x1,y1),(x2,y2),(0,255,0),2) + +cv2.imwrite('houghlines5.jpg',img)