diff --git a/apps/interactive-calibration/rotationConverters.cpp b/apps/interactive-calibration/rotationConverters.cpp
index ff31c9e380..421d15a924 100644
--- a/apps/interactive-calibration/rotationConverters.cpp
+++ b/apps/interactive-calibration/rotationConverters.cpp
@@ -16,7 +16,7 @@ void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
 {
     if((src.rows == 3) && (src.cols == 3))
     {
-        //convert rotaion matrix to 3 angles (pitch, yaw, roll)
+        //convert rotation matrix to 3 angles (pitch, yaw, roll)
         dst = cv::Mat(3, 1, CV_64F);
         double pitch, yaw, roll;
 
@@ -55,7 +55,7 @@ void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
     else if( (src.cols == 1 && src.rows == 3) ||
              (src.cols == 3 && src.rows == 1 ) )
     {
-        //convert vector which contains 3 angles (pitch, yaw, roll) to rotaion matrix
+        //convert vector which contains 3 angles (pitch, yaw, roll) to rotation matrix
         double pitch, yaw, roll;
         if(src.cols == 1 && src.rows == 3)
         {
diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index 632b8c8285..37d557a792 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -141,7 +141,7 @@
 #   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
 #
 #   CUDA_BUILD_CLEAN_TARGET()
-#   -- Creates a convience target that deletes all the dependency files
+#   -- Creates a convenience target that deletes all the dependency files
 #      generated.  You should make clean after running this target to ensure the
 #      dependency files get regenerated.
 #
@@ -473,7 +473,7 @@ else()
 endif()
 
 # Propagate the host flags to the host compiler via -Xcompiler
-option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
+option(CUDA_PROPAGATE_HOST_FLAGS "Propagate C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
 
 # Enable CUDA_SEPARABLE_COMPILATION
 option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled.  Requires CUDA 5.0+" OFF)
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 8beabefe41..377eb98a65 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -761,24 +761,24 @@ macro(ocv_compiler_optimization_fill_cpu_config)
   endif()
 endmacro()
 
-macro(ocv_add_dispatched_file filename)
+macro(__ocv_add_dispatched_file filename target_src_var src_directory dst_directory precomp_hpp optimizations_var)
   if(NOT OPENCV_INITIAL_PASS)
     set(__codestr "
-#include \"${CMAKE_CURRENT_LIST_DIR}/src/precomp.hpp\"
-#include \"${CMAKE_CURRENT_LIST_DIR}/src/${filename}.simd.hpp\"
+#include \"${src_directory}/${precomp_hpp}\"
+#include \"${src_directory}/${filename}.simd.hpp\"
 ")
 
-    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${CMAKE_CURRENT_LIST_DIR}/src/${filename}.simd.hpp\"")
+    set(__declarations_str "#define CV_CPU_SIMD_FILENAME \"${src_directory}/${filename}.simd.hpp\"")
     set(__dispatch_modes "BASELINE")
 
-    set(__optimizations "${ARGN}")
+    set(__optimizations "${${optimizations_var}}")
     if(CV_DISABLE_OPTIMIZATION OR NOT CV_ENABLE_INTRINSICS)
       set(__optimizations "")
     endif()
 
     foreach(OPT ${__optimizations})
       string(TOLOWER "${OPT}" OPT_LOWER)
-      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${OPT_LOWER}.cpp")
+      set(__file "${CMAKE_CURRENT_BINARY_DIR}/${dst_directory}${filename}.${OPT_LOWER}.cpp")
       if(EXISTS "${__file}")
         file(READ "${__file}" __content)
       else()
@@ -791,7 +791,11 @@ macro(ocv_add_dispatched_file filename)
       endif()
 
       if(";${CPU_DISPATCH};" MATCHES "${OPT}" OR __CPU_DISPATCH_INCLUDE_ALL)
-        list(APPEND OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED "${__file}")
+        if(EXISTS "${src_directory}/${filename}.${OPT_LOWER}.cpp")
+          message(STATUS "Using overrided ${OPT} source: ${src_directory}/${filename}.${OPT_LOWER}.cpp")
+        else()
+          list(APPEND ${target_src_var} "${__file}")
+        endif()
       endif()
 
       set(__declarations_str "${__declarations_str}
@@ -803,9 +807,11 @@ macro(ocv_add_dispatched_file filename)
 
     set(__declarations_str "${__declarations_str}
 #define CV_CPU_DISPATCH_MODES_ALL ${__dispatch_modes}
+
+#undef CV_CPU_SIMD_FILENAME
 ")
 
-    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.simd_declarations.hpp")
+    set(__file "${CMAKE_CURRENT_BINARY_DIR}/${dst_directory}${filename}.simd_declarations.hpp")
     if(EXISTS "${__file}")
       file(READ "${__file}" __content)
     endif()
@@ -817,6 +823,17 @@ macro(ocv_add_dispatched_file filename)
   endif()
 endmacro()
 
+macro(ocv_add_dispatched_file filename)
+  set(__optimizations "${ARGN}")
+  if(" ${ARGV1}" STREQUAL " TEST")
+    list(REMOVE_AT __optimizations 0)
+    __ocv_add_dispatched_file("${filename}" "OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED" "${CMAKE_CURRENT_LIST_DIR}/test" "test/" "test_precomp.hpp" __optimizations)
+  else()
+    __ocv_add_dispatched_file("${filename}" "OPENCV_MODULE_${the_module}_SOURCES_DISPATCHED" "${CMAKE_CURRENT_LIST_DIR}/src" "" "precomp.hpp" __optimizations)
+  endif()
+endmacro()
+
+
 # Workaround to support code which always require all code paths
 macro(ocv_add_dispatched_file_force_all)
   set(__CPU_DISPATCH_INCLUDE_ALL 1)
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 3bdb6fa961..58e204094f 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -3,7 +3,7 @@ if(WIN32 AND NOT MSVC)
   return()
 endif()
 
-if(NOT APPLE AND CV_CLANG)
+if(NOT UNIX AND CV_CLANG)
   message(STATUS "CUDA compilation is disabled (due to Clang unsupported on your platform).")
   return()
 endif()
@@ -188,6 +188,13 @@ if(CUDA_FOUND)
     foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
       set(${var}_backup_in_cuda_compile_ "${${var}}")
 
+      if (CV_CLANG)
+        # we remove -Winconsistent-missing-override and -Qunused-arguments
+        # just in case we are compiling CUDA with gcc but OpenCV with clang
+        string(REPLACE "-Winconsistent-missing-override" "" ${var} "${${var}}")
+        string(REPLACE "-Qunused-arguments" "" ${var} "${${var}}")
+      endif()
+
       # we remove /EHa as it generates warnings under windows
       string(REPLACE "/EHa" "" ${var} "${${var}}")
 
diff --git a/cmake/OpenCVDownload.cmake b/cmake/OpenCVDownload.cmake
index 7724147d31..cdc47ad2cb 100644
--- a/cmake/OpenCVDownload.cmake
+++ b/cmake/OpenCVDownload.cmake
@@ -20,16 +20,19 @@ if(DEFINED ENV{OPENCV_DOWNLOAD_PATH})
 endif()
 set(OPENCV_DOWNLOAD_PATH "${OpenCV_SOURCE_DIR}/.cache" CACHE PATH "${HELP_OPENCV_DOWNLOAD_PATH}")
 set(OPENCV_DOWNLOAD_LOG "${OpenCV_BINARY_DIR}/CMakeDownloadLog.txt")
+set(OPENCV_DOWNLOAD_WITH_CURL "${OpenCV_BINARY_DIR}/download_with_curl.sh")
+set(OPENCV_DOWNLOAD_WITH_WGET "${OpenCV_BINARY_DIR}/download_with_wget.sh")
 
-# Init download cache directory and log file
+# Init download cache directory, log file and helper scripts
 if(NOT EXISTS "${OPENCV_DOWNLOAD_PATH}")
   file(MAKE_DIRECTORY ${OPENCV_DOWNLOAD_PATH})
 endif()
 if(NOT EXISTS "${OPENCV_DOWNLOAD_PATH}/.gitignore")
   file(WRITE "${OPENCV_DOWNLOAD_PATH}/.gitignore" "*\n")
 endif()
-file(WRITE "${OPENCV_DOWNLOAD_LOG}" "use_cache \"${OPENCV_DOWNLOAD_PATH}\"\n")
-
+file(WRITE "${OPENCV_DOWNLOAD_LOG}" "#use_cache \"${OPENCV_DOWNLOAD_PATH}\"\n")
+file(REMOVE "${OPENCV_DOWNLOAD_WITH_CURL}")
+file(REMOVE "${OPENCV_DOWNLOAD_WITH_WGET}")
 
 function(ocv_download)
   cmake_parse_arguments(DL "UNPACK;RELATIVE_URL" "FILENAME;HASH;DESTINATION_DIR;ID;STATUS" "URL" ${ARGN})
@@ -103,7 +106,7 @@ function(ocv_download)
   endif()
 
   # Log all calls to file
-  ocv_download_log("do_${mode} \"${DL_FILENAME}\" \"${DL_HASH}\" \"${DL_URL}\" \"${DL_DESTINATION_DIR}\"")
+  ocv_download_log("#do_${mode} \"${DL_FILENAME}\" \"${DL_HASH}\" \"${DL_URL}\" \"${DL_DESTINATION_DIR}\"")
   # ... and to console
   set(__msg_prefix "")
   if(DL_ID)
@@ -191,6 +194,9 @@ function(ocv_download)
 For details please refer to the download log file:
 ${OPENCV_DOWNLOAD_LOG}
 ")
+      # write helper scripts for failed downloads
+      file(APPEND "${OPENCV_DOWNLOAD_WITH_CURL}" "curl --output \"${CACHE_CANDIDATE}\" \"${DL_URL}\"\n")
+      file(APPEND "${OPENCV_DOWNLOAD_WITH_WGET}" "wget -O \"${CACHE_CANDIDATE}\" \"${DL_URL}\"\n")
       return()
     endif()
 
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 00d15dc6d9..54f100d3cf 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -1202,6 +1202,9 @@ function(ocv_add_accuracy_tests)
         set(OPENCV_TEST_${the_module}_SOURCES ${test_srcs} ${test_hdrs})
       endif()
 
+      if(OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED)
+        list(APPEND OPENCV_TEST_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_TEST_SOURCES_DISPATCHED})
+      endif()
       ocv_compiler_optimization_process_sources(OPENCV_TEST_${the_module}_SOURCES OPENCV_TEST_${the_module}_DEPS ${the_target})
 
       if(NOT BUILD_opencv_world)
@@ -1211,6 +1214,9 @@ function(ocv_add_accuracy_tests)
       source_group("Src" FILES "${${the_target}_pch}")
       ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
       ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
+      if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/test")
+        ocv_target_include_directories(${the_target} "${CMAKE_CURRENT_BINARY_DIR}/test")
+      endif()
       ocv_target_link_libraries(${the_target} LINK_PRIVATE ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS} ${OPENCV_TEST_${the_module}_DEPS})
       add_dependencies(opencv_tests ${the_target})
 
diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake
index b4658c604b..f9b1b48b65 100644
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@@ -362,7 +362,7 @@ MACRO(ADD_NATIVE_PRECOMPILED_HEADER _targetName _input)
           endif()
         endforeach()
 
-        #also inlude ${oldProps} to have the same compile options
+        #also include ${oldProps} to have the same compile options
         GET_TARGET_PROPERTY(oldProps ${_targetName} COMPILE_FLAGS)
         if (oldProps MATCHES NOTFOUND)
             SET(oldProps "")
diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in
index 84262a87b3..fefa359e0a 100644
--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@@ -260,7 +260,7 @@ endif()
 set(OpenCV_LIBRARIES ${OpenCV_LIBS})
 
 #
-# Some macroses for samples
+# Some macros for samples
 #
 macro(ocv_check_dependencies)
   set(OCV_DEPENDENCIES_FOUND TRUE)
diff --git a/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown b/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown
index 570a490fea..ef71d07aa5 100644
--- a/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown
+++ b/doc/js_tutorials/js_imgproc/js_grabcut/js_grabcut.markdown
@@ -29,7 +29,7 @@ What happens in background ?
     objects). Everything inside rectangle is unknown. Similarly any user input specifying
     foreground and background are considered as hard-labelling which means they won't change in
     the process.
--   Computer does an initial labelling depeding on the data we gave. It labels the foreground and
+-   Computer does an initial labelling depending on the data we gave. It labels the foreground and
     background pixels (or it hard-labels)
 -   Now a Gaussian Mixture Model(GMM) is used to model the foreground and background.
 -   Depending on the data we gave, GMM learns and create new pixel distribution. That is, the
diff --git a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
index 72f481df7a..88aba1afd5 100644
--- a/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
+++ b/doc/js_tutorials/js_setup/js_usage/js_usage.markdown
@@ -129,7 +129,7 @@ function onOpenCvReady() {
 </html>
 @endcode
 
-@note You have to call delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memeory management of Emscripten](https://kripken.github.io/emscripten-site/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details.
+@note You have to call delete method of cv.Mat to free memory allocated in Emscripten's heap. Please refer to [Memory management of Emscripten](https://kripken.github.io/emscripten-site/docs/porting/connecting_cpp_and_javascript/embind.html#memory-management) for details.
 
 Try it
 ------
diff --git a/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown b/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown
index 2c489e2453..7dc22d37aa 100644
--- a/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown
+++ b/doc/py_tutorials/py_imgproc/py_grabcut/py_grabcut.markdown
@@ -37,7 +37,7 @@ So what happens in background ?
     objects). Everything inside rectangle is unknown. Similarly any user input specifying
     foreground and background are considered as hard-labelling which means they won't change in
     the process.
--   Computer does an initial labelling depeding on the data we gave. It labels the foreground and
+-   Computer does an initial labelling depending on the data we gave. It labels the foreground and
     background pixels (or it hard-labels)
 -   Now a Gaussian Mixture Model(GMM) is used to model the foreground and background.
 -   Depending on the data we gave, GMM learns and create new pixel distribution. That is, the
diff --git a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
index b974b8bc63..4347d11651 100644
--- a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
+++ b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
@@ -16,7 +16,7 @@ In this tutorial is explained how to build a real time application to estimate t
 order to track a textured object with six degrees of freedom given a 2D image and its 3D textured
 model.
 
-The application will have the followings parts:
+The application will have the following parts:
 
 -   Read 3D textured object model and object mesh.
 -   Take input from Camera or Video.
@@ -426,16 +426,16 @@ Here is explained in detail the code for the real time application:
     @endcode
     OpenCV provides four PnP methods: ITERATIVE, EPNP, P3P and DLS. Depending on the application type,
     the estimation method will be different. In the case that we want to make a real time application,
-    the more suitable methods are EPNP and P3P due to that are faster than ITERATIVE and DLS at
+    the more suitable methods are EPNP and P3P since they are faster than ITERATIVE and DLS at
     finding an optimal solution. However, EPNP and P3P are not especially robust in front of planar
-    surfaces and sometimes the pose estimation seems to have a mirror effect. Therefore, in this this
-    tutorial is used ITERATIVE method due to the object to be detected has planar surfaces.
+    surfaces and sometimes the pose estimation seems to have a mirror effect. Therefore, in this
+    tutorial the ITERATIVE method is used because the object to be detected has planar surfaces.
 
-    The OpenCV RANSAC implementation wants you to provide three parameters: the maximum number of
-    iterations until stop the algorithm, the maximum allowed distance between the observed and
-    computed point projections to consider it an inlier and the confidence to obtain a good result.
+    The OpenCV RANSAC implementation wants you to provide three parameters: 1) the maximum number of
+    iterations until the algorithm stops, 2) the maximum allowed distance between the observed and
+    computed point projections to consider it an inlier and 3) the confidence to obtain a good result.
     You can tune these parameters in order to improve your algorithm performance. Increasing the
-    number of iterations you will have a more accurate solution, but will take more time to find a
+    number of iterations will give a more accurate solution, but it will take more time to find a
     solution. Increasing the reprojection error will reduce the computation time, but your solution
     will be unaccurate. Decreasing the confidence your algorithm will be faster, but the obtained
     solution will be unaccurate.
diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown
index e60c846b12..7f491d8fdd 100644
--- a/doc/tutorials/introduction/windows_install/windows_install.markdown
+++ b/doc/tutorials/introduction/windows_install/windows_install.markdown
@@ -46,7 +46,7 @@ cd /c/lib
 myRepo=$(pwd)
 CMAKE_CONFIG_GENERATOR="Visual Studio 14 2015 Win64"
 if [  ! -d "$myRepo/opencv"  ]; then
-    echo "clonning opencv"
+    echo "cloning opencv"
     git clone https://github.com/opencv/opencv.git
     mkdir Build
     mkdir Build/opencv
@@ -58,7 +58,7 @@ else
     cd ..
 fi
 if [  ! -d "$myRepo/opencv_contrib"  ]; then
-    echo "clonning opencv_contrib"
+    echo "cloning opencv_contrib"
     git clone https://github.com/opencv/opencv_contrib.git
     mkdir Build
     mkdir Build/opencv_contrib
diff --git a/modules/calib3d/test/test_chesscorners.cpp b/modules/calib3d/test/test_chesscorners.cpp
index 8303a8dcd4..e55d069de0 100644
--- a/modules/calib3d/test/test_chesscorners.cpp
+++ b/modules/calib3d/test/test_chesscorners.cpp
@@ -198,7 +198,7 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
 
     if( !fs.isOpened() || board_list.empty() || !board_list.isSeq() || board_list.size() % 2 != 0 )
     {
-        ts->printf( cvtest::TS::LOG, "%s can not be readed or is not valid\n", (folder + filename).c_str() );
+        ts->printf( cvtest::TS::LOG, "%s can not be read or is not valid\n", (folder + filename).c_str() );
         ts->printf( cvtest::TS::LOG, "fs.isOpened=%d, board_list.empty=%d, board_list.isSeq=%d,board_list.size()%2=%d\n",
             fs.isOpened(), (int)board_list.empty(), board_list.isSeq(), board_list.size()%2);
         ts->set_failed_test_info( cvtest::TS::FAIL_MISSING_TEST_DATA );
diff --git a/modules/calib3d/test/test_chesscorners_timing.cpp b/modules/calib3d/test/test_chesscorners_timing.cpp
index 4d643a1d45..b89d2e0147 100644
--- a/modules/calib3d/test/test_chesscorners_timing.cpp
+++ b/modules/calib3d/test/test_chesscorners_timing.cpp
@@ -85,7 +85,7 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
     if( !fs || !board_list || !CV_NODE_IS_SEQ(board_list->tag) ||
         board_list->data.seq->total % 4 != 0 )
     {
-        ts->printf( cvtest::TS::LOG, "chessboard_timing_list.dat can not be readed or is not valid" );
+        ts->printf( cvtest::TS::LOG, "chessboard_timing_list.dat can not be read or is not valid" );
         code = cvtest::TS::FAIL_MISSING_TEST_DATA;
         goto _exit_;
     }
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 1997c906bc..455afaf593 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -3,6 +3,10 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
 
+# dispatching for accuracy tests
+ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
+ocv_add_dispatched_file_force_all(test_intrin256 TEST AVX2)
+
 ocv_add_module(core
                OPTIONAL opencv_cudev
                WRAP java python js)
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 263659d302..ff2d5160d2 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -204,20 +204,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD512_64F 0
 #endif
 
-#if CV_SIMD512
-    #define CV_SIMD 1
-    #define CV_SIMD_64F CV_SIMD512_64F
-    #define CV_SIMD_WIDTH 64
-#elif CV_SIMD256
-    #define CV_SIMD 1
-    #define CV_SIMD_64F CV_SIMD256_64F
-    #define CV_SIMD_WIDTH 32
-#else
-    #define CV_SIMD CV_SIMD128
-    #define CV_SIMD_64F CV_SIMD128_64F
-    #define CV_SIMD_WIDTH 16
-#endif
-
 //==================================================================================================
 
 #define CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
@@ -309,7 +295,21 @@ template<typename _Tp> struct V_RegTraits
 #endif
 #endif
 
-#if CV_SIMD256
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD512_64F
+    #define CV_SIMD_WIDTH 64
+    // TODO typedef v_uint8 / v_int32 / etc types here
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+#define CV__SIMD_NAMESPACE simd256
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD256_64F
+    #define CV_SIMD_WIDTH 32
     typedef v_uint8x32   v_uint8;
     typedef v_int8x32    v_int8;
     typedef v_uint16x16  v_uint16;
@@ -329,7 +329,14 @@ template<typename _Tp> struct V_RegTraits
     CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
     CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
     inline void vx_cleanup() { v256_cleanup(); }
-#elif CV_SIMD128 || CV_SIMD128_CPP
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+#define CV__SIMD_NAMESPACE simd128
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD CV_SIMD128
+    #define CV_SIMD_64F CV_SIMD128_64F
+    #define CV_SIMD_WIDTH 16
     typedef v_uint8x16  v_uint8;
     typedef v_int8x16   v_int8;
     typedef v_uint16x8  v_uint16;
@@ -351,6 +358,8 @@ template<typename _Tp> struct V_RegTraits
     CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
     #endif
     inline void vx_cleanup() { v_cleanup(); }
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
 #endif
 
 inline unsigned int trailingZeros32(unsigned int value) {
@@ -380,6 +389,14 @@ inline unsigned int trailingZeros32(unsigned int value) {
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #endif
 
+#ifndef CV_SIMD_64F
+#define CV_SIMD_64F 0
+#endif
+
+#ifndef CV_SIMD
+#define CV_SIMD 0
+#endif
+
 } // cv::
 
 //! @endcond
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index d1f24d17b5..e58486fb5d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -494,7 +494,12 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
 {
     __m128i delta32 = _mm_set1_epi32(32768);
-    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
+
+    // preliminarily saturate negative values to zero
+    __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
+    __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
+
+    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 }
 
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 7e384a5c6f..81e986fcd1 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -1764,7 +1764,7 @@ typedef struct CvString
 }
 CvString;
 
-/** All the keys (names) of elements in the readed file storage
+/** All the keys (names) of elements in the read file storage
    are stored in the hash to speed up the lookup operations: */
 typedef struct CvStringHashNode
 {
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 481b86b4f1..e5fd24dfad 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -453,9 +453,9 @@ struct Cvt_SIMD<int, uchar>
             {
                 v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
                 v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3);
-                v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2);
-                v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4);
-                v_store(dst + x, v_pack(v_dst1, v_dst2));
+                v_int16x8 v_dst1 = v_pack(v_src1, v_src2);
+                v_int16x8 v_dst2 = v_pack(v_src3, v_src4);
+                v_store(dst + x, v_pack_u(v_dst1, v_dst2));
             }
         }
         return x;
diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp
index 56528fcf69..83c11c1855 100644
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@@ -2779,7 +2779,7 @@ cvGraphAddEdgeByPtr( CvGraph* graph,
 
     if( start_vtx == end_vtx )
         CV_Error( start_vtx ? CV_StsBadArg : CV_StsNullPtr,
-        "vertex pointers coinside (or set to NULL)" );
+        "vertex pointers coincide (or set to NULL)" );
 
     edge = (CvGraphEdge*)cvSetNew( (CvSet*)(graph->edges) );
     assert( edge->flags >= 0 );
diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp
index 9c52f0e20c..300a718506 100644
--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@@ -36,13 +36,14 @@ vecmerge_( const T** src, T* dst, int len, int cn )
     const T* src0 = src[0];
     const T* src1 = src[1];
 
+    const int dstElemSize = cn * sizeof(T);
     int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
     hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
     if( r != 0 )
     {
         mode = hal::STORE_UNALIGNED;
-        if( r % cn == 0 && len > VECSZ )
-            i0 = VECSZ - (r / cn);
+        if (r % dstElemSize == 0 && len > VECSZ*2)
+            i0 = VECSZ - (r / dstElemSize);
     }
 
     if( cn == 2 )
diff --git a/modules/core/src/persistence_c.cpp b/modules/core/src/persistence_c.cpp
index ed315d0971..ed349cc150 100644
--- a/modules/core/src/persistence_c.cpp
+++ b/modules/core/src/persistence_c.cpp
@@ -1063,7 +1063,7 @@ cvReadRawDataSlice( const CvFileStorage* fs, CvSeqReader* reader,
         CV_Error( CV_StsNullPtr, "Null pointer to reader or destination array" );
 
     if( !reader->seq && len != 1 )
-        CV_Error( CV_StsBadSize, "The readed sequence is a scalar, thus len must be 1" );
+        CV_Error( CV_StsBadSize, "The read sequence is a scalar, thus len must be 1" );
 
     fmt_pair_count = icvDecodeFormat( dt, fmt_pairs, CV_FS_MAX_FMT_PAIRS );
     size_t step = ::icvCalcStructSize(dt, 0);
diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp
index 78d8daadd0..3fab6874b7 100644
--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@@ -27,8 +27,8 @@ vecsplit_( const T* src, T** dst, int len, int cn )
     if( (r0|r1|r2|r3) != 0 )
     {
         mode = hal::STORE_UNALIGNED;
-        if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % cn == 0 && len > VECSZ )
-            i0 = VECSZ - (r0 / cn);
+        if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % sizeof(T) == 0 && len > VECSZ*2 )
+            i0 = VECSZ - (r0 / sizeof(T));
     }
 
     if( cn == 2 )
diff --git a/modules/core/src/utils/filesystem.cpp b/modules/core/src/utils/filesystem.cpp
index 23bed074f7..32183a2f6c 100644
--- a/modules/core/src/utils/filesystem.cpp
+++ b/modules/core/src/utils/filesystem.cpp
@@ -469,7 +469,32 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
         {
             if (utils::fs::isDirectory(default_cache_path))
             {
-                default_cache_path = utils::fs::join(default_cache_path, utils::fs::join("opencv", CV_VERSION));
+                cv::String default_cache_path_base = utils::fs::join(default_cache_path, "opencv");
+                default_cache_path = utils::fs::join(default_cache_path_base, "4.0" CV_VERSION_STATUS);
+                if (utils::getConfigurationParameterBool("OPENCV_CACHE_SHOW_CLEANUP_MESSAGE", true)
+                    && !utils::fs::isDirectory(default_cache_path))
+                {
+                    std::vector<cv::String> existedCacheDirs;
+                    try
+                    {
+                        utils::fs::glob_relative(default_cache_path_base, "*", existedCacheDirs, false, true);
+                    }
+                    catch (...)
+                    {
+                        // ignore
+                    }
+                    if (!existedCacheDirs.empty())
+                    {
+                        CV_LOG_WARNING(NULL, "Creating new OpenCV cache directory: " << default_cache_path);
+                        CV_LOG_WARNING(NULL, "There are several neighbour directories, probably created by old OpenCV versions.");
+                        CV_LOG_WARNING(NULL, "Feel free to cleanup these unused directories:");
+                        for (size_t i = 0; i < existedCacheDirs.size(); i++)
+                        {
+                            CV_LOG_WARNING(NULL, "  - " << existedCacheDirs[i]);
+                        }
+                        CV_LOG_WARNING(NULL, "Note: This message is showed only once.");
+                    }
+                }
                 if (sub_directory_name && sub_directory_name[0] != '\0')
                     default_cache_path = utils::fs::join(default_cache_path, cv::String(sub_directory_name) + native_separator);
                 if (!utils::fs::createDirectories(default_cache_path))
diff --git a/modules/core/test/test_intrin.avx2.cpp b/modules/core/test/test_intrin.avx2.cpp
deleted file mode 100644
index 9ebfcdf542..0000000000
--- a/modules/core/test/test_intrin.avx2.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin.simd.hpp"
\ No newline at end of file
diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp
index 6610e332de..602877382d 100644
--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -2,101 +2,100 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
-#include "test_intrin.simd.hpp"
 
-#define CV_CPU_SIMD_FILENAME "test_intrin.simd.hpp"
-#define CV_CPU_DISPATCH_MODE FP16
-#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
+#include "test_intrin128.simd.hpp"
+#include "test_intrin128.simd_declarations.hpp"
+
+#undef CV_CPU_DISPATCH_MODES_ALL
+
+#include "opencv2/core/cv_cpu_dispatch.h"
+#include "test_intrin256.simd.hpp"
+#include "test_intrin256.simd_declarations.hpp"
 
-#define CV_CPU_DISPATCH_MODE AVX2
-#include "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
 
 namespace opencv_test { namespace hal {
-using namespace CV_CPU_OPTIMIZATION_NAMESPACE;
 
-TEST(hal_intrin, uint8x16)
-{ test_hal_intrin_uint8(); }
+#define CV_CPU_CALL_BASELINE_(fn, args)  CV_CPU_CALL_BASELINE(fn, args)
 
-TEST(hal_intrin, int8x16)
-{ test_hal_intrin_int8(); }
+#define DISPATCH_SIMD128(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD128 (" #cpu_opt ") is not available or disabled"); \
+} while(0)
 
-TEST(hal_intrin, uint16x8)
-{ test_hal_intrin_uint16(); }
+#define DISPATCH_SIMD256(fn, cpu_opt) do { \
+    CV_CPU_CALL_ ## cpu_opt ## _(fn, ()); \
+    throw SkipTestException("SIMD256 (" #cpu_opt ") is not available or disabled"); \
+} while(0)
 
-TEST(hal_intrin, int16x8)
-{ test_hal_intrin_int16(); }
+#define DEFINE_SIMD_TESTS(simd_size, cpu_opt) \
+TEST(hal_intrin ## simd_size, uint8x16_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint8, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int8x16_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int8, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint16x8_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint16, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int16x8_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int16, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int32x4_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint32x4_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, uint64x2_ ## cpu_opt)  { DISPATCH_SIMD ## simd_size(test_hal_intrin_uint64, cpu_opt); } \
+TEST(hal_intrin ## simd_size, int64x2_ ## cpu_opt)   { DISPATCH_SIMD ## simd_size(test_hal_intrin_int64, cpu_opt); } \
+TEST(hal_intrin ## simd_size, float32x4_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_float32, cpu_opt); } \
+TEST(hal_intrin ## simd_size, float64x2_ ## cpu_opt) { DISPATCH_SIMD ## simd_size(test_hal_intrin_float64, cpu_opt); } \
 
-TEST(hal_intrin, int32x4)
-{ test_hal_intrin_int32(); }
+namespace intrin128 {
 
-TEST(hal_intrin, uint32x4)
-{ test_hal_intrin_uint32(); }
+DEFINE_SIMD_TESTS(128, BASELINE)
 
-TEST(hal_intrin, uint64x2)
-{ test_hal_intrin_uint64(); }
+#if defined CV_CPU_DISPATCH_COMPILE_SSE2 || defined CV_CPU_BASELINE_COMPILE_SSE2
+DEFINE_SIMD_TESTS(128, SSE2)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE3 || defined CV_CPU_BASELINE_COMPILE_SSE3
+DEFINE_SIMD_TESTS(128, SSE3)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSSE3 || defined CV_CPU_BASELINE_COMPILE_SSSE3
+DEFINE_SIMD_TESTS(128, SSSE3)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE4_1 || defined CV_CPU_BASELINE_COMPILE_SSE4_1
+DEFINE_SIMD_TESTS(128, SSE4_1)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_SSE4_2 || defined CV_CPU_BASELINE_COMPILE_SSE4_2
+DEFINE_SIMD_TESTS(128, SSE4_2)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX || defined CV_CPU_BASELINE_COMPILE_AVX
+DEFINE_SIMD_TESTS(128, AVX)
+#endif
+#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
+DEFINE_SIMD_TESTS(128, AVX2)
+#endif
 
-TEST(hal_intrin, int64x2)
-{ test_hal_intrin_int64(); }
-
-TEST(hal_intrin, float32x4)
-{ test_hal_intrin_float32(); }
-
-TEST(hal_intrin, float64x2)
-{ test_hal_intrin_float64(); }
-
-TEST(hal_intrin, float16x8)
+TEST(hal_intrin128, float16x8_FP16)
 {
     CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
     throw SkipTestException("Unsupported hardware: FP16 is not available");
 }
 
-#define DISPATCH_SIMD_MODES AVX2
-#define DISPATCH_SIMD_NAME "SIMD256"
-#define DISPATCH_SIMD(fun)                              \
-    do {                                                \
-        CV_CPU_DISPATCH(fun, (), DISPATCH_SIMD_MODES);  \
-        throw SkipTestException(                        \
-            "Unsupported hardware: "                    \
-            DISPATCH_SIMD_NAME                          \
-            " is not available"                         \
-        );                                              \
-    } while(0)
+} // namespace intrin128
 
-TEST(hal_intrin256, uint8x32)
-{ DISPATCH_SIMD(test_hal_intrin_uint8); }
 
-TEST(hal_intrin256, int8x32)
-{ DISPATCH_SIMD(test_hal_intrin_int8); }
+namespace intrin256 {
 
-TEST(hal_intrin256, uint16x16)
-{ DISPATCH_SIMD(test_hal_intrin_uint16); }
 
-TEST(hal_intrin256, int16x16)
-{ DISPATCH_SIMD(test_hal_intrin_int16); }
+// Not available due to missing C++ backend for SIMD256
+//DEFINE_SIMD_TESTS(256, BASELINE)
 
-TEST(hal_intrin256, uint32x8)
-{ DISPATCH_SIMD(test_hal_intrin_uint32); }
+//#if defined CV_CPU_DISPATCH_COMPILE_AVX
+//DEFINE_SIMD_TESTS(256, AVX)
+//#endif
 
-TEST(hal_intrin256, int32x8)
-{ DISPATCH_SIMD(test_hal_intrin_int32); }
+#if defined CV_CPU_DISPATCH_COMPILE_AVX2 || defined CV_CPU_BASELINE_COMPILE_AVX2
+DEFINE_SIMD_TESTS(256, AVX2)
+#endif
 
-TEST(hal_intrin256, uint64x4)
-{ DISPATCH_SIMD(test_hal_intrin_uint64); }
-
-TEST(hal_intrin256, int64x4)
-{ DISPATCH_SIMD(test_hal_intrin_int64); }
-
-TEST(hal_intrin256, float32x8)
-{ DISPATCH_SIMD(test_hal_intrin_float32); }
-
-TEST(hal_intrin256, float64x4)
-{ DISPATCH_SIMD(test_hal_intrin_float64); }
-
-TEST(hal_intrin256, float16x16)
+TEST(hal_intrin256, float16x16_FP16)
 {
-    if (!CV_CPU_HAS_SUPPORT_FP16)
-        throw SkipTestException("Unsupported hardware: FP16 is not available");
-    DISPATCH_SIMD(test_hal_intrin_float16);
+    //CV_CPU_CALL_FP16_(test_hal_intrin_float16, ());
+    CV_CPU_CALL_AVX2_(test_hal_intrin_float16, ());
+    throw SkipTestException("Unsupported hardware: FP16 is not available");
 }
 
+
+} // namespace intrin256
+
 }} // namespace
\ No newline at end of file
diff --git a/modules/core/test/test_intrin.fp16.cpp b/modules/core/test/test_intrin.fp16.cpp
deleted file mode 100644
index 9f6416bcf8..0000000000
--- a/modules/core/test/test_intrin.fp16.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin_utils.hpp"
-
-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-
-void test_hal_intrin_float16()
-{
-    TheTest<v_float16>()
-        .test_loadstore_fp16()
-        .test_float_cvt_fp16()
-        ;
-}
-
-CV_CPU_OPTIMIZATION_NAMESPACE_END
-}} // namespace
diff --git a/modules/core/test/test_intrin.simd.hpp b/modules/core/test/test_intrin.simd.hpp
deleted file mode 100644
index 4e0d3a073f..0000000000
--- a/modules/core/test/test_intrin.simd.hpp
+++ /dev/null
@@ -1,296 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-#include "test_precomp.hpp"
-#include "test_intrin_utils.hpp"
-
-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-
-void test_hal_intrin_uint8();
-void test_hal_intrin_int8();
-void test_hal_intrin_uint16();
-void test_hal_intrin_int16();
-void test_hal_intrin_uint32();
-void test_hal_intrin_int32();
-void test_hal_intrin_uint64();
-void test_hal_intrin_int64();
-void test_hal_intrin_float32();
-void test_hal_intrin_float64();
-
-#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-
-//=============  8-bit integer =====================================================================
-
-void test_hal_intrin_uint8()
-{
-    TheTest<v_uint8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_uint8>()
-        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
-        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
-        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
-        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
-        ;
-#endif
-}
-
-void test_hal_intrin_int8()
-{
-    TheTest<v_int8>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_expand_q()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_cmp()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
-        ;
-}
-
-//============= 16-bit integer =====================================================================
-
-void test_hal_intrin_uint16()
-{
-    TheTest<v_uint16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-void test_hal_intrin_int16()
-{
-    TheTest<v_int16>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_addsub_wrap()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_dot_prod()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_abs()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
-        ;
-}
-
-//============= 32-bit integer =====================================================================
-
-void test_hal_intrin_uint32()
-{
-    TheTest<v_uint32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_mul_expand()
-        .test_cmp()
-        .test_shift<1>()
-        .test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_popcount()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_transpose()
-        ;
-}
-
-void test_hal_intrin_int32()
-{
-    TheTest<v_int32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_expand()
-        .test_addsub()
-        .test_mul()
-        .test_abs()
-        .test_cmp()
-        .test_popcount()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_min_max()
-        .test_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
-        .test_unpack()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        .test_float_cvt32()
-        .test_float_cvt64()
-        .test_transpose()
-        ;
-}
-
-//============= 64-bit integer =====================================================================
-
-void test_hal_intrin_uint64()
-{
-    TheTest<v_uint64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-void test_hal_intrin_int64()
-{
-    TheTest<v_int64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_shift<1>().test_shift<8>()
-        .test_logic()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-}
-
-//============= Floating point =====================================================================
-void test_hal_intrin_float32()
-{
-    TheTest<v_float32>()
-        .test_loadstore()
-        .test_interleave()
-        .test_interleave_2channel()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_reduce()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt64()
-        .test_matmul()
-        .test_transpose()
-        .test_reduce_sum4()
-        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
-        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_float32>()
-        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
-        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
-        ;
-#endif
-}
-
-void test_hal_intrin_float64()
-{
-#if CV_SIMD_64F
-    TheTest<v_float64>()
-        .test_loadstore()
-        .test_addsub()
-        .test_mul()
-        .test_div()
-        .test_cmp()
-        .test_sqrt_abs()
-        .test_min_max()
-        .test_float_absdiff()
-        .test_mask()
-        .test_unpack()
-        .test_float_math()
-        .test_float_cvt32()
-        .test_extract<0>().test_extract<1>()
-        .test_rotate<0>().test_rotate<1>()
-        ;
-
-#if CV_SIMD256
-    TheTest<v_float64>()
-        .test_extract<2>().test_extract<3>()
-        .test_rotate<2>().test_rotate<3>()
-        ;
-#endif //CV_SIMD256
-
-#endif
-}
-
-#if CV_FP16 && CV_SIMD_WIDTH > 16
-void test_hal_intrin_float16()
-{
-    TheTest<v_float16>()
-        .test_loadstore_fp16()
-        .test_float_cvt_fp16()
-        ;
-}
-#endif
-
-#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-
-CV_CPU_OPTIMIZATION_NAMESPACE_END
-
-}} //namespace
\ No newline at end of file
diff --git a/modules/core/test/test_intrin128.simd.hpp b/modules/core/test/test_intrin128.simd.hpp
new file mode 100644
index 0000000000..1d9bee2d33
--- /dev/null
+++ b/modules/core/test/test_intrin128.simd.hpp
@@ -0,0 +1,22 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#define CV__SIMD_FORCE_WIDTH 128
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 16
+#error "Invalid build configuration"
+#endif
+
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+namespace opencv_test { namespace hal { namespace intrin128 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
diff --git a/modules/core/test/test_intrin256.simd.hpp b/modules/core/test/test_intrin256.simd.hpp
new file mode 100644
index 0000000000..a5e2cd5221
--- /dev/null
+++ b/modules/core/test/test_intrin256.simd.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#if !defined CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && \
+    !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS // TODO? C++ fallback implementation for SIMD256
+
+#define CV__SIMD_FORCE_WIDTH 256
+#include "opencv2/core/hal/intrin.hpp"
+#undef CV__SIMD_FORCE_WIDTH
+
+#if CV_SIMD_WIDTH != 32
+#error "Invalid build configuration"
+#endif
+
+#endif // !CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY && !CV_DISABLE_OPTIMIZATION && CV_ENABLE_INTRINSICS
+
+namespace opencv_test { namespace hal { namespace intrin256 {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+#include "test_intrin_utils.hpp"
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}}} //namespace
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 5f3175bc6c..cc9de4fc75 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -1,10 +1,22 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
-#include "opencv2/core/hal/intrin.hpp"
 
-namespace opencv_test { namespace hal {
-CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+// This file is not standalone.
+// It is included with these active namespaces:
+//namespace opencv_test { namespace hal { namespace intrinXXX {
+//CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void test_hal_intrin_uint8();
+void test_hal_intrin_int8();
+void test_hal_intrin_uint16();
+void test_hal_intrin_int16();
+void test_hal_intrin_uint32();
+void test_hal_intrin_int32();
+void test_hal_intrin_uint64();
+void test_hal_intrin_int64();
+void test_hal_intrin_float32();
+void test_hal_intrin_float64();
 
 void test_hal_intrin_float16();
 
@@ -258,6 +270,7 @@ template<typename R> struct TheTest
         v_store(out.u.d, r_low);
         for (int i = 0; i < R::nlanes/2; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]);
         }
 
@@ -266,6 +279,7 @@ template<typename R> struct TheTest
         v_store(out.u.d, r_low_align8byte);
         for (int i = 0; i < R::nlanes/2; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)data.u[i + R::nlanes/2], (LaneType)out.u[i]);
         }
 
@@ -296,6 +310,7 @@ template<typename R> struct TheTest
         resV.fill((LaneType)8);
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)0, resZ[i]);
             EXPECT_EQ((LaneType)8, resV[i]);
         }
@@ -342,6 +357,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(data1, Data<R>(a));
             EXPECT_EQ(data2, Data<R>(b));
             EXPECT_EQ(data3, Data<R>(c));
@@ -374,6 +390,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(data1, Data<R>(a));
             EXPECT_EQ(data2, Data<R>(b));
         }
@@ -397,6 +414,7 @@ template<typename R> struct TheTest
         const int n = Rx2::nlanes;
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i], resB[i]);
             EXPECT_EQ(dataA[i], resC[i]);
             EXPECT_EQ(dataA[i + n], resD[i]);
@@ -412,7 +430,10 @@ template<typename R> struct TheTest
         Data<Rx4> out = vx_load_expand_q(data.d);
         const int n = Rx4::nlanes;
         for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(data[i], out[i]);
+        }
 
         return *this;
     }
@@ -426,6 +447,7 @@ template<typename R> struct TheTest
         Data<R> resC = a + b, resD = a - b;
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
             EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
         }
@@ -443,6 +465,7 @@ template<typename R> struct TheTest
                 resD = v_sub_wrap(a, b);
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]);
             EXPECT_EQ((LaneType)(dataA[i] - dataB[i]), resD[i]);
         }
@@ -458,6 +481,7 @@ template<typename R> struct TheTest
         Data<R> resC = a * b;
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] * dataB[i], resC[i]);
         }
 
@@ -473,6 +497,7 @@ template<typename R> struct TheTest
         Data<R> resC = a / b;
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] / dataB[i], resC[i]);
         }
 
@@ -492,6 +517,7 @@ template<typename R> struct TheTest
         const int n = R::nlanes / 2;
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]);
             EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]);
         }
@@ -511,6 +537,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < Ru::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
         }
 
@@ -529,6 +556,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resB[i]);
             EXPECT_EQ(static_cast<LaneType>(dataA[i] << s), resC[i]);
             EXPECT_EQ(static_cast<LaneType>(dataA[i] >> s), resD[i]);
@@ -553,6 +581,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0);
             EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0);
             EXPECT_EQ(dataA[i] >  dataB[i], resE[i] != 0);
@@ -583,6 +612,7 @@ template<typename R> struct TheTest
         const int n = R::nlanes / 2;
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
             EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
         }
@@ -597,6 +627,7 @@ template<typename R> struct TheTest
         Data<R> resC = a & b, resD = a | b, resE = a ^ b, resF = ~a;
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] & dataB[i], resC[i]);
             EXPECT_EQ(dataA[i] | dataB[i], resD[i]);
             EXPECT_EQ(dataA[i] ^ dataB[i], resE[i]);
@@ -615,6 +646,7 @@ template<typename R> struct TheTest
         Data<R> resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d);
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]);
             EXPECT_COMPARE_EQ(1/(float)std::sqrt(dataA[i]), (float)resC[i]);
             EXPECT_COMPARE_EQ((float)abs(dataA[i]), (float)resE[i]);
@@ -632,6 +664,7 @@ template<typename R> struct TheTest
         Data<R> resC = v_min(a, b), resD = v_max(a, b);
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]);
             EXPECT_EQ(std::max(dataA[i], dataB[i]), resD[i]);
         }
@@ -672,6 +705,7 @@ template<typename R> struct TheTest
         const u_type mask = std::numeric_limits<LaneType>::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0;
         for (int i = 0; i < Ru::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             u_type uA = dataA[i] ^ mask;
             u_type uB = dataB[i] ^ mask;
             EXPECT_EQ(uA > uB ? uA - uB : uB - uA, resC[i]);
@@ -691,6 +725,7 @@ template<typename R> struct TheTest
         Data<R> resC = v_absdiff(a, b);
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i] > dataB[i] ? dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]);
         }
         return *this;
@@ -744,6 +779,7 @@ template<typename R> struct TheTest
         Data<R> resF = f;
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             int_type m2 = dataB.as_int(i);
             EXPECT_EQ((dataD.as_int(i) & m2) | (dataE.as_int(i) & ~m2), resF.as_int(i));
         }
@@ -776,6 +812,7 @@ template<typename R> struct TheTest
         const w_type add = (w_type)1 << (s - 1);
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(pack_saturate_cast<LaneType>(dataA[i]), resC[i]);
             EXPECT_EQ(pack_saturate_cast<LaneType>(dataB[i]), resC[i + n]);
             EXPECT_EQ(pack_saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
@@ -816,6 +853,7 @@ template<typename R> struct TheTest
         const w_type add = (w_type)1 << (s - 1);
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(pack_saturate_cast<LaneType>(dataA[i]), resC[i]);
             EXPECT_EQ(pack_saturate_cast<LaneType>(dataB[i]), resC[i + n]);
             EXPECT_EQ(pack_saturate_cast<LaneType>((dataA[i] + add) >> s), resD[i]);
@@ -845,6 +883,7 @@ template<typename R> struct TheTest
         const int n = R::nlanes/2;
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(dataA[i], resC[i*2]);
             EXPECT_EQ(dataB[i], resC[i*2+1]);
             EXPECT_EQ(dataA[i+n], resD[i*2]);
@@ -876,6 +915,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             if (i + s >= R::nlanes)
                 EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]);
             else
@@ -901,6 +941,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             if (i + s >= R::nlanes)
             {
                 EXPECT_EQ((LaneType)0, resC[i]);
@@ -940,6 +981,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(cvRound(data1[i]), resB[i]);
             EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]);
             EXPECT_EQ(cvFloor(data1[i]), resD[i]);
@@ -964,6 +1006,7 @@ template<typename R> struct TheTest
         int n = std::min<int>(Rt::nlanes, R::nlanes);
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
         }
         return *this;
@@ -983,10 +1026,12 @@ template<typename R> struct TheTest
         int n = std::min<int>(Rt::nlanes, R::nlanes);
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]);
         }
         for (int i = 0; i < n; ++i)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ((typename Rt::lane_type)dataA[i+n], resC[i]);
         }
 #endif
@@ -1006,6 +1051,7 @@ template<typename R> struct TheTest
         {
             for (int j = i; j < i + 4; ++j)
             {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                 LaneType val = dataV[i]     * dataA[j]
                              + dataV[i + 1] * dataB[j]
                              + dataV[i + 2] * dataC[j]
@@ -1019,6 +1065,7 @@ template<typename R> struct TheTest
         {
             for (int j = i; j < i + 4; ++j)
             {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                 LaneType val = dataV[i]     * dataA[j]
                              + dataV[i + 1] * dataB[j]
                              + dataV[i + 2] * dataC[j]
@@ -1045,6 +1092,7 @@ template<typename R> struct TheTest
         {
             for (int j = 0; j < 4; ++j)
             {
+                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
                 EXPECT_EQ(dataA[i + j], res[j][i]);
                 EXPECT_EQ(dataB[i + j], res[j][i + 1]);
                 EXPECT_EQ(dataC[i + j], res[j][i + 2]);
@@ -1066,6 +1114,7 @@ template<typename R> struct TheTest
 
         for (int i = 0; i < R::nlanes; i += 4)
         {
+            SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]);
             EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]);
             EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]);
@@ -1121,7 +1170,304 @@ template<typename R> struct TheTest
 
 };
 
+
+#if 1
+#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION);
 #endif
 
-CV_CPU_OPTIMIZATION_NAMESPACE_END
-}} // namespace
+//=============  8-bit integer =====================================================================
+
+void test_hal_intrin_uint8()  // Entry point for the v_uint8 checks: logs the SIMD width, then runs the chained TheTest<v_uint8> cases.
+{
+    DUMP_ENTRY(v_uint8);  // prints 8*sizeof(v_uint8) (register width in bits) and CV__TRACE_FUNCTION
+    TheTest<v_uint8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+
+#if CV_SIMD_WIDTH == 32  // the larger template arguments below (up to 31) are compiled only for this width
+    TheTest<v_uint8>()
+        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
+        .test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
+        .test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
+        .test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
+        ;
+#endif
+}
+
+void test_hal_intrin_int8()  // Entry point for the v_int8 checks: logs the SIMD width, then runs the chained TheTest<v_int8> cases.
+{
+    DUMP_ENTRY(v_int8);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_int8>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_cmp()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        ;
+}
+
+//============= 16-bit integer =====================================================================
+
+void test_hal_intrin_uint16()  // Entry point for the v_uint16 checks: logs the SIMD width, then runs the chained TheTest<v_uint16> cases.
+{
+    DUMP_ENTRY(v_uint16);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_uint16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+void test_hal_intrin_int16()  // Entry point for the v_int16 checks: logs the SIMD width, then runs the chained TheTest<v_int16> cases.
+{
+    DUMP_ENTRY(v_int16);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_int16>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_addsub_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_dot_prod()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_abs()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        ;
+}
+
+//============= 32-bit integer =====================================================================
+
+void test_hal_intrin_uint32()  // Entry point for the v_uint32 checks: logs the SIMD width, then runs the chained TheTest<v_uint32> cases.
+{
+    DUMP_ENTRY(v_uint32);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_uint32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_mul_expand()
+        .test_cmp()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_popcount()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_transpose()
+        ;
+}
+
+void test_hal_intrin_int32()  // Entry point for the v_int32 checks: logs the SIMD width, then runs the chained TheTest<v_int32> cases.
+{
+    DUMP_ENTRY(v_int32);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_int32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_abs()
+        .test_cmp()
+        .test_popcount()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_min_max()
+        .test_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_unpack()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_float_cvt32()
+        .test_float_cvt64()
+        .test_transpose()
+        ;
+}
+
+//============= 64-bit integer =====================================================================
+
+void test_hal_intrin_uint64()  // Entry point for the v_uint64 checks: logs the SIMD width, then runs the chained TheTest<v_uint64> cases.
+{
+    DUMP_ENTRY(v_uint64);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_uint64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+void test_hal_intrin_int64()  // Entry point for the v_int64 checks: logs the SIMD width, then runs the chained TheTest<v_int64> cases.
+{
+    DUMP_ENTRY(v_int64);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_int64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+}
+
+//============= Floating point =====================================================================
+void test_hal_intrin_float32()  // Entry point for the v_float32 checks: logs the SIMD width, then runs the chained TheTest<v_float32> cases.
+{
+    DUMP_ENTRY(v_float32);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+    TheTest<v_float32>()
+        .test_loadstore()
+        .test_interleave()
+        .test_interleave_2channel()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_reduce()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt64()
+        .test_matmul()
+        .test_transpose()
+        .test_reduce_sum4()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        ;
+
+#if CV_SIMD_WIDTH == 32  // extract/rotate with arguments 4..7 are compiled only for this width
+    TheTest<v_float32>()
+        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
+        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
+        ;
+#endif
+}
+
+void test_hal_intrin_float64()  // Entry point for the v_float64 checks; only logs when CV_SIMD_64F is 0.
+{
+    DUMP_ENTRY(v_float64);  // sits outside the CV_SIMD_64F guard; the macro never uses its argument (it reports sizeof(v_uint8))
+#if CV_SIMD_64F
+    TheTest<v_float64>()
+        .test_loadstore()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_cmp()
+        .test_sqrt_abs()
+        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_unpack()
+        .test_float_math()
+        .test_float_cvt32()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        ;
+
+#if CV_SIMD_WIDTH == 32  // extract/rotate with arguments 2..3 are compiled only for this width
+    TheTest<v_float64>()
+        .test_extract<2>().test_extract<3>()
+        .test_rotate<2>().test_rotate<3>()
+        ;
+#endif // CV_SIMD_WIDTH == 32
+
+#endif
+}
+
+#if CV_FP16  // the whole function exists only when CV_FP16 is non-zero
+void test_hal_intrin_float16()  // Entry point for the v_float16 checks (load/store and conversion cases).
+{
+    DUMP_ENTRY(v_float16);  // the macro ignores its argument; it always reports sizeof(v_uint8)
+#if CV_SIMD_WIDTH > 16  // fp16 cases are compiled only above width 16; otherwise this function just logs
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+#endif
+}
+#endif
+
+/*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16
+void test_hal_intrin_float16()
+{
+    TheTest<v_float16>()
+        .test_loadstore_fp16()
+        .test_float_cvt_fp16()
+        ;
+}
+#endif*/
+
+#endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+//CV_CPU_OPTIMIZATION_NAMESPACE_END
+//}}} // namespace
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index d67e53f506..4a8c347c68 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -1814,4 +1814,62 @@ BIGDATA_TEST(Mat, push_back_regression_4158)  // memory usage: ~10.6 Gb
     }
 }
 
+
+TEST(Core_Merge, hang_12171)  // merge two 1-channel ROIs into a 2-channel ROI; presumably a regression test for issue 12171
+{
+    Mat src1(4, 24, CV_8UC1, Scalar::all(1));
+    Mat src2(4, 24, CV_8UC1, Scalar::all(2));
+    Rect src_roi(0, 0, 23, 4);  // 23 of the 24 columns, all 4 rows
+    Mat src_channels[2] = { src1(src_roi), src2(src_roi) };
+    Mat dst(4, 24, CV_8UC2, Scalar::all(5));
+    Rect dst_roi(1, 0, 23, 4);  // destination ROI starts at column 1, so column 0 of dst is outside it
+    cv::merge(src_channels, 2, dst(dst_roi));
+    EXPECT_EQ(5, dst.ptr<uchar>()[0]);  // row 0, column 0, channel 0: outside the ROI, keeps the fill value
+    EXPECT_EQ(5, dst.ptr<uchar>()[1]);  // row 0, column 0, channel 1
+    EXPECT_EQ(1, dst.ptr<uchar>()[2]);  // row 0, column 1, channel 0: comes from src1
+    EXPECT_EQ(2, dst.ptr<uchar>()[3]);  // row 0, column 1, channel 1: comes from src2
+    EXPECT_EQ(5, dst.ptr<uchar>(1)[0]);  // same four checks on row 1
+    EXPECT_EQ(5, dst.ptr<uchar>(1)[1]);
+    EXPECT_EQ(1, dst.ptr<uchar>(1)[2]);
+    EXPECT_EQ(2, dst.ptr<uchar>(1)[3]);
+}
+
+TEST(Core_Split, hang_12171)  // split a 2-channel ROI into two 1-channel ROIs; presumably a regression test for issue 12171
+{
+    Mat src(4, 24, CV_8UC2, Scalar(1,2,3,4));  // only the first two scalar values apply to a 2-channel matrix
+    Rect src_roi(0, 0, 23, 4);  // 23 of the 24 columns, all 4 rows
+    Mat dst1(4, 24, CV_8UC1, Scalar::all(5));
+    Mat dst2(4, 24, CV_8UC1, Scalar::all(10));
+    Rect dst_roi(0, 0, 23, 4);
+    Mat dst[2] = { dst1(dst_roi), dst2(dst_roi) };  // ROI headers over dst1/dst2; results are checked through the parent matrices
+    cv::split(src(src_roi), dst);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[0]);  // row 0: channel 0 values land in dst1
+    EXPECT_EQ(1, dst1.ptr<uchar>()[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[0]);  // row 0: channel 1 values land in dst2
+    EXPECT_EQ(2, dst2.ptr<uchar>()[1]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[0]);  // same checks on row 1
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[1]);
+}
+
+TEST(Core_Split, crash_12171)  // same scenario as Core_Split.hang_12171 but with 40-column matrices and a 39-column ROI
+{
+    Mat src(4, 40, CV_8UC2, Scalar(1,2,3,4));  // only the first two scalar values apply to a 2-channel matrix
+    Rect src_roi(0, 0, 39, 4);  // 39 of the 40 columns, all 4 rows
+    Mat dst1(4, 40, CV_8UC1, Scalar::all(5));
+    Mat dst2(4, 40, CV_8UC1, Scalar::all(10));
+    Rect dst_roi(0, 0, 39, 4);
+    Mat dst[2] = { dst1(dst_roi), dst2(dst_roi) };  // ROI headers over dst1/dst2; results are checked through the parent matrices
+    cv::split(src(src_roi), dst);
+    EXPECT_EQ(1, dst1.ptr<uchar>()[0]);  // row 0: channel 0 values land in dst1
+    EXPECT_EQ(1, dst1.ptr<uchar>()[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>()[0]);  // row 0: channel 1 values land in dst2
+    EXPECT_EQ(2, dst2.ptr<uchar>()[1]);
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[0]);  // same checks on row 1
+    EXPECT_EQ(1, dst1.ptr<uchar>(1)[1]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[0]);
+    EXPECT_EQ(2, dst2.ptr<uchar>(1)[1]);
+}
+
 }} // namespace
diff --git a/modules/core/test/test_precomp.hpp b/modules/core/test/test_precomp.hpp
index 9787586156..a82f5cc12c 100644
--- a/modules/core/test/test_precomp.hpp
+++ b/modules/core/test/test_precomp.hpp
@@ -11,6 +11,5 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/private.hpp"
 #include "opencv2/core/hal/hal.hpp"
-#include "opencv2/core/hal/intrin.hpp"
 
 #endif
diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu
index f8e02cb039..fe26c7be0e 100644
--- a/modules/cudafilters/src/cuda/median_filter.cu
+++ b/modules/cudafilters/src/cuda/median_filter.cu
@@ -246,7 +246,7 @@ namespace cv { namespace cuda { namespace device
         }
         __syncthreads();
 
-        // Fot all remaining rows in the median filter, add the values to the the histogram
+        // For all remaining rows in the median filter, add the values to the histogram
         for (int j=threadIdx.x; j<cols; j+=blockDim.x){
             for(int i=initStartRow; i<initStopRow; i++){
                     int pos=::min(i,rows-1);
diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp
index ee9ce5ac0a..2bc071813e 100644
--- a/modules/cudaimgproc/src/mssegmentation.cpp
+++ b/modules/cudaimgproc/src/mssegmentation.cpp
@@ -342,7 +342,7 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp,
         }
     }
 
-    // Sort all graph's edges connecting different components (in asceding order)
+    // Sort all graph's edges connecting different components (in ascending order)
     std::sort(edges.begin(), edges.end());
 
     // Exclude small components (starting from the nearest couple)
diff --git a/modules/cudawarping/test/test_warp_affine.cpp b/modules/cudawarping/test/test_warp_affine.cpp
index f4a08ab928..d26a5fdeb7 100644
--- a/modules/cudawarping/test/test_warp_affine.cpp
+++ b/modules/cudawarping/test/test_warp_affine.cpp
@@ -48,7 +48,7 @@ namespace opencv_test { namespace {
 
 namespace
 {
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    cv::Mat createTransformMatrix(cv::Size srcSize, double angle)
     {
         cv::Mat M(2, 3, CV_64FC1);
 
@@ -80,7 +80,7 @@ PARAM_TEST_CASE(BuildWarpAffineMaps, cv::cuda::DeviceInfo, cv::Size, Inverse)
 
 CUDA_TEST_P(BuildWarpAffineMaps, Accuracy)
 {
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 4);
     cv::Mat src = randomMat(randomSize(200, 400), CV_8UC1);
 
     cv::cuda::GpuMat xmap, ymap;
@@ -207,7 +207,7 @@ PARAM_TEST_CASE(WarpAffine, cv::cuda::DeviceInfo, cv::Size, MatType, Inverse, In
 CUDA_TEST_P(WarpAffine, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 3);
     int flags = interpolation;
     if (inverse)
         flags |= cv::WARP_INVERSE_MAP;
@@ -257,7 +257,7 @@ CUDA_TEST_P(WarpAffineNPP, Accuracy)
     cv::Mat src = readImageType("stereobp/aloe-L.png", type);
     ASSERT_FALSE(src.empty());
 
-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
+    cv::Mat M = createTransformMatrix(src.size(), CV_PI / 4);
     int flags = interpolation;
     if (inverse)
         flags |= cv::WARP_INVERSE_MAP;
diff --git a/modules/cudawarping/test/test_warp_perspective.cpp b/modules/cudawarping/test/test_warp_perspective.cpp
index 046a334764..7c5c758892 100644
--- a/modules/cudawarping/test/test_warp_perspective.cpp
+++ b/modules/cudawarping/test/test_warp_perspective.cpp
@@ -48,7 +48,7 @@ namespace opencv_test { namespace {
 
 namespace
 {
-    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)
+    cv::Mat createTransformMatrix(cv::Size srcSize, double angle)
     {
         cv::Mat M(3, 3, CV_64FC1);
 
@@ -81,7 +81,7 @@ PARAM_TEST_CASE(BuildWarpPerspectiveMaps, cv::cuda::DeviceInfo, cv::Size, Invers
 
 CUDA_TEST_P(BuildWarpPerspectiveMaps, Accuracy)
 {
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 4);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 4);
 
     cv::cuda::GpuMat xmap, ymap;
     cv::cuda::buildWarpPerspectiveMaps(M, inverse, size, xmap, ymap);
@@ -210,7 +210,7 @@ PARAM_TEST_CASE(WarpPerspective, cv::cuda::DeviceInfo, cv::Size, MatType, Invers
 CUDA_TEST_P(WarpPerspective, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
-    cv::Mat M = createTransfomMatrix(size, CV_PI / 3);
+    cv::Mat M = createTransformMatrix(size, CV_PI / 3);
     int flags = interpolation;
     if (inverse)
         flags |= cv::WARP_INVERSE_MAP;
@@ -260,7 +260,7 @@ CUDA_TEST_P(WarpPerspectiveNPP, Accuracy)
     cv::Mat src = readImageType("stereobp/aloe-L.png", type);
     ASSERT_FALSE(src.empty());
 
-    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
+    cv::Mat M = createTransformMatrix(src.size(), CV_PI / 4);
     int flags = interpolation;
     if (inverse)
         flags |= cv::WARP_INVERSE_MAP;
diff --git a/modules/cudev/test/test_warp.cu b/modules/cudev/test/test_warp.cu
index eda1694860..72d0643148 100644
--- a/modules/cudev/test/test_warp.cu
+++ b/modules/cudev/test/test_warp.cu
@@ -199,7 +199,7 @@ TEST(Resize, Downscale)
 
 // warpAffine & warpPerspective
 
-Mat createAffineTransfomMatrix(Size srcSize, float angle, bool perspective)
+Mat createAffineTransformMatrix(Size srcSize, float angle, bool perspective)
 {
     cv::Mat M(perspective ? 3 : 2, 3, CV_32FC1);
 
@@ -220,7 +220,7 @@ TEST(WarpAffine, Rotation)
     const Size size = randomSize(100, 400);
 
     Mat src = randomMat(size, CV_32FC1, 0, 1);
-    Mat M = createAffineTransfomMatrix(size, static_cast<float>(CV_PI / 4), false);
+    Mat M = createAffineTransformMatrix(size, static_cast<float>(CV_PI / 4), false);
 
     GpuMat_<float> d_src(src);
     GpuMat_<float> d_M;
@@ -240,7 +240,7 @@ TEST(WarpPerspective, Rotation)
     const Size size = randomSize(100, 400);
 
     Mat src = randomMat(size, CV_32FC1, 0, 1);
-    Mat M = createAffineTransfomMatrix(size, static_cast<float>(CV_PI / 4), true);
+    Mat M = createAffineTransformMatrix(size, static_cast<float>(CV_PI / 4), true);
 
     GpuMat_<float> d_src(src);
     GpuMat_<float> d_M;
diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 9ba180c7d1..b5416142c9 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -489,7 +489,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         static Ptr<EltwiseLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS BatchNormLayer : public Layer
+    class CV_EXPORTS BatchNormLayer : public ActivationLayer
     {
     public:
         bool hasWeights, hasBias;
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index 16138cb99f..c6cef9f4f7 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -258,6 +258,17 @@ PERF_TEST_P_(DNNTestNetwork, FastNeuralStyle_eccv16)
     processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", "", Mat(cv::Size(320, 240), CV_32FC3));
 }
 
+PERF_TEST_P_(DNNTestNetwork, Inception_v2_Faster_RCNN)  // perf case for the Faster R-CNN Inception v2 model files named below
+{
+    if (backend == DNN_BACKEND_HALIDE ||  // skipped on Halide entirely
+        (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||  // Inference Engine runs on the CPU target only
+        (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))  // OpenCV backend is skipped for the OpenCL FP16 target
+        throw SkipTestException("");
+    processNet("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb",
+               "dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt", "",
+               Mat(cv::Size(800, 600), CV_32FC3));  // 800x600 3-channel float input
+}
+
 const tuple<DNNBackend, DNNTarget> testCases[] = {
 #ifdef HAVE_HALIDE
     tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 5920edc85e..43ad3d6d42 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1408,7 +1408,7 @@ struct Net::Impl
             bool fused = ld.skip;
 
             Ptr<Layer> layer = ld.layerInstance;
-            if (!layer->supportBackend(preferableBackend))
+            if (!fused && !layer->supportBackend(preferableBackend))
             {
                 addInfEngineNetOutputs(ld);
                 net = Ptr<InfEngineBackendNet>();
@@ -1471,6 +1471,8 @@ struct Net::Impl
             {
                 node = layer->initInfEngine(ld.inputBlobsWrappers);
             }
+            else if (node.empty())
+                continue;
 
             CV_Assert(!node.empty());
             ld.backendNodes[preferableBackend] = node;
@@ -1715,40 +1717,41 @@ struct Net::Impl
                 if (preferableBackend != DNN_BACKEND_OPENCV)
                     continue;  // Go to the next layer.
 
-                // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
-                if ( !IS_DNN_OPENCL_TARGET(preferableTarget) ||
-                     (IS_DNN_OPENCL_TARGET(preferableTarget) &&
-                         nextData &&
-                        ((nextData->type == "ReLU") ||
-                         (nextData->type == "ChannelsPReLU") ||
-                         (nextData->type == "ReLU6") ||
-                         (nextData->type == "TanH") ||
-                         (nextData->type == "Power"))) )
+                while (nextData)
                 {
+                    // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
+                    if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                        nextData->type != "ReLU" &&
+                        nextData->type != "ChannelsPReLU" &&
+                        nextData->type != "ReLU6" &&
+                        nextData->type != "TanH" &&
+                        nextData->type != "Power")
+                        break;
 
-                    Ptr<ActivationLayer> nextActivLayer;
+                    Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+                    if (nextActivLayer.empty())
+                        break;
 
-                    if( nextData )
-                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-                    if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0
-                            && currLayer->setActivation(nextActivLayer) )
+                    if (currLayer->setActivation(nextActivLayer))
                     {
-                        LayerData *activData = nextData;
                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                        activData->skip = true;
+                        nextData->skip = true;
                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
-
-                        if ( IS_DNN_OPENCL_TARGET(preferableTarget) )
+                        if (nextData->consumers.size() == 1)
                         {
-                            if ( !activData->consumers.empty() )
-                            {
-                                nextData = &layers[activData->consumers[0].lid];
-                                lpNext = LayerPin(activData->consumers[0].lid, 0);
-                            }
+                            int nextLayerId = nextData->consumers[0].lid;
+                            nextData = &layers[nextLayerId];
+                            lpNext = LayerPin(nextLayerId, 0);
+                        }
+                        else
+                        {
+                            nextData = 0;
+                            break;
                         }
                     }
+                    else
+                        break;
                 }
 
                 // fuse convolution layer followed by eltwise + relu
@@ -2050,10 +2053,10 @@ struct Net::Impl
         TickMeter tm;
         tm.start();
 
-        if (preferableBackend == DNN_BACKEND_OPENCV ||
-            !layer->supportBackend(preferableBackend))
+        if( !ld.skip )
         {
-            if( !ld.skip )
+            std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
+            if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
             {
                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
                 {
@@ -2196,24 +2199,25 @@ struct Net::Impl
                 }
             }
             else
-                tm.reset();
-        }
-        else if (!ld.skip)
-        {
-            Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
-            if (preferableBackend == DNN_BACKEND_HALIDE)
             {
-                forwardHalide(ld.outputBlobsWrappers, node);
-            }
-            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
-            {
-                forwardInfEngine(node);
-            }
-            else
-            {
-                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
+                Ptr<BackendNode> node = it->second;
+                CV_Assert(!node.empty());
+                if (preferableBackend == DNN_BACKEND_HALIDE)
+                {
+                    forwardHalide(ld.outputBlobsWrappers, node);
+                }
+                else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
+                {
+                    forwardInfEngine(node);
+                }
+                else
+                {
+                    CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
+                }
             }
         }
+        else
+            tm.reset();
 
         tm.stop();
         layersTimings[ld.id] = tm.getTimeTicks();
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 3b472328c8..1ced532fdc 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -268,6 +268,36 @@ public:
         }
     }
 
+    void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE  // dst = w[cn] * src + b[cn] over len elements for each channel cn in [cn0, cn1)
+    {
+        for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )  // both pointers advance by planeSize per channel
+        {
+            int i = 0;
+            float w = weights_.at<float>(cn);
+            float b = bias_.at<float>(cn);
+#if CV_SIMD128
+            v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);  // broadcast the per-channel w and b to every lane
+            for( ; i <= len - 16; i += 16 )  // 16 floats per iteration: four 4-lane vectors
+            {
+                v_float32x4 x0 = v_load(srcptr + i);
+                v_float32x4 x1 = v_load(srcptr + i + 4);
+                v_float32x4 x2 = v_load(srcptr + i + 8);
+                v_float32x4 x3 = v_load(srcptr + i + 12);
+                x0 = v_muladd(x0, wV, bV);  // v_muladd takes vector operands: pass the broadcast wV/bV, not the scalars w/b
+                x1 = v_muladd(x1, wV, bV);
+                x2 = v_muladd(x2, wV, bV);
+                x3 = v_muladd(x3, wV, bV);
+                v_store(dstptr + i, x0);
+                v_store(dstptr + i + 4, x1);
+                v_store(dstptr + i + 8, x2);
+                v_store(dstptr + i + 12, x3);
+            }
+#endif
+            for( ; i < len; i++ )  // scalar tail; handles the whole range when CV_SIMD128 is off
+                dstptr[i] = w * srcptr[i] + b;
+        }
+    }
+    }
+
     virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
     {
         switch (node->backendId)
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index d08dec548b..08760ab49a 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -296,6 +296,9 @@ public:
 
     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
     {
+        if (!activ.empty() && !layer.empty())
+            return false;
+
         activ = layer;
         if (activ.empty())
             reluslope.clear();
diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp
index 7473751707..42a6a6c715 100644
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -196,7 +196,7 @@ public:
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
         return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized;
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && !_locPredTransposed && _bboxesNormalized && !_clip;  // Inference Engine is additionally rejected when _clip is set
     }
 
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 442bfa7aff..3a2c0ddb3f 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -452,8 +452,13 @@ public:
 
     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
     {
-        activ = layer;
-        return !activ.empty();
+        if (activ.empty() || layer.empty())  // accept only when nothing is attached yet, or when the caller passes an empty layer to clear it
+        {
+            activ = layer;
+            return !activ.empty();  // true only if a non-empty activation was stored
+        }
+        else
+            return false;  // an activation is already attached: refuse a second one and keep the first
     }
 
     Ptr<ActivationLayer> activ;
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index dfaa58c7ed..d17ca27383 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -135,8 +135,13 @@ public:
 
     virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
     {
-        activ = layer;
-        return !activ.empty();
+        if (activ.empty() || layer.empty())  // accept only when nothing is attached yet, or when the caller passes an empty layer to clear it
+        {
+            activ = layer;
+            return !activ.empty();  // true only if a non-empty activation was stored
+        }
+        else
+            return false;  // an activation is already attached: refuse a second one and keep the first
     }
 
     class FullyConnected : public ParallelLoopBody
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 9e4f0ac39c..6a2c6f1dd9 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -42,6 +42,7 @@
 
 #include "../precomp.hpp"
 #include "layers_common.hpp"
+#include "../op_inf_engine.hpp"
 #include <opencv2/dnn/shape_utils.hpp>
 
 #ifdef HAVE_OPENCL
@@ -66,27 +67,25 @@ public:
         fuse_batch_norm = false;
         fuse_relu = false;
         relu_slope = 0.f;
+        zeroDev = false;
     }
 
     Mat scale, shift;
     bool fuse_batch_norm;
 
-    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
-    {
-        if (!fuse_batch_norm)
-        {
-            top->getScaleShift(scale, shift);
-            fuse_batch_norm = !scale.empty() || !shift.empty();
-            return fuse_batch_norm;
-        }
-        return false;
-    }
-
     Ptr<ReLULayer> activ_relu;
     float relu_slope;
     bool fuse_relu;
+    bool zeroDev;  // TODO: Not taken into account by Intel's Inference Engine backend.
     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
     {
+        if (!layer.empty() && !fuse_relu && !fuse_batch_norm)
+        {
+            layer->getScaleShift(scale, shift);
+            fuse_batch_norm = !scale.empty() || !shift.empty();
+            return fuse_batch_norm;
+        }
+
         if (!layer.empty() && preferableTarget == DNN_TARGET_OPENCL)
         {
             activ_relu = layer.dynamicCast<ReLULayer>();
@@ -97,6 +96,23 @@ public:
         return fuse_relu;
     }
 
+    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) CV_OVERRIDE  // sets zeroDev from the shape of inputs[0]; outputs is not used here
+    {
+        int splitDim = (acrossChannels) ? 1 : 2;  // number of leading dimensions folded into newRows
+        int i, newRows = 1;
+        for( i = 0; i < splitDim; i++ )
+            newRows *= inputs[0]->size[i];  // product of the first splitDim dimensions
+        zeroDev = inputs[0]->total() == newRows;  // true when the remaining dimensions multiply to 1; NOTE(review): compares total() with an int, confirm no sign-compare warning
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE  // reports whether this layer can run on the given backend
+    {
+        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
+            return !zeroDev && (preferableTarget == DNN_TARGET_CPU || eps <= 1e-7f);  // never when zeroDev is set; non-CPU targets also need eps <= 1e-7
+        else
+            return backendId == DNN_BACKEND_OPENCV;  // otherwise only the OpenCV backend is accepted
+    }
+
 #ifdef HAVE_OPENCL
     bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
     {
@@ -324,6 +340,22 @@ public:
         }
     }
 
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE  // builds an Inference Engine "MVN" node; the input wrappers are not used
+    {
+#ifdef HAVE_INF_ENGINE
+        InferenceEngine::LayerParams lp;
+        lp.name = name;
+        lp.type = "MVN";
+        lp.precision = InferenceEngine::Precision::FP32;
+        std::shared_ptr<InferenceEngine::MVNLayer> ieLayer(new InferenceEngine::MVNLayer(lp));
+        ieLayer->params["across_channels"] = acrossChannels ? "1" : "0";  // flags are passed as "1"/"0" strings
+        ieLayer->params["normalize_variance"] = normVariance ? "1" : "0";
+        ieLayer->params["eps"] = format("%f", eps);  // NOTE(review): "%f" keeps six decimals, so a very small eps is written as 0.000000; confirm this is intended
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#endif  // HAVE_INF_ENGINE
+        return Ptr<BackendNode>();  // empty node when built without HAVE_INF_ENGINE
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 6cfa78c911..3b53805e1e 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -48,9 +48,8 @@ public:
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
-        return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_HALIDE && haveHalide() ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
+        return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE ||  // OpenCV and Halide backends are accepted unconditionally
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && axis == 1;  // && binds tighter than ||, so the axis == 1 requirement applies to Inference Engine only
     }
 
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index e4c723e3bf..2b0685826f 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -111,7 +111,7 @@ public:
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
         return backendId == DNN_BACKEND_OPENCV ||
-               backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1;
+               backendId == DNN_BACKEND_INFERENCE_ENGINE && sliceRanges.size() == 1 && sliceRanges[0].size() == 4;  // Inference Engine: exactly one slice, and its range list must have 4 entries
     }
 
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index 5f50289847..eefd321bb3 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -307,15 +307,17 @@ public:
         return Ptr<BackendNode>();
     }
 
-    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
 #ifdef HAVE_INF_ENGINE
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+
         InferenceEngine::LayerParams lp;
         lp.name = name;
         lp.type = "SoftMax";
         lp.precision = InferenceEngine::Precision::FP32;
         std::shared_ptr<InferenceEngine::SoftMaxLayer> ieLayer(new InferenceEngine::SoftMaxLayer(lp));
-        ieLayer->axis = axisRaw;
+        ieLayer->axis = clamp(axisRaw, input->dims.size());
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #endif  // HAVE_INF_ENGINE
         return Ptr<BackendNode>();
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index adeb38574e..c60b8fcdbb 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -248,39 +248,38 @@ convolve_simd(
 
   int curr_y = or * STRIDE_Y;
   int curr_x = oc * STRIDE_X + lid;
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-  int saved_y = curr_y;
-#endif
+
   int in_addr = input_batch_offset
                 +  (curr_y - INPUT_PAD_H) * INPUT_WIDTH          // y tile offset
                 +   curr_x - INPUT_PAD_W;                        // x tile offset
 
+  const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;
+
   Dtype in_buf[INVEC_SIZE];
 
   for(int kd = 0; kd < INPUT_DEPTH; kd++)
   {
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+    const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
     int in_offset = in_addr;
     __attribute__((opencl_unroll_hint(INVEC_SIZE)))
-    for (int reg = 0; reg < INVEC_SIZE; reg++)
+    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
     {
-        in_buf[reg] = inputs[in_offset];
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-        if (!(curr_y >= INPUT_PAD_H && curr_y < INPUT_HEIGHT + INPUT_PAD_H &&
-              curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W))
-        {
-          in_buf[reg] = 0;
-        }
-#endif
-        curr_y += 1;
-        in_offset += INPUT_WIDTH;
+      Dtype input = inputs[clamp(in_offset, 0, in_limit)];
+      int cy = curr_y + reg;
+      in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
     }
+#else
+    int in_offset = in_addr;
+    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
+    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
+    {
+      in_buf[reg] = inputs[min(in_offset, in_limit)];
+    }
+#endif
 
     in_addr += INPUT_PITCH;
 
-#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
-    curr_y = saved_y;
-#endif
-
     Dtype weight_buf[WEIGHT_PREF];
     int w_idx=0;
 
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index fcca577094..66c03a777e 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -716,6 +716,8 @@ void TFImporter::populateNet(Net dstNet)
 
     // find all Const layers for params
     std::map<String, int> value_id;
+    // A map with constant blobs which are shared between multiple layers.
+    std::map<String, Mat> sharedWeights;
     addConstNodes(netBin, value_id, layers_to_ignore);
     addConstNodes(netTxt, value_id, layers_to_ignore);
 
@@ -805,51 +807,64 @@ void TFImporter::populateNet(Net dstNet)
                 }
             }
 
-            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id);
-            kernelFromTensor(kernelTensor, layerParams.blobs[0]);
-            releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
-            int* kshape = layerParams.blobs[0].size.p;
-            const int outCh = kshape[0];
-            const int inCh = kshape[1];
-            const int height = kshape[2];
-            const int width = kshape[3];
-            if (type == "DepthwiseConv2dNative")
+            int kernelTensorInpId = -1;
+            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernelTensorInpId);
+            const String kernelTensorName = layer.input(kernelTensorInpId);
+            std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
+            if (sharedWeightsIt == sharedWeights.end())
             {
-                CV_Assert(!locPredTransposed);
-                const int chMultiplier = kshape[0];
+                kernelFromTensor(kernelTensor, layerParams.blobs[0]);
+                releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
 
-                Mat copy = layerParams.blobs[0].clone();
-                float* src = (float*)copy.data;
-                float* dst = (float*)layerParams.blobs[0].data;
-                for (int i = 0; i < chMultiplier; ++i)
-                    for (int j = 0; j < inCh; ++j)
-                        for (int s = 0; s < height * width; ++s)
-                            {
-                                int src_i = (i * inCh + j) * height * width + s;
-                                int dst_i = (j * chMultiplier + i) * height* width + s;
-                                dst[dst_i] = src[src_i];
-                            }
-                // TODO Use reshape instead
-                kshape[0] = inCh * chMultiplier;
-                kshape[1] = 1;
-                size_t* kstep = layerParams.blobs[0].step.p;
-                kstep[0] = kstep[1]; // fix steps too
-            }
-            layerParams.set("kernel_h", height);
-            layerParams.set("kernel_w", width);
-            layerParams.set("num_output", outCh);
-
-            // Shuffle output channels from yxYX to xyXY.
-            if (locPredTransposed)
-            {
-                const int slice = height * width * inCh;
-                for (int i = 0; i < outCh; i += 2)
+                int* kshape = layerParams.blobs[0].size.p;
+                const int outCh = kshape[0];
+                const int inCh = kshape[1];
+                const int height = kshape[2];
+                const int width = kshape[3];
+                if (type == "DepthwiseConv2dNative")
                 {
-                    cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
-                    cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
-                    std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                    CV_Assert(!locPredTransposed);
+                    const int chMultiplier = kshape[0];
+
+                    Mat copy = layerParams.blobs[0].clone();
+                    float* src = (float*)copy.data;
+                    float* dst = (float*)layerParams.blobs[0].data;
+                    for (int i = 0; i < chMultiplier; ++i)
+                        for (int j = 0; j < inCh; ++j)
+                            for (int s = 0; s < height * width; ++s)
+                                {
+                                    int src_i = (i * inCh + j) * height * width + s;
+                                    int dst_i = (j * chMultiplier + i) * height* width + s;
+                                    dst[dst_i] = src[src_i];
+                                }
+                    // TODO Use reshape instead
+                    kshape[0] = inCh * chMultiplier;
+                    kshape[1] = 1;
+                    size_t* kstep = layerParams.blobs[0].step.p;
+                    kstep[0] = kstep[1]; // fix steps too
                 }
+
+                // Shuffle output channels from yxYX to xyXY.
+                if (locPredTransposed)
+                {
+                    const int slice = height * width * inCh;
+                    for (int i = 0; i < outCh; i += 2)
+                    {
+                        cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
+                        cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
+                        std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                    }
+                }
+                sharedWeights[kernelTensorName] = layerParams.blobs[0];
             }
+            else
+            {
+                layerParams.blobs[0] = sharedWeightsIt->second;
+            }
+
+            layerParams.set("kernel_h", layerParams.blobs[0].size[2]);
+            layerParams.set("kernel_w", layerParams.blobs[0].size[3]);
+            layerParams.set("num_output", layerParams.blobs[0].size[0]);
 
             setStrides(layerParams, layer);
             setPadding(layerParams, layer);
@@ -954,6 +969,13 @@ void TFImporter::populateNet(Net dstNet)
         {
             CV_Assert(layer.input_size() == 2);
 
+            // For the object detection networks, TensorFlow Object Detection API
+            // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
+            // order. We can manage it at DetectionOutput layer parsing predictions
+            // or shuffle last Faster-RCNN's matmul weights.
+            bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
+                                     getLayerAttr(layer, "loc_pred_transposed").b();
+
             layerParams.set("bias_term", false);
             layerParams.blobs.resize(1);
 
@@ -970,6 +992,17 @@ void TFImporter::populateNet(Net dstNet)
                 blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
                 ExcludeLayer(net, weights_layer_index, 0, false);
                 layers_to_ignore.insert(next_layers[0].first);
+
+                if (locPredTransposed)
+                {
+                    const int numWeights = layerParams.blobs[1].total();
+                    float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
+                    CV_Assert(numWeights % 4 == 0);
+                    for (int i = 0; i < numWeights; i += 2)
+                    {
+                        std::swap(biasData[i], biasData[i + 1]);
+                    }
+                }
             }
 
             int kernel_blob_index = -1;
@@ -983,6 +1016,16 @@ void TFImporter::populateNet(Net dstNet)
             }
 
             layerParams.set("num_output", layerParams.blobs[0].size[0]);
+            if (locPredTransposed)
+            {
+                CV_Assert(layerParams.blobs[0].dims == 2);
+                for (int i = 0; i < layerParams.blobs[0].size[0]; i += 2)
+                {
+                    cv::Mat src = layerParams.blobs[0].row(i);
+                    cv::Mat dst = layerParams.blobs[0].row(i + 1);
+                    std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+                }
+            }
 
             int id = dstNet.addLayer(name, "InnerProduct", layerParams);
             layer_id[name] = id;
@@ -1010,6 +1053,7 @@ void TFImporter::populateNet(Net dstNet)
                 layer_id[permName] = permId;
                 connect(layer_id, dstNet, inpId, permId, 0);
                 inpId = Pin(permName);
+                inpLayout = DATA_LAYOUT_NCHW;
             }
             else if (newShape.total() == 4 && inpLayout == DATA_LAYOUT_NHWC)
             {
@@ -1024,7 +1068,7 @@ void TFImporter::populateNet(Net dstNet)
 
             // one input only
             connect(layer_id, dstNet, inpId, id, 0);
-            data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : DATA_LAYOUT_UNKNOWN;
+            data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
         }
         else if (type == "Flatten" || type == "Squeeze")
         {
@@ -1696,41 +1740,6 @@ void TFImporter::populateNet(Net dstNet)
             connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
             data_layouts[name] = DATA_LAYOUT_UNKNOWN;
         }
-        else if (type == "DetectionOutput")
-        {
-            // op: "DetectionOutput"
-            // input_0: "locations"
-            // input_1: "classifications"
-            // input_2: "prior_boxes"
-            if (hasLayerAttr(layer, "num_classes"))
-                layerParams.set("num_classes", getLayerAttr(layer, "num_classes").i());
-            if (hasLayerAttr(layer, "share_location"))
-                layerParams.set("share_location", getLayerAttr(layer, "share_location").b());
-            if (hasLayerAttr(layer, "background_label_id"))
-                layerParams.set("background_label_id", getLayerAttr(layer, "background_label_id").i());
-            if (hasLayerAttr(layer, "nms_threshold"))
-                layerParams.set("nms_threshold", getLayerAttr(layer, "nms_threshold").f());
-            if (hasLayerAttr(layer, "top_k"))
-                layerParams.set("top_k", getLayerAttr(layer, "top_k").i());
-            if (hasLayerAttr(layer, "code_type"))
-                layerParams.set("code_type", getLayerAttr(layer, "code_type").s());
-            if (hasLayerAttr(layer, "keep_top_k"))
-                layerParams.set("keep_top_k", getLayerAttr(layer, "keep_top_k").i());
-            if (hasLayerAttr(layer, "confidence_threshold"))
-                layerParams.set("confidence_threshold", getLayerAttr(layer, "confidence_threshold").f());
-            if (hasLayerAttr(layer, "loc_pred_transposed"))
-                layerParams.set("loc_pred_transposed", getLayerAttr(layer, "loc_pred_transposed").b());
-            if (hasLayerAttr(layer, "clip"))
-                layerParams.set("clip", getLayerAttr(layer, "clip").b());
-            if (hasLayerAttr(layer, "variance_encoded_in_target"))
-                layerParams.set("variance_encoded_in_target", getLayerAttr(layer, "variance_encoded_in_target").b());
-
-            int id = dstNet.addLayer(name, "DetectionOutput", layerParams);
-            layer_id[name] = id;
-            for (int i = 0; i < 3; ++i)
-                connect(layer_id, dstNet, parsePin(layer.input(i)), id, i);
-            data_layouts[name] = DATA_LAYOUT_UNKNOWN;
-        }
         else if (type == "Softmax")
         {
             if (hasLayerAttr(layer, "axis"))
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 0bcbe562a3..63b43f1b72 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -165,12 +165,6 @@ TEST_P(Test_TensorFlow_layers, batch_norm)
     runTensorFlowNet("unfused_batch_norm");
     runTensorFlowNet("fused_batch_norm_no_gamma");
     runTensorFlowNet("unfused_batch_norm_no_gamma");
-}
-
-TEST_P(Test_TensorFlow_layers, mvn_batch_norm)
-{
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE)
-        throw SkipTestException("");
     runTensorFlowNet("mvn_batch_norm");
     runTensorFlowNet("mvn_batch_norm_1x1");
 }
@@ -323,7 +317,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
 {
     checkBackend();
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE ||
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) ||
         (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16))
         throw SkipTestException("");
 
@@ -343,6 +337,26 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_Faster_RCNN)
     normAssertDetections(ref, out, "", 0.3);
 }
 
+TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)  // runs the ssd_mobilenet_v1_ppn_coco model on dog416.png and compares its detections with a stored reference
+{
+    checkBackend();
+    std::string proto = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt", false);
+    std::string model = findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false);
+
+    Net net = readNetFromTensorflow(model, proto);
+    Mat img = imread(findDataFile("dnn/dog416.png", false));
+    Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy", false));  // reference detections loaded from a .npy file
+    Mat blob = blobFromImage(img, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false);  // 300x300 input, scale 1/127.5 and mean 127.5 (presumably maps 8-bit pixels to about [-1, 1])
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    net.setInput(blob);
+    Mat out = net.forward();
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : default_l1;  // fixed 0.006 score tolerance on the OpenCL FP16 and Myriad targets, default_l1 elsewhere
+    normAssertDetections(ref, out, "", 0.4, scoreDiff, default_lInf);  // NOTE(review): 0.4 looks like the confidence threshold argument - confirm against normAssertDetections
+}
+
 TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
 {
     checkBackend();
diff --git a/modules/features2d/doc/read_file_nondiff32.pl b/modules/features2d/doc/read_file_nondiff32.pl
index 6f1b420ecb..2ada4c9ea2 100644
--- a/modules/features2d/doc/read_file_nondiff32.pl
+++ b/modules/features2d/doc/read_file_nondiff32.pl
@@ -131,7 +131,7 @@ my $success_structured;
                   }
                   close $in2 or die "Can't close $filein: $!";
                 }
-                #find next else and interprete it
+                #find next else and interpret it
                 open(my $in3,  "<",  $filein)  or die "Can't open $filein: $!";
         $i3=1;
         $ifcount3=0;
diff --git a/modules/features2d/doc/read_file_score32.pl b/modules/features2d/doc/read_file_score32.pl
index c1adedac20..10cb77d080 100644
--- a/modules/features2d/doc/read_file_score32.pl
+++ b/modules/features2d/doc/read_file_score32.pl
@@ -119,7 +119,7 @@ my $is_a_corner;
                   }
                   close $in2 or die "Can't close $filein: $!";
                 }
-                #find next else and interprete it
+                #find next else and interpret it
                 open(my $in3,  "<",  $filein)  or die "Can't open $filein: $!";
         $i3=1;
         $ifcount3=0;
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 23cad31e4e..8925996da9 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1861,7 +1861,7 @@ gradient term \f$G\f$ and the second gradient term \f$b\f$ gives:
 The algorithm sets the center of the neighborhood window at this new center \f$q\f$ and then iterates
 until the center stays within a set threshold.
 
-@param image Input image.
+@param image Input single-channel, 8-bit or float image.
 @param corners Initial coordinates of the input corners and refined coordinates provided for
 output.
 @param winSize Half of the side length of the search window. For example, if winSize=Size(5,5) ,
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 73b74fa9df..02043ac929 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2048,7 +2048,7 @@ public:
             svmType == NU_SVC ? "NU_SVC" :
             svmType == ONE_CLASS ? "ONE_CLASS" :
             svmType == EPS_SVR ? "EPS_SVR" :
-            svmType == NU_SVR ? "NU_SVR" : format("Uknown_%d", svmType);
+            svmType == NU_SVR ? "NU_SVR" : format("Unknown_%d", svmType);
         String kernel_type_str =
             kernelType == LINEAR ? "LINEAR" :
             kernelType == POLY ? "POLY" :
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index 8fc18f467f..4bd9596c1d 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -255,8 +255,8 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
     Mat_<float> _lut(1, 256);
     const float* const lut = &_lut(0,0);
 #if CV_SSE2
-    const int indeces[] = { 0, 1, 2, 3 };
-    __m128i idx = _mm_loadu_si128((const __m128i*)indeces);
+    const int indices[] = { 0, 1, 2, 3 };
+    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
     __m128i ifour = _mm_set1_epi32(4);
 
     float* const _data = &_lut(0, 0);
@@ -273,8 +273,8 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
             idx = _mm_add_epi32(idx, ifour);
         }
 #elif CV_NEON
-    const int indeces[] = { 0, 1, 2, 3 };
-    uint32x4_t idx = *(uint32x4_t*)indeces;
+    const int indices[] = { 0, 1, 2, 3 };
+    uint32x4_t idx = *(uint32x4_t*)indices;
     uint32x4_t ifour = vdupq_n_u32(4);
 
     float* const _data = &_lut(0, 0);
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index c3a5593d35..fdbfa66bad 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -7,7 +7,6 @@
 
 #include "precomp.hpp"
 #include "opencv2/objdetect.hpp"
-// #include "opencv2/calib3d.hpp"
 
 #include <limits>
 #include <cmath>
@@ -21,7 +20,6 @@ class QRDecode
 {
 public:
     void init(Mat src, double eps_vertical_ = 0.2, double eps_horizontal_ = 0.1);
-    void binarization();
     bool localization();
     bool transformation();
     Mat getBinBarcode() { return bin_barcode; }
@@ -35,9 +33,7 @@ protected:
     Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2);
     vector<Point2f> getQuadrilateral(vector<Point2f> angle_list);
     bool testBypassRoute(vector<Point2f> hull, int start, int finish);
-    double getTriangleArea(Point2f a, Point2f b, Point2f c);
-    double getPolygonArea(vector<Point2f> points);
-    double getCosVectors(Point2f a, Point2f b, Point2f c);
+    inline double getCosVectors(Point2f a, Point2f b, Point2f c);
 
     Mat barcode, bin_barcode, straight_barcode;
     vector<Point2f> localization_points, transformation_points;
@@ -63,13 +59,7 @@ void QRDecode::init(Mat src, double eps_vertical_, double eps_horizontal_)
     }
     eps_vertical   = eps_vertical_;
     eps_horizontal = eps_horizontal_;
-}
-
-void QRDecode::binarization()
-{
-    Mat filter_barcode;
-    GaussianBlur(barcode, filter_barcode, Size(3, 3), 0);
-    threshold(filter_barcode, bin_barcode, 0, 255, THRESH_BINARY + THRESH_OTSU);
+    adaptiveThreshold(barcode, bin_barcode, 255, ADAPTIVE_THRESH_GAUSSIAN_C, THRESH_BINARY, 71, 2);
 }
 
 vector<Vec3d> QRDecode::searchVerticalLines()
@@ -139,7 +129,7 @@ vector<Point2f> QRDecode::separateHorizontalLines(vector<Vec3d> list_lines)
 
     for (size_t pnt = 0; pnt < list_lines.size(); pnt++)
     {
-        int x = static_cast<int>(list_lines[pnt][0] + list_lines[pnt][2] / 2);
+        int x = static_cast<int>(list_lines[pnt][0] + list_lines[pnt][2] * 0.5);
         int y = static_cast<int>(list_lines[pnt][1]);
 
         // --------------- Search horizontal up-lines --------------- //
@@ -203,7 +193,7 @@ vector<Point2f> QRDecode::separateHorizontalLines(vector<Vec3d> list_lines)
     {
         point2f_result.push_back(
               Point2f(static_cast<float>(result[i][1]),
-                      static_cast<float>(result[i][0] + result[i][2] / 2)));
+                      static_cast<float>(result[i][0] + result[i][2] * 0.5)));
     }
     return point2f_result;
 }
@@ -345,16 +335,23 @@ bool QRDecode::computeTransformationPoints()
             }
         }
     }
+
     if (down_left_edge_point == Point2f(0, 0) ||
-        up_right_edge_point  == Point2f(0, 0)) { return false; }
+        up_right_edge_point  == Point2f(0, 0) ||
+        new_non_zero_elem[0].size() == 0) { return false; }
 
     double max_area = -1;
     up_left_edge_point = new_non_zero_elem[0][0];
+
     for (size_t i = 0; i < new_non_zero_elem[0].size(); i++)
     {
-        double temp_area = getTriangleArea(new_non_zero_elem[0][i],
-                                           down_left_edge_point,
-                                           up_right_edge_point);
+        vector<Point2f> list_edge_points;
+        list_edge_points.push_back(new_non_zero_elem[0][i]);
+        list_edge_points.push_back(down_left_edge_point);
+        list_edge_points.push_back(up_right_edge_point);
+
+        double temp_area = fabs(contourArea(list_edge_points));
+
         if (max_area < temp_area)
         {
             up_left_edge_point = new_non_zero_elem[0][i];
@@ -375,6 +372,7 @@ bool QRDecode::computeTransformationPoints()
         }
     }
 
+
     for (size_t i = 0; i < new_non_zero_elem[2].size(); i++)
     {
         double temp_norm_delta = norm(up_left_edge_point - new_non_zero_elem[2][i])
@@ -485,7 +483,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
         hull[i] = Point2f(x, y);
     }
 
-    const double experimental_area = getPolygonArea(hull);
+    const double experimental_area = fabs(contourArea(hull));
 
     vector<Point2f> result_hull_point(angle_size);
     double min_norm;
@@ -539,7 +537,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
         double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt);
         if (min_norm > temp_norm &&
             norm(hull[index_hull] - hull[next_index_hull]) >
-            norm(angle_list[1] - angle_list[2]) / 10)
+            norm(angle_list[1] - angle_list[2]) * 0.1)
         {
             min_norm = temp_norm;
             result_side_begin[0] = hull[index_hull];
@@ -577,7 +575,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
         double temp_norm = getCosVectors(hull[index_hull], intrsc_line_hull, angle_closest_pnt);
         if (min_norm > temp_norm &&
             norm(hull[index_hull] - hull[next_index_hull]) >
-            norm(angle_list[0] - angle_list[1]) / 20)
+            norm(angle_list[0] - angle_list[1]) * 0.05)
         {
             min_norm = temp_norm;
             result_side_begin[1] = hull[index_hull];
@@ -611,7 +609,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
         if (next_index_hull == hull_size) { next_index_hull = 0; }
         if (next_index_hull == -1) { next_index_hull = hull_size - 1; }
 
-        if (norm(hull[index_hull] - hull[next_index_hull]) < standart_norm / 10.0)
+        if (norm(hull[index_hull] - hull[next_index_hull]) < standart_norm * 0.1)
         { index_hull = next_index_hull; continue; }
 
         extra_index_hull = finish_line[1];
@@ -623,7 +621,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
             if (extra_next_index_hull == hull_size) { extra_next_index_hull = 0; }
             if (extra_next_index_hull == -1) { extra_next_index_hull = hull_size - 1; }
 
-            if (norm(hull[extra_index_hull] - hull[extra_next_index_hull]) < standart_norm / 10.0)
+            if (norm(hull[extra_index_hull] - hull[extra_next_index_hull]) < standart_norm * 0.1)
             { extra_index_hull = extra_next_index_hull; continue; }
 
             test_result_angle_list[0]
@@ -639,7 +637,7 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
             = intersectionLines(hull[index_hull], hull[next_index_hull],
                                 result_side_begin[0], result_side_end[0]);
 
-            test_diff_area = fabs(getPolygonArea(test_result_angle_list) - experimental_area);
+            test_diff_area = fabs(fabs(contourArea(test_result_angle_list)) - experimental_area);
             if (min_diff_area > test_diff_area)
             {
                 min_diff_area = test_diff_area;
@@ -656,53 +654,22 @@ vector<Point2f> QRDecode::getQuadrilateral(vector<Point2f> angle_list)
         index_hull = next_index_hull;
     }
     while(index_hull != unstable_pnt);
+
+    if (norm(result_angle_list[0] - angle_list[1]) > 2) { result_angle_list[0] = angle_list[1]; }
+    if (norm(result_angle_list[1] - angle_list[0]) > 2) { result_angle_list[1] = angle_list[0]; }
+    if (norm(result_angle_list[3] - angle_list[2]) > 2) { result_angle_list[3] = angle_list[2]; }
+
     return result_angle_list;
 }
 
-//          b
-//         / |
-//        /  |
-//       /   |
-//      /  S |
-//     /     |
-//   a ----- c
-
-double QRDecode::getTriangleArea(Point2f a, Point2f b, Point2f c)
-{
-    double norm_sides[] = { norm(a - b), norm(b - c), norm(c - a) };
-    double half_perimeter = (norm_sides[0] + norm_sides[1] + norm_sides[2]) / 2.0;
-    double triangle_area = sqrt(half_perimeter *
-                               (half_perimeter - norm_sides[0]) *
-                               (half_perimeter - norm_sides[1]) *
-                               (half_perimeter - norm_sides[2]));
-    return triangle_area;
-}
-
-double QRDecode::getPolygonArea(vector<Point2f> points)
-{
-    CV_Assert(points.size() >= 3);
-    if (points.size() == 3)
-    { return getTriangleArea(points[0], points[1], points[2]); }
-    else
-    {
-        double result_area = 0.0;
-        for (size_t i = 1; i < points.size() - 1; i++)
-        {
-            result_area += getTriangleArea(points[0], points[i], points[i + 1]);
-        }
-        return result_area;
-    }
-}
-
 //      / | b
 //     /  |
 //    /   |
 //  a/    | c
 
-double QRDecode::getCosVectors(Point2f a, Point2f b, Point2f c)
+inline double QRDecode::getCosVectors(Point2f a, Point2f b, Point2f c)  // cosine of the angle at vertex b between the vectors b->a and b->c
 {
-    return ((a - b).x * (c - b).x + (a - b).y * (c - b).y)
-            / (norm(a - b) * norm(c - b));
+    return ((a - b).x * (c - b).x + (a - b).y * (c - b).y) / (norm(a - b) * norm(c - b));  // dot product divided by the product of the two lengths; there is no guard for a == b or c == b (zero denominator)
 }
 
 bool QRDecode::transformation()
@@ -764,7 +731,6 @@ bool QRCodeDetector::detect(InputArray in, OutputArray points) const
     CV_Assert(inarr.type() == CV_8UC1);
     QRDecode qrdec;
     qrdec.init(inarr, p->epsX, p->epsY);
-    qrdec.binarization();
     if (!qrdec.localization()) { return false; }
     if (!qrdec.transformation()) { return false; }
     vector<Point2f> pnts2f = qrdec.getTransformationPoints();
diff --git a/modules/photo/src/contrast_preserve.hpp b/modules/photo/src/contrast_preserve.hpp
index ec8274e883..1afd4bc3e3 100644
--- a/modules/photo/src/contrast_preserve.hpp
+++ b/modules/photo/src/contrast_preserve.hpp
@@ -159,12 +159,12 @@ void Decolor::gradvector(const Mat &img, vector <double> &grad) const
 
     for(int i=0;i<height;i++)
         for(int j=0;j<width;j++)
-            grad[i*height + j] = d_trans.at<float>(i, j);
+            grad[i*width + j] = d_trans.at<float>(i, j);
 
     const int offset = width * height;
     for(int i=0;i<height;i++)
         for(int j=0;j<width;j++)
-            grad[offset + i * height + j] = d1_trans.at<float>(i, j);
+            grad[offset + i * width + j] = d1_trans.at<float>(i, j);
 }
 
 void Decolor::colorGrad(const Mat &img, vector <double> &Cg) const
@@ -204,14 +204,19 @@ void Decolor::add_to_vector_poly(vector < vector <double> > &polyGrad, const vec
     idx1++;
 }
 
-void Decolor::weak_order(const Mat &img, vector <double> &alf) const
+void Decolor::weak_order(const Mat &im, vector <double> &alf) const
 {
-    const int h = img.size().height;
-    const int w = img.size().width;
+    Mat img;
+    const int h = im.size().height;
+    const int w = im.size().width;
     if((h + w) > 800)
     {
         const double sizefactor = double(800)/(h+w);
-        resize(img, img, Size(cvRound(h*sizefactor), cvRound(w*sizefactor)));
+        resize(im, img, Size(cvRound(w*sizefactor), cvRound(h*sizefactor)));
+    }
+    else
+    {
+        img = im;
     }
 
     Mat curIm = Mat(img.size(),CV_32FC1);
@@ -246,16 +251,20 @@ void Decolor::weak_order(const Mat &img, vector <double> &alf) const
         alf[i] -= tmp1[i] * tmp2[i] * tmp3[i];
 }
 
-void Decolor::grad_system(const Mat &img, vector < vector < double > > &polyGrad,
+void Decolor::grad_system(const Mat &im, vector < vector < double > > &polyGrad,
         vector < double > &Cg, vector <Vec3i>& comb) const
 {
-    int h = img.size().height;
-    int w = img.size().width;
-
+    Mat img;
+    int h = im.size().height;
+    int w = im.size().width;
     if((h + w) > 800)
     {
         const double sizefactor = double(800)/(h+w);
-        resize(img, img, Size(cvRound(h*sizefactor), cvRound(w*sizefactor)));
+        resize(im, img, Size(cvRound(w*sizefactor), cvRound(h*sizefactor)));
+    }
+    else
+    {
+        img = im;
     }
 
     h = img.size().height;
diff --git a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
index 4acebea5e1..25c0f2ab1e 100644
--- a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp
@@ -137,6 +137,21 @@ private:
     Ptr<Feature2D> surf;
 };
 
+
+/** @brief SIFT features finder.
+
+@sa detail::FeaturesFinder, SIFT
+*/
+class CV_EXPORTS SiftFeaturesFinder : public FeaturesFinder
+{
+public:
+    SiftFeaturesFinder();
+
+private:
+    void find(InputArray image, ImageFeatures &features) CV_OVERRIDE;
+    Ptr<Feature2D> sift;
+};
+
 /** @brief ORB features finder. :
 
 @sa detail::FeaturesFinder, ORB
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 6b9d75cdd8..3d82acf484 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -51,6 +51,7 @@ using namespace cv::cuda;
 #ifdef HAVE_OPENCV_XFEATURES2D
 #include "opencv2/xfeatures2d.hpp"
 using xfeatures2d::SURF;
+using xfeatures2d::SIFT;
 #endif
 
 #ifdef HAVE_OPENCV_CUDAIMGPROC
@@ -475,6 +476,35 @@ void SurfFeaturesFinder::find(InputArray image, ImageFeatures &features)
     }
 }
 
+SiftFeaturesFinder::SiftFeaturesFinder()
+{
+#ifdef HAVE_OPENCV_XFEATURES2D
+    Ptr<SIFT> sift_ = SIFT::create();
+    if( !sift_ )
+        CV_Error( Error::StsNotImplemented, "OpenCV was built without SIFT support" );
+    sift = sift_;
+#else
+    CV_Error( Error::StsNotImplemented, "OpenCV was built without SIFT support" );
+#endif
+}
+
+void SiftFeaturesFinder::find(InputArray image, ImageFeatures &features)
+{
+    UMat gray_image;
+    CV_Assert((image.type() == CV_8UC3) || (image.type() == CV_8UC1));
+    if(image.type() == CV_8UC3)
+    {
+        cvtColor(image, gray_image, COLOR_BGR2GRAY);
+    }
+    else
+    {
+        gray_image = image.getUMat();
+    }
+    UMat descriptors;
+    sift->detectAndCompute(gray_image, Mat(), features.keypoints, descriptors);
+    features.descriptors = descriptors.reshape(1, (int)features.keypoints.size());
+}
+
 OrbFeaturesFinder::OrbFeaturesFinder(Size _grid_size, int n_features, float scaleFactor, int nlevels)
 {
     grid_size = _grid_size;
diff --git a/modules/ts/include/opencv2/ts/ts_gtest.h b/modules/ts/include/opencv2/ts/ts_gtest.h
index 2b1299c3bf..b687a5722e 100644
--- a/modules/ts/include/opencv2/ts/ts_gtest.h
+++ b/modules/ts/include/opencv2/ts/ts_gtest.h
@@ -9013,7 +9013,7 @@ class NativeArray {
 
 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
+// representation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
   GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
   if (const ::testing::AssertionResult gtest_ar_ = \
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index 8758b21dd9..2c9570e67f 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -613,10 +613,12 @@ int GStreamerCapture::getCaptureDomain() { return CAP_GSTREAMER; }
  */
 bool GStreamerCapture::open(int id)
 {
+    gst_initializer::init();
+
     if (!is_gst_element_exists("v4l2src"))
         return false;
     std::ostringstream desc;
-    desc << "v4l2src device-name=/dev/video" << id
+    desc << "v4l2src device=/dev/video" << id
              << " ! " << COLOR_ELEM
              << " ! appsink";
     return open(desc.str());
diff --git a/modules/videoio/src/cap_mjpeg_decoder.cpp b/modules/videoio/src/cap_mjpeg_decoder.cpp
index f8ba6857f3..02400fd9f0 100644
--- a/modules/videoio/src/cap_mjpeg_decoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_decoder.cpp
@@ -146,6 +146,9 @@ bool MotionJpegCapture::grabFrame()
         }
         else
         {
+            if (m_frame_iterator == m_mjpeg_frames.end())
+                return false;
+
             ++m_frame_iterator;
         }
     }
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 1b7ae8a19e..d816dbcbf0 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -431,6 +431,7 @@ static int autosetup_capture_mode_v4l2(CvCaptureCAM_V4L* capture) {
             V4L2_PIX_FMT_BGR24,
             V4L2_PIX_FMT_RGB24,
             V4L2_PIX_FMT_YVU420,
+            V4L2_PIX_FMT_YUV420,
             V4L2_PIX_FMT_YUV411P,
             V4L2_PIX_FMT_YUYV,
             V4L2_PIX_FMT_UYVY,
@@ -532,6 +533,7 @@ static int v4l2_set_fps(CvCaptureCAM_V4L* capture) {
 static int v4l2_num_channels(__u32 palette) {
     switch(palette) {
     case V4L2_PIX_FMT_YVU420:
+    case V4L2_PIX_FMT_YUV420:
     case V4L2_PIX_FMT_MJPEG:
     case V4L2_PIX_FMT_JPEG:
     case V4L2_PIX_FMT_Y16:
@@ -562,6 +564,7 @@ static void v4l2_create_frame(CvCaptureCAM_V4L *capture) {
             size = CvSize(capture->buffers[capture->bufferIndex].length, 1);
             break;
         case V4L2_PIX_FMT_YVU420:
+        case V4L2_PIX_FMT_YUV420:
             size.height = size.height * 3 / 2; // "1.5" channels
             break;
         case V4L2_PIX_FMT_Y16:
@@ -1021,10 +1024,10 @@ move_411_block(int yTL, int yTR, int yBL, int yBR, int u, int v,
 
 /* Converts from planar YUV420P to RGB24. */
 static inline void
-yuv420p_to_rgb24(int width, int height, uchar* src, uchar* dst)
+yuv420p_to_rgb24(int width, int height, uchar* src, uchar* dst, bool isYUV)
 {
     cvtColor(Mat(height * 3 / 2, width, CV_8U, src), Mat(height, width, CV_8UC3, dst),
-            COLOR_YUV2BGR_YV12);
+            isYUV ? COLOR_YUV2BGR_IYUV : COLOR_YUV2BGR_YV12);
 }
 
 // Consider a YUV411P image of 8x2 pixels.
@@ -1490,10 +1493,12 @@ static IplImage* icvRetrieveFrameCAM_V4L( CvCaptureCAM_V4L* capture, int) {
         break;
 
     case V4L2_PIX_FMT_YVU420:
+    case V4L2_PIX_FMT_YUV420:
         yuv420p_to_rgb24(capture->form.fmt.pix.width,
                 capture->form.fmt.pix.height,
                 (unsigned char*)(capture->buffers[capture->bufferIndex].start),
-                (unsigned char*)capture->frame.imageData);
+                (unsigned char*)capture->frame.imageData,
+                capture->palette == V4L2_PIX_FMT_YUV420);
         break;
 
     case V4L2_PIX_FMT_YUV411P:
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 6ac1dea60b..15aff36c39 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -70,9 +70,7 @@ endif()
 
 ocv_install_example_src("." CMakeLists.txt)
 if(INSTALL_C_EXAMPLES)
-  install(DIRECTORY data
-          DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}/data"
-        COMPONENT samples_data)
+  install(DIRECTORY data DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}" COMPONENT samples_data)
 endif()
 
 else()
diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp
index 91641d7a28..2ff4e7b16b 100644
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -82,7 +82,7 @@ static void printUsage()
         "\nMotion Estimation Flags:\n"
         "  --work_megapix <float>\n"
         "      Resolution for image registration step. The default is 0.6 Mpx.\n"
-        "  --features (surf|orb)\n"
+        "  --features (surf|orb|sift)\n"
         "      Type of features used for images matching. The default is surf.\n"
         "  --matcher (homography|affine)\n"
         "      Matcher used for pairwise image matching.\n"
@@ -430,6 +430,9 @@ int main(int argc, char* argv[])
     {
         finder = makePtr<OrbFeaturesFinder>();
     }
+    else if (features_type == "sift") {
+        finder = makePtr<SiftFeaturesFinder>();
+    }
     else
     {
         cout << "Unknown 2D features type: '" << features_type << "'.\n";
diff --git a/samples/cpp/train_HOG.cpp b/samples/cpp/train_HOG.cpp
index 1c6c81481c..3a1527d8f4 100644
--- a/samples/cpp/train_HOG.cpp
+++ b/samples/cpp/train_HOG.cpp
@@ -204,7 +204,7 @@ int main( int argc, char** argv )
     const char* keys =
     {
         "{help h|     | show help message}"
-        "{pd    |     | path of directory contains possitive images}"
+        "{pd    |     | path of directory contains positive images}"
         "{nd    |     | path of directory contains negative images}"
         "{td    |     | path of directory contains test images}"
         "{tv    |     | test video file name}"
diff --git a/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp b/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
index c194e82f24..aa6107c120 100644
--- a/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
@@ -1,6 +1,6 @@
 /**
  * @file introduction_to_pca.cpp
- * @brief This program demonstrates how to use OpenCV PCA to extract the orienation of an object
+ * @brief This program demonstrates how to use OpenCV PCA to extract the orientation of an object
  * @author OpenCV team
  */
 
diff --git a/samples/cpp/warpPerspective_demo.cpp b/samples/cpp/warpPerspective_demo.cpp
index 35bf87dfd9..591e03d59b 100644
--- a/samples/cpp/warpPerspective_demo.cpp
+++ b/samples/cpp/warpPerspective_demo.cpp
@@ -26,7 +26,7 @@ static void help(char** argv)
          "\tESC, q - quit the program\n"
          "\tr - change order of points to rotate transformation\n"
          "\tc - delete selected points\n"
-         "\ti - change order of points to invers transformation \n"
+         "\ti - change order of points to inverse transformation \n"
          "\nUse your mouse to select a point and move it to see transformation changes" << endl;
 }
 
diff --git a/samples/dnn/CMakeLists.txt b/samples/dnn/CMakeLists.txt
index 0df76517a5..4af6d40928 100644
--- a/samples/dnn/CMakeLists.txt
+++ b/samples/dnn/CMakeLists.txt
@@ -13,32 +13,6 @@ if(NOT BUILD_EXAMPLES OR NOT OCV_DEPENDENCIES_FOUND)
   return()
 endif()
 
-function(download_net name commit hash)
-  set(DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR "${CMAKE_CURRENT_LIST_DIR}/face_detector")
-  if(COMMAND ocv_download)
-    ocv_download(FILENAME ${name}
-               HASH ${hash}
-               URL
-                 "$ENV{OPENCV_DNN_MODELS_URL}"
-                 "${OPENCV_DNN_MODELS_URL}"
-                 "https://raw.githubusercontent.com/opencv/opencv_3rdparty/${commit}/"
-               DESTINATION_DIR ${DNN_FACE_DETECTOR_MODEL_DOWNLOAD_DIR}
-               ID DNN_FACE_DETECTOR
-               RELATIVE_URL
-               STATUS res)
-  endif()
-endfunction()
-
-# Model branch name: dnn_samples_face_detector_20180205_fp16
-download_net("res10_300x300_ssd_iter_140000_fp16.caffemodel"
-             "19512576c112aa2c7b6328cb0e8d589a4a90a26d"
-             "f737f886e33835410c69e3ccfe0720a1")
-
-# Model branch name: dnn_samples_face_detector_20180220_uint8
-download_net("opencv_face_detector_uint8.pb"
-             "7b425df276ba2161b8edaab0f0756f4a735d61b9"
-             "56acf81f55d9b9e96c3347bc65409b9e")
-
 project(dnn_samples)
 ocv_include_modules_recurse(${OPENCV_DNN_SAMPLES_REQUIRED_DEPS})
 file(GLOB_RECURSE dnn_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
diff --git a/samples/dnn/custom_layers.hpp b/samples/dnn/custom_layers.hpp
index 918cc8ae46..a18bb9a5cf 100644
--- a/samples/dnn/custom_layers.hpp
+++ b/samples/dnn/custom_layers.hpp
@@ -198,7 +198,7 @@ private:
 //! [ResizeBilinearLayer]
 
 //
-// The folowing code is used only to generate tutorials documentation.
+// The following code is used only to generate tutorials documentation.
 //
 
 //! [A custom layer interface]
diff --git a/samples/dnn/face_detector/download_weights.py b/samples/dnn/face_detector/download_weights.py
new file mode 100755
index 0000000000..f872190d02
--- /dev/null
+++ b/samples/dnn/face_detector/download_weights.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import hashlib
+import time
+import sys
+import xml.etree.ElementTree as ET
+if sys.version_info[0] < 3:
+    from urllib2 import urlopen
+else:
+    from urllib.request import urlopen
+
+class HashMismatchException(Exception):
+    def __init__(self, expected, actual):
+        Exception.__init__(self)
+        self.expected = expected
+        self.actual = actual
+    def __str__(self):
+        return 'Hash mismatch: {} vs {}'.format(self.expected, self.actual)
+
+class MetalinkDownloader(object):
+    BUFSIZE = 10*1024*1024
+    NS = {'ml': 'urn:ietf:params:xml:ns:metalink'}
+    tick = 0
+
+    def download(self, metalink_file):
+        status = True
+        for file_elem in ET.parse(metalink_file).getroot().findall('ml:file', self.NS):
+            url = file_elem.find('ml:url', self.NS).text
+            fname = file_elem.attrib['name']
+            hash_sum = file_elem.find('ml:hash', self.NS).text
+            print('*** {}'.format(fname))
+            try:
+                self.verify(hash_sum, fname)
+            except Exception as ex:
+                print('  {}'.format(ex))
+                try:
+                    print('  {}'.format(url))
+                    with open(fname, 'wb') as file_stream:
+                        self.buffered_read(urlopen(url), file_stream.write)
+                    self.verify(hash_sum, fname)
+                except Exception as ex:
+                    print('  {}'.format(ex))
+                    print('  FAILURE')
+                    status = False
+                    continue
+            print('  SUCCESS')
+        return status
+
+    def print_progress(self, msg, timeout = 0):
+        if time.time() - self.tick > timeout:
+            print(msg, end='')
+            sys.stdout.flush()
+            self.tick = time.time()
+
+    def buffered_read(self, in_stream, processing):
+        self.print_progress('  >')
+        while True:
+            buf = in_stream.read(self.BUFSIZE)
+            if not buf:
+                break
+            processing(buf)
+            self.print_progress('>', 5)
+        print(' done')
+
+    def verify(self, hash_sum, fname):
+        sha = hashlib.sha1()
+        with open(fname, 'rb') as file_stream:
+            self.buffered_read(file_stream, sha.update)
+        if hash_sum != sha.hexdigest():
+            raise HashMismatchException(hash_sum, sha.hexdigest())
+
+if __name__ == '__main__':
+    sys.exit(0 if MetalinkDownloader().download('weights.meta4') else 1)
diff --git a/samples/dnn/face_detector/weights.meta4 b/samples/dnn/face_detector/weights.meta4
new file mode 100644
index 0000000000..35d303085b
--- /dev/null
+++ b/samples/dnn/face_detector/weights.meta4
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<metalink xmlns="urn:ietf:params:xml:ns:metalink">
+    <file name="res10_300x300_ssd_iter_140000_fp16.caffemodel">
+        <identity>OpenCV face detector FP16 weights</identity>
+        <hash type="sha-1">31fc22bfdd907567a04bb45b7cfad29966caddc1</hash>
+        <url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel</url>
+    </file>
+    <file name="opencv_face_detector_uint8.pb">
+        <identity>OpenCV face detector UINT8 weights</identity>
+        <hash type="sha-1">4f2fdf6f231d759d7bbdb94353c5a68690f3d2ae</hash>
+        <url>https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180220_uint8/opencv_face_detector_uint8.pb</url>
+    </file>
+</metalink>
diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py
new file mode 100644
index 0000000000..61e3bbcaee
--- /dev/null
+++ b/samples/dnn/tf_text_graph_common.py
@@ -0,0 +1,25 @@
+import tensorflow as tf
+from tensorflow.core.framework.node_def_pb2 import NodeDef
+from google.protobuf import text_format
+
+def tensorMsg(values):
+    if all([isinstance(v, float) for v in values]):
+        dtype = 'DT_FLOAT'
+        field = 'float_val'
+    elif all([isinstance(v, int) for v in values]):
+        dtype = 'DT_INT32'
+        field = 'int_val'
+    else:
+        raise Exception('Wrong values types')
+
+    msg = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
+    for value in values:
+        msg += '%s: %s ' % (field, str(value))
+    return msg + '}'
+
+def addConstNode(name, values, graph_def):
+    node = NodeDef()
+    node.name = name
+    node.op = 'Const'
+    text_format.Merge(tensorMsg(values), node.attr["value"])
+    graph_def.node.extend([node])
diff --git a/samples/dnn/tf_text_graph_faster_rcnn.py b/samples/dnn/tf_text_graph_faster_rcnn.py
index 7ad5de283a..9aea38424a 100644
--- a/samples/dnn/tf_text_graph_faster_rcnn.py
+++ b/samples/dnn/tf_text_graph_faster_rcnn.py
@@ -6,6 +6,8 @@ from tensorflow.core.framework.node_def_pb2 import NodeDef
 from tensorflow.tools.graph_transforms import TransformGraph
 from google.protobuf import text_format
 
+from tf_text_graph_common import tensorMsg, addConstNode
+
 parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                              'SSD model from TensorFlow Object Detection API. '
                                              'Then pass it with .pb file to cv::dnn::readNetFromTensorflow function.')
@@ -93,21 +95,6 @@ while True:
     if node.op == 'CropAndResize':
         break
 
-def tensorMsg(values):
-    if all([isinstance(v, float) for v in values]):
-        dtype = 'DT_FLOAT'
-        field = 'float_val'
-    elif all([isinstance(v, int) for v in values]):
-        dtype = 'DT_INT32'
-        field = 'int_val'
-    else:
-        raise Exception('Wrong values types')
-
-    msg = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
-    for value in values:
-        msg += '%s: %s ' % (field, str(value))
-    return msg + '}'
-
 def addSlice(inp, out, begins, sizes):
     beginsNode = NodeDef()
     beginsNode.name = out + '/begins'
@@ -151,17 +138,25 @@ def addSoftMax(inp, out):
     softmax.input.append(inp)
     graph_def.node.extend([softmax])
 
+def addFlatten(inp, out):
+    flatten = NodeDef()
+    flatten.name = out
+    flatten.op = 'Flatten'
+    flatten.input.append(inp)
+    graph_def.node.extend([flatten])
+
 addReshape('FirstStageBoxPredictor/ClassPredictor/BiasAdd',
            'FirstStageBoxPredictor/ClassPredictor/reshape_1', [0, -1, 2])
 
 addSoftMax('FirstStageBoxPredictor/ClassPredictor/reshape_1',
            'FirstStageBoxPredictor/ClassPredictor/softmax')  # Compare with Reshape_4
 
-flatten = NodeDef()
-flatten.name = 'FirstStageBoxPredictor/BoxEncodingPredictor/flatten'  # Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
-flatten.op = 'Flatten'
-flatten.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd')
-graph_def.node.extend([flatten])
+addFlatten('FirstStageBoxPredictor/ClassPredictor/softmax',
+           'FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
+
+# Compare with FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd
+addFlatten('FirstStageBoxPredictor/BoxEncodingPredictor/BiasAdd',
+           'FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
 
 proposals = NodeDef()
 proposals.name = 'proposals'  # Compare with ClipToWindow/Gather/Gather (NOTE: normalized)
@@ -194,7 +189,7 @@ detectionOut.name = 'detection_out'
 detectionOut.op = 'DetectionOutput'
 
 detectionOut.input.append('FirstStageBoxPredictor/BoxEncodingPredictor/flatten')
-detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax')
+detectionOut.input.append('FirstStageBoxPredictor/ClassPredictor/softmax/flatten')
 detectionOut.input.append('proposals')
 
 text_format.Merge('i: 2', detectionOut.attr['num_classes'])
@@ -204,11 +199,21 @@ text_format.Merge('f: 0.7', detectionOut.attr['nms_threshold'])
 text_format.Merge('i: 6000', detectionOut.attr['top_k'])
 text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
 text_format.Merge('i: 100', detectionOut.attr['keep_top_k'])
-text_format.Merge('b: true', detectionOut.attr['clip'])
-text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed'])
+text_format.Merge('b: false', detectionOut.attr['clip'])
 
 graph_def.node.extend([detectionOut])
 
+addConstNode('clip_by_value/lower', [0.0], graph_def)
+addConstNode('clip_by_value/upper', [1.0], graph_def)
+
+clipByValueNode = NodeDef()
+clipByValueNode.name = 'detection_out/clip_by_value'
+clipByValueNode.op = 'ClipByValue'
+clipByValueNode.input.append('detection_out')
+clipByValueNode.input.append('clip_by_value/lower')
+clipByValueNode.input.append('clip_by_value/upper')
+graph_def.node.extend([clipByValueNode])
+
 # Save as text.
 for node in reversed(topNodes):
     graph_def.node.extend([node])
@@ -225,17 +230,13 @@ addReshape('SecondStageBoxPredictor/Reshape_1/slice',
 # Replace Flatten subgraph onto a single node.
 for i in reversed(range(len(graph_def.node))):
     if graph_def.node[i].op == 'CropAndResize':
-        graph_def.node[i].input.insert(1, 'detection_out')
+        graph_def.node[i].input.insert(1, 'detection_out/clip_by_value')
 
     if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
-        shapeNode = NodeDef()
-        shapeNode.name = 'SecondStageBoxPredictor/Reshape/shape2'
-        shapeNode.op = 'Const'
-        text_format.Merge(tensorMsg([1, -1, 4]), shapeNode.attr["value"])
-        graph_def.node.extend([shapeNode])
+        addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
 
         graph_def.node[i].input.pop()
-        graph_def.node[i].input.append(shapeNode.name)
+        graph_def.node[i].input.append('SecondStageBoxPredictor/Reshape/shape2')
 
     if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
                                   'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
@@ -246,12 +247,15 @@ for node in graph_def.node:
     if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
         node.op = 'Flatten'
         node.input.pop()
-        break
+
+    if node.name in ['FirstStageBoxPredictor/BoxEncodingPredictor/Conv2D',
+                     'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
+        text_format.Merge('b: true', node.attr["loc_pred_transposed"])
 
 ################################################################################
 ### Postprocessing
 ################################################################################
-addSlice('detection_out', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4])
+addSlice('detection_out/clip_by_value', 'detection_out/slice', [0, 0, 0, 3], [-1, -1, -1, 4])
 
 variance = NodeDef()
 variance.name = 'proposals/variance'
@@ -268,12 +272,13 @@ text_format.Merge('i: 2', varianceEncoder.attr["axis"])
 graph_def.node.extend([varianceEncoder])
 
 addReshape('detection_out/slice', 'detection_out/slice/reshape', [1, 1, -1])
+addFlatten('variance_encoded', 'variance_encoded/flatten')
 
 detectionOut = NodeDef()
 detectionOut.name = 'detection_out_final'
 detectionOut.op = 'DetectionOutput'
 
-detectionOut.input.append('variance_encoded')
+detectionOut.input.append('variance_encoded/flatten')
 detectionOut.input.append('SecondStageBoxPredictor/Reshape_1/Reshape')
 detectionOut.input.append('detection_out/slice/reshape')
 
@@ -283,7 +288,6 @@ text_format.Merge('i: %d' % (args.num_classes + 1), detectionOut.attr['backgroun
 text_format.Merge('f: 0.6', detectionOut.attr['nms_threshold'])
 text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type'])
 text_format.Merge('i: 100', detectionOut.attr['keep_top_k'])
-text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed'])
 text_format.Merge('b: true', detectionOut.attr['clip'])
 text_format.Merge('b: true', detectionOut.attr['variance_encoded_in_target'])
 graph_def.node.extend([detectionOut])
diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py
index 1bf4079113..573a6d8941 100644
--- a/samples/dnn/tf_text_graph_ssd.py
+++ b/samples/dnn/tf_text_graph_ssd.py
@@ -15,6 +15,7 @@ from math import sqrt
 from tensorflow.core.framework.node_def_pb2 import NodeDef
 from tensorflow.tools.graph_transforms import TransformGraph
 from google.protobuf import text_format
+from tf_text_graph_common import tensorMsg, addConstNode
 
 parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                              'SSD model from TensorFlow Object Detection API. '
@@ -29,6 +30,11 @@ parser.add_argument('--aspect_ratios', default=[1.0, 2.0, 0.5, 3.0, 0.333], type
                     help='Hyper-parameter of ssd_anchor_generator from config file.')
 parser.add_argument('--image_width', default=300, type=int, help='Training images width.')
 parser.add_argument('--image_height', default=300, type=int, help='Training images height.')
+parser.add_argument('--not_reduce_boxes_in_lowest_layer', default=False, action='store_true',
+                    help='Set this flag if the fixed 3 boxes per location are NOT used '
+                         'in the lowest anchors generation layer.')
+parser.add_argument('--box_predictor', default='convolutional', type=str,
+                    choices=['convolutional', 'weight_shared_convolutional'])
 args = parser.parse_args()
 
 # Nodes that should be kept.
@@ -160,28 +166,6 @@ graph_def.node[1].input.append(weights)
 # Create SSD postprocessing head ###############################################
 
 # Concatenate predictions of classes, predictions of bounding boxes and proposals.
-def tensorMsg(values):
-    if all([isinstance(v, float) for v in values]):
-        dtype = 'DT_FLOAT'
-        field = 'float_val'
-    elif all([isinstance(v, int) for v in values]):
-        dtype = 'DT_INT32'
-        field = 'int_val'
-    else:
-        raise Exception('Wrong values types')
-
-    msg = 'tensor { dtype: ' + dtype + ' tensor_shape { dim { size: %d } }' % len(values)
-    for value in values:
-        msg += '%s: %s ' % (field, str(value))
-    return msg + '}'
-
-def addConstNode(name, values):
-    node = NodeDef()
-    node.name = name
-    node.op = 'Const'
-    text_format.Merge(tensorMsg(values), node.attr["value"])
-    graph_def.node.extend([node])
-
 def addConcatNode(name, inputs, axisNodeName):
     concat = NodeDef()
     concat.name = name
@@ -194,12 +178,18 @@ def addConcatNode(name, inputs, axisNodeName):
 addConstNode('concat/axis_flatten', [-1])
 addConstNode('PriorBox/concat/axis', [-2])
 
-for label in ['ClassPredictor', 'BoxEncodingPredictor']:
+for label in ['ClassPredictor', 'BoxEncodingPredictor' if args.box_predictor == 'convolutional' else 'BoxPredictor']:
     concatInputs = []
     for i in range(args.num_layers):
         # Flatten predictions
         flatten = NodeDef()
-        inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
+        if args.box_predictor == 'convolutional':
+            inpName = 'BoxPredictor_%d/%s/BiasAdd' % (i, label)
+        else:
+            if i == 0:  # first predictor scope carries no "_%d" suffix
+                inpName = 'WeightSharedConvolutionalBoxPredictor/%s/BiasAdd' % label
+            else:
+                inpName = 'WeightSharedConvolutionalBoxPredictor_%d/%s/BiasAdd' % (i, label)
         flatten.input.append(inpName)
         flatten.name = inpName + '/Flatten'
         flatten.op = 'Flatten'
@@ -210,7 +200,9 @@ for label in ['ClassPredictor', 'BoxEncodingPredictor']:
 
 idx = 0
 for node in graph_def.node:
-    if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx):
+    if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx) or \
+       node.name == ('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/Conv2D' % idx) or \
+       node.name == 'WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D':
         text_format.Merge('b: true', node.attr["loc_pred_transposed"])
         idx += 1
 assert(idx == args.num_layers)
@@ -224,13 +216,19 @@ for i in range(args.num_layers):
     priorBox = NodeDef()
     priorBox.name = 'PriorBox_%d' % i
     priorBox.op = 'PriorBox'
-    priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
+    if args.box_predictor == 'convolutional':
+        priorBox.input.append('BoxPredictor_%d/BoxEncodingPredictor/BiasAdd' % i)
+    else:
+        if i == 0:  # first predictor scope carries no "_%d" suffix
+            priorBox.input.append('WeightSharedConvolutionalBoxPredictor/BoxPredictor/Conv2D')
+        else:
+            priorBox.input.append('WeightSharedConvolutionalBoxPredictor_%d/BoxPredictor/BiasAdd' % i)
     priorBox.input.append(graph_def.node[0].name)  # image_tensor
 
     text_format.Merge('b: false', priorBox.attr["flip"])
     text_format.Merge('b: false', priorBox.attr["clip"])
 
-    if i == 0:
+    if i == 0 and not args.not_reduce_boxes_in_lowest_layer:
         widths = [0.1, args.min_scale * sqrt(2.0), args.min_scale * sqrt(0.5)]
         heights = [0.1, args.min_scale / sqrt(2.0), args.min_scale / sqrt(0.5)]
     else:
@@ -261,7 +259,10 @@ detectionOut = NodeDef()
 detectionOut.name = 'detection_out'
 detectionOut.op = 'DetectionOutput'
 
-detectionOut.input.append('BoxEncodingPredictor/concat')
+if args.box_predictor == 'convolutional':
+    detectionOut.input.append('BoxEncodingPredictor/concat')
+else:
+    detectionOut.input.append('BoxPredictor/concat')
 detectionOut.input.append(sigmoid.name)
 detectionOut.input.append('PriorBox/concat')
 
diff --git a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml
index 4def039e59..c8f8500db2 100644
--- a/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml
+++ b/samples/winrt/OcvImageProcessing/OcvImageProcessing/Common/StandardStyles.xaml
@@ -1091,7 +1091,7 @@ Style x:Key="SkipBackAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{Static
     </Style>
     <Style x:Key="PermissionsAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{StaticResource AppBarButtonStyle}">
         <Setter Property="AutomationProperties.AutomationId" Value="PermissionsAppBarButton"/>
-        <Setter Property="AutomationProperties.Name" Value="Permisions"/>
+        <Setter Property="AutomationProperties.Name" Value="Permissions"/>
         <Setter Property="Content" Value="&#xE192;"/>
     </Style>
     <Style x:Key="HighlightAppBarButtonStyle" TargetType="ButtonBase" BasedOn="{StaticResource AppBarButtonStyle}">