diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 6389b19894..de34aff4dd 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -346,7 +346,7 @@ elseif(MIPS) ocv_update(CPU_MSA_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_msa.cpp") ocv_update(CPU_KNOWN_OPTIMIZATIONS "MSA") ocv_update(CPU_MSA_FLAGS_ON "-mmsa") - set(CPU_BASELINE "MSA" CACHE STRING "${HELP_CPU_BASELINE}") + set(CPU_BASELINE "DETECT" CACHE STRING "${HELP_CPU_BASELINE}") elseif(PPC64LE) ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3") ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp") diff --git a/cmake/OpenCVFindMKL.cmake b/cmake/OpenCVFindMKL.cmake index 5eee3f5daa..19a76ddf57 100644 --- a/cmake/OpenCVFindMKL.cmake +++ b/cmake/OpenCVFindMKL.cmake @@ -133,7 +133,7 @@ message(STATUS "Found MKL ${MKL_VERSION_STR} at: ${MKL_ROOT_DIR}") set(HAVE_MKL ON) set(MKL_ROOT_DIR "${MKL_ROOT_DIR}" CACHE PATH "Path to MKL directory") set(MKL_INCLUDE_DIRS "${MKL_INCLUDE_DIRS}" CACHE PATH "Path to MKL include directory") -set(MKL_LIBRARIES "${MKL_LIBRARIES}" CACHE STRING "MKL libarries") +set(MKL_LIBRARIES "${MKL_LIBRARIES}" CACHE STRING "MKL libraries") if(UNIX AND NOT MKL_LIBRARIES_DONT_HACK) #it's ugly but helps to avoid cyclic lib problem set(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_LIBRARIES} ${MKL_LIBRARIES} "-lpthread" "-lm" "-ldl") diff --git a/cmake/platforms/OpenCV-WindowsPhone.cmake b/cmake/platforms/OpenCV-WindowsPhone.cmake index 8a496d3a7b..c32c256b75 100644 --- a/cmake/platforms/OpenCV-WindowsPhone.cmake +++ b/cmake/platforms/OpenCV-WindowsPhone.cmake @@ -1,4 +1,4 @@ -include("${CMAKE_CURRENT_LIST_DIR}/OpenCV_WinRT.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/OpenCV-WinRT.cmake") # Adding additional using directory for WindowsPhone 8.0 to get Windows.winmd properly if(WINRT_8_0) diff --git a/cmake/platforms/OpenCV-WindowsStore.cmake b/cmake/platforms/OpenCV-WindowsStore.cmake index 
8b5dfa5556..efc8b4f86d 100644 --- a/cmake/platforms/OpenCV-WindowsStore.cmake +++ b/cmake/platforms/OpenCV-WindowsStore.cmake @@ -1 +1 @@ -include("${CMAKE_CURRENT_LIST_DIR}/OpenCV_WinRT.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/OpenCV-WinRT.cmake") diff --git a/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown b/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown index 73e483943d..30ed918576 100644 --- a/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown +++ b/doc/js_tutorials/js_core/js_image_arithmetics/js_image_arithmetics.markdown @@ -27,7 +27,7 @@ src1.delete(); src2.delete(); dst.delete(); mask.delete(); Image Subtraction -------------- -You can subtract two images by OpenCV function, cv.subtract(). res = img1 - img2. Both images should be of same depth and type. +You can subtract two images by OpenCV function, cv.subtract(). res = img1 - img2. Both images should be of same depth and type. Note that when used with RGBA images, the alpha channel is also subtracted. 
For example, consider below sample: @code{.js} @@ -59,4 +59,4 @@ Try it -\endhtmlonly \ No newline at end of file +\endhtmlonly diff --git a/doc/py_tutorials/py_gui/py_table_of_contents_gui.markdown b/doc/py_tutorials/py_gui/py_table_of_contents_gui.markdown index a7ad0a1a43..471d464b55 100644 --- a/doc/py_tutorials/py_gui/py_table_of_contents_gui.markdown +++ b/doc/py_tutorials/py_gui/py_table_of_contents_gui.markdown @@ -4,21 +4,21 @@ Gui Features in OpenCV {#tutorial_py_table_of_contents_gui} - @subpage tutorial_py_image_display Learn to load an - image, display it and save it back + image, display it, and save it back - @subpage tutorial_py_video_display Learn to play videos, - capture videos from Camera and write it as a video + capture videos from a camera, and write videos - @subpage tutorial_py_drawing_functions Learn to draw lines, - rectangles, ellipses, circles etc with OpenCV + rectangles, ellipses, circles, etc with OpenCV - @subpage tutorial_py_mouse_handling - Draw stuffs with your + Draw stuff with your mouse - @subpage tutorial_py_trackbar diff --git a/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown b/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown index 9cea2359c7..d60b846245 100644 --- a/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown +++ b/doc/py_tutorials/py_gui/py_video_display/py_video_display.markdown @@ -4,19 +4,19 @@ Getting Started with Videos {#tutorial_py_video_display} Goal ---- -- Learn to read video, display video and save video. -- Learn to capture from Camera and display it. +- Learn to read video, display video, and save video. +- Learn to capture video from a camera and display it. - You will learn these functions : **cv.VideoCapture()**, **cv.VideoWriter()** Capture Video from Camera ------------------------- -Often, we have to capture live stream with camera. OpenCV provides a very simple interface to this. 
-Let's capture a video from the camera (I am using the in-built webcam of my laptop), convert it into +Often, we have to capture live stream with a camera. OpenCV provides a very simple interface to do this. +Let's capture a video from the camera (I am using the built-in webcam on my laptop), convert it into grayscale video and display it. Just a simple task to get started. To capture a video, you need to create a **VideoCapture** object. Its argument can be either the -device index or the name of a video file. Device index is just the number to specify which camera. +device index or the name of a video file. A device index is just the number to specify which camera. Normally one camera will be connected (as in my case). So I simply pass 0 (or -1). You can select the second camera by passing 1 and so on. After that, you can capture frame-by-frame. But at the end, don't forget to release the capture. @@ -46,16 +46,16 @@ while True: # When everything done, release the capture cap.release() cv.destroyAllWindows()@endcode -`cap.read()` returns a bool (`True`/`False`). If frame is read correctly, it will be `True`. So you can -check end of the video by checking this return value. +`cap.read()` returns a bool (`True`/`False`). If the frame is read correctly, it will be `True`. So you can +check for the end of the video by checking this returned value. -Sometimes, cap may not have initialized the capture. In that case, this code shows error. You can +Sometimes, cap may not have initialized the capture. In that case, this code shows an error. You can check whether it is initialized or not by the method **cap.isOpened()**. If it is `True`, OK. Otherwise open it using **cap.open()**. You can also access some of the features of this video using **cap.get(propId)** method where propId is a number from 0 to 18. Each number denotes a property of the video (if it is applicable to that -video) and full details can be seen here: cv::VideoCapture::get(). +video). 
Full details can be seen here: cv::VideoCapture::get(). Some of these values can be modified using **cap.set(propId, value)**. Value is the new value you want. @@ -63,13 +63,13 @@ For example, I can check the frame width and height by `cap.get(cv.CAP_PROP_FRAM 640x480 by default. But I want to modify it to 320x240. Just use `ret = cap.set(cv.CAP_PROP_FRAME_WIDTH,320)` and `ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT,240)`. -@note If you are getting error, make sure camera is working fine using any other camera application +@note If you are getting an error, make sure your camera is working fine using any other camera application (like Cheese in Linux). Playing Video from file ----------------------- -It is same as capturing from Camera, just change camera index with video file name. Also while +Playing video from file is the same as capturing it from camera, just change the camera index to a video file name. Also while displaying the frame, use appropriate time for `cv.waitKey()`. If it is too less, video will be very fast and if it is too high, video will be slow (Well, that is how you can display videos in slow motion). 25 milliseconds will be OK in normal cases. @@ -96,23 +96,23 @@ cap.release() cv.destroyAllWindows() @endcode -@note Make sure proper versions of ffmpeg or gstreamer is installed. Sometimes, it is a headache to -work with Video Capture mostly due to wrong installation of ffmpeg/gstreamer. +@note Make sure a proper version of ffmpeg or gstreamer is installed. Sometimes it is a headache to +work with video capture, mostly due to wrong installation of ffmpeg/gstreamer. Saving a Video -------------- -So we capture a video, process it frame-by-frame and we want to save that video. For images, it is -very simple, just use `cv.imwrite()`. Here a little more work is required. +So we capture a video and process it frame-by-frame, and we want to save that video. For images, it is +very simple: just use `cv.imwrite()`. Here, a little more work is required. 
This time we create a **VideoWriter** object. We should specify the output file name (eg: output.avi). Then we should specify the **FourCC** code (details in next paragraph). Then number of -frames per second (fps) and frame size should be passed. And last one is **isColor** flag. If it is -`True`, encoder expect color frame, otherwise it works with grayscale frame. +frames per second (fps) and frame size should be passed. And the last one is the **isColor** flag. If it is +`True`, the encoder expects color frames, otherwise it works with grayscale frames. [FourCC](http://en.wikipedia.org/wiki/FourCC) is a 4-byte code used to specify the video codec. The list of available codes can be found in [fourcc.org](http://www.fourcc.org/codecs.php). It is -platform dependent. Following codecs works fine for me. +platform dependent. The following codecs work fine for me. - In Fedora: DIVX, XVID, MJPG, X264, WMV1, WMV2. (XVID is more preferable. MJPG results in high size video. X264 gives very small size video) @@ -122,7 +122,7 @@ platform dependent. Following codecs works fine for me. FourCC code is passed as `cv.VideoWriter_fourcc('M','J','P','G')` or `cv.VideoWriter_fourcc(*'MJPG')` for MJPG. -Below code capture from a Camera, flip every frame in vertical direction and saves it. +The below code captures from a camera, flips every frame in the vertical direction, and saves the video. 
@code{.py} import numpy as np import cv2 as cv diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 64a7071ca2..a7c7bfd849 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -216,30 +216,30 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0; x = 1; -#if CV_SIMD128 +#if CV_SIMD { - v_int16x8 ftz = v_setall_s16((short) ftzero); - v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2)); - v_int16x8 z = v_setzero_s16(); + v_int16 ftz = vx_setall_s16((short) ftzero); + v_int16 ftz2 = vx_setall_s16((short)(ftzero*2)); + v_int16 z = vx_setzero_s16(); - for(; x <= (size.width - 1) - 8; x += 8 ) + for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes) { - v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1)); - v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1)); - v_int16x8 s10 = v_reinterpret_as_s16(v_load_expand(srow1 + x + 1)); - v_int16x8 s11 = v_reinterpret_as_s16(v_load_expand(srow1 + x - 1)); - v_int16x8 s20 = v_reinterpret_as_s16(v_load_expand(srow2 + x + 1)); - v_int16x8 s21 = v_reinterpret_as_s16(v_load_expand(srow2 + x - 1)); - v_int16x8 s30 = v_reinterpret_as_s16(v_load_expand(srow3 + x + 1)); - v_int16x8 s31 = v_reinterpret_as_s16(v_load_expand(srow3 + x - 1)); + v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1)); + v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1)); + v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1)); + v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1)); + v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1)); + v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1)); + v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1)); + v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1)); - v_int16x8 d0 = s00 - s01; - v_int16x8 d1 = s10 - s11; - v_int16x8 d2 = s20 
- s21; - v_int16x8 d3 = s30 - s31; + v_int16 d0 = s00 - s01; + v_int16 d1 = s10 - s11; + v_int16 d2 = s20 - s21; + v_int16 d3 = s30 - s31; - v_uint16x8 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z)); - v_uint16x8 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z)); + v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z)); + v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z)); v_pack_store(dptr0 + x, v0); v_pack_store(dptr1 + x, v1); @@ -262,10 +262,10 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) { uchar* dptr = dst.ptr(y); x = 0; -#if CV_SIMD128 +#if CV_SIMD { - v_uint8x16 val0_16 = v_setall_u8(val0); - for(; x <= size.width-16; x+=16 ) + v_uint8 val0_16 = vx_setall_u8(val0); + for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes) v_store(dptr + x, val0_16); } #endif @@ -309,13 +309,13 @@ inline int dispDescale(int v1, int v2, int d) return (int)(v1*256 + (d != 0 ? v2*256/d : 0)); // no need to add 127, this will be converted to float } -#if CV_SIMD128 +#if CV_SIMD template static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, Mat& disp, Mat& cost, StereoBMParams& state, uchar* buf, int _dy0, int _dy1 ) { - const int ALIGN = 16; + const int ALIGN = CV_SIMD_WIDTH; int x, y, d; int wsz = state.SADWindowSize, wsz2 = wsz/2; int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); @@ -345,7 +345,9 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, int coststep = cost.data ? 
(int)(cost.step/sizeof(costbuf)) : 0; const int TABSZ = 256; uchar tab[TABSZ]; - const v_int16x8 d0_8 = v_int16x8(0,1,2,3,4,5,6,7), dd_8 = v_setall_s16(8); + short v_seq[v_int16::nlanes]; + for (short i = 0; i < v_int16::nlanes; ++i) + v_seq[i] = i; sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN); hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); @@ -368,20 +370,26 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep ) { int lval = lptr[0]; - v_uint8x16 lv = v_setall_u8((uchar)lval); - for( d = 0; d < ndisp; d += 16 ) + v_uint8 lv = vx_setall_u8((uchar)lval); + for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes ) { - v_uint8x16 rv = v_load(rptr + d); - v_uint16x8 hsad_l = v_load(hsad + d); - v_uint16x8 hsad_h = v_load(hsad + d + 8); - v_uint8x16 diff = v_absdiff(lv, rv); + v_uint8 diff = v_absdiff(lv, vx_load(rptr + d)); v_store(cbuf + d, diff); - v_uint16x8 diff0, diff1; - v_expand(diff, diff0, diff1); - hsad_l += diff0; - hsad_h += diff1; - v_store(hsad + d, hsad_l); - v_store(hsad + d + 8, hsad_h); + v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff)); + v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)); + } + if( d <= ndisp - v_uint16::nlanes ) + { + v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d)); + v_store_low(cbuf + d, diff); + v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff)); + d += v_uint16::nlanes; + } + for( ; d < ndisp; d++ ) + { + int diff = abs(lval - rptr[d]); + cbuf[d] = (uchar)diff; + hsad[d] += (ushort)diff; } htext[y] += tab[lval]; } @@ -412,24 +420,27 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep ) { int lval = lptr[0]; - v_uint8x16 lv = v_setall_u8((uchar)lval); - for( d = 0; d < ndisp; d += 16 ) + v_uint8 lv = 
vx_setall_u8((uchar)lval); + for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes ) { - v_uint8x16 rv = v_load(rptr + d); - v_uint16x8 hsad_l = v_load(hsad + d); - v_uint16x8 hsad_h = v_load(hsad + d + 8); - v_uint8x16 cbs = v_load(cbuf_sub + d); - v_uint8x16 diff = v_absdiff(lv, rv); - v_int16x8 diff_l, diff_h, cbs_l, cbs_h; + v_uint8 diff = v_absdiff(lv, vx_load(rptr + d)); + v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d)); v_store(cbuf + d, diff); - v_expand(v_reinterpret_as_s8(diff), diff_l, diff_h); - v_expand(v_reinterpret_as_s8(cbs), cbs_l, cbs_h); - diff_l -= cbs_l; - diff_h -= cbs_h; - hsad_h = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_h) + diff_h); - hsad_l = v_reinterpret_as_u16(v_reinterpret_as_s16(hsad_l) + diff_l); - v_store(hsad + d, hsad_l); - v_store(hsad + d + 8, hsad_h); + v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs))); + v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs))); + } + if( d <= ndisp - v_uint16::nlanes) + { + v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d)); + v_store_low(cbuf + d, diff); + v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d))); + d += v_uint16::nlanes; + } + for( ; d < ndisp; d++ ) + { + int diff = abs(lval - rptr[d]); + cbuf[d] = (uchar)diff; + hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d]; } htext[y] += tab[lval] - tab[lptr_sub[0]]; } @@ -446,17 +457,25 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, hsad = hsad0 + (1 - dy0)*ndisp; for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) - for( d = 0; d <= ndisp-16; d += 16 ) + { + for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes ) { - v_uint16x8 s0 = v_load(sad + d); - v_uint16x8 s1 = v_load(sad + d + 8); - v_uint16x8 t0 = 
v_load(hsad + d); - v_uint16x8 t1 = v_load(hsad + d + 8); - s0 = s0 + t0; - s1 = s1 + t1; - v_store(sad + d, s0); - v_store(sad + d + 8, s1); + v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d)); + v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes)); } + if( d <= ndisp-v_uint16::nlanes ) + { + v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d)); + d += v_uint16::nlanes; + } + if( d <= ndisp-v_uint16::nlanes/2 ) + { + v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d)); + d += v_uint16::nlanes/2; + } + for( ; d < ndisp; d++ ) + sad[d] = sad[d] + hsad[d]; + } int tsum = 0; for( y = -wsz2-1; y < wsz2; y++ ) tsum += htext[y]; @@ -467,38 +486,41 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, int minsad = INT_MAX, mind = -1; hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; - v_int16x8 minsad8 = v_setall_s16(SHRT_MAX); - v_int16x8 mind8 = v_setall_s16(0), d8 = d0_8; + v_int16 minsad8 = vx_setall_s16(SHRT_MAX); + v_int16 mind8 = vx_setall_s16(0); - for( d = 0; d < ndisp; d += 16 ) + for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes ) { - v_int16x8 u0 = v_reinterpret_as_s16(v_load(hsad_sub + d)); - v_int16x8 u1 = v_reinterpret_as_s16(v_load(hsad + d)); + v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d)); + v_store(sad + d, v_reinterpret_as_u16(sad8)); + mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d)); + minsad8 = v_min(minsad8, sad8); - v_int16x8 v0 = v_reinterpret_as_s16(v_load(hsad_sub + d + 8)); - v_int16x8 v1 = v_reinterpret_as_s16(v_load(hsad + d + 8)); - - v_int16x8 usad8 = v_reinterpret_as_s16(v_load(sad + d)); - v_int16x8 vsad8 = v_reinterpret_as_s16(v_load(sad + d + 8)); - - u1 -= u0; - v1 -= v0; - usad8 += u1; - vsad8 += v1; - - v_int16x8 mask = minsad8 > usad8; - minsad8 = 
v_min(minsad8, usad8); - mind8 = v_max(mind8, (mask& d8)); - - v_store(sad + d, v_reinterpret_as_u16(usad8)); - v_store(sad + d + 8, v_reinterpret_as_u16(vsad8)); - - mask = minsad8 > vsad8; - minsad8 = v_min(minsad8, vsad8); - - d8 = d8 + dd_8; - mind8 = v_max(mind8, (mask & d8)); - d8 = d8 + dd_8; + sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes)); + v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8)); + mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes)); + minsad8 = v_min(minsad8, sad8); + } + if( d <= ndisp - v_int16::nlanes ) + { + v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d)); + v_store(sad + d, v_reinterpret_as_u16(sad8)); + mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d)); + minsad8 = v_min(minsad8, sad8); + d += v_int16::nlanes; + } + minsad = v_reduce_min(minsad8); + v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8); + mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask)); + for( ; d < ndisp; d++ ) + { + int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d]; + sad[d] = (ushort)sad8; + if(minsad > sad8) + { + mind = d; + minsad = sad8; + } } tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; @@ -508,41 +530,45 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, continue; } - ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8]; - v_store(minsad_buf, v_reinterpret_as_u16(minsad8)); - v_store(mind_buf, v_reinterpret_as_u16(mind8)); - for( d = 0; d < 8; d++ ) - if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d])) - { - minsad = minsad_buf[d]; - mind = mind_buf[d]; - } - if( uniquenessRatio > 0 ) { int thresh = minsad + (minsad * uniquenessRatio/100); - v_int32x4 
thresh4 = v_setall_s32(thresh + 1); - v_int32x4 d1 = v_setall_s32(mind-1), d2 = v_setall_s32(mind+1); - v_int32x4 dd_4 = v_setall_s32(4); - v_int32x4 d4 = v_int32x4(0,1,2,3); - v_int32x4 mask4; + v_int32 thresh4 = vx_setall_s32(thresh + 1); + v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1); + v_int32 dd_4 = vx_setall_s32(v_int32::nlanes); + v_int32 d4 = vx_load_expand(v_seq); - for( d = 0; d < ndisp; d += 8 ) + for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 sad8 = v_reinterpret_as_s16(v_load(sad + d)); - v_int32x4 sad4_l, sad4_h; - v_expand(sad8, sad4_l, sad4_h); - mask4 = thresh4 > sad4_l; - mask4 = mask4 & ((d1 > d4) | (d4 > d2)); - if( v_check_any(mask4) ) + v_int32 sad4_l, sad4_h; + v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h); + if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) ) break; d4 += dd_4; - mask4 = thresh4 > sad4_h; - mask4 = mask4 & ((d1 > d4) | (d4 > d2)); - if( v_check_any(mask4) ) + if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) ) break; d4 += dd_4; } + if( d <= ndisp - v_int16::nlanes ) + { + dptr[y*dstep] = FILTERED; + continue; + } + if( d <= ndisp - v_int32::nlanes ) + { + v_int32 sad4_l = vx_load_expand((short*)sad + d); + if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2)))) + { + dptr[y*dstep] = FILTERED; + continue; + } + d += v_int16::nlanes; + } + for( ; d < ndisp; d++ ) + { + if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) ) + break; + } if( d < ndisp ) { dptr[y*dstep] = FILTERED; @@ -571,7 +597,7 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, uchar* buf, int _dy0, int _dy1 ) { - const int ALIGN = 16; + const int ALIGN = CV_SIMD_WIDTH; int x, y, d; int wsz = state.SADWindowSize, wsz2 = wsz/2; int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); @@ -587,12 +613,6 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, const int disp_shift = dispShiftTemplate::value; mType FILTERED = 
(mType)((mindisp - 1) << disp_shift); -#if CV_SIMD128 - { - CV_Assert (ndisp % 8 == 0); - } -#endif - int *sad, *hsad0, *hsad, *hsad_sub, *htext; uchar *cbuf0, *cbuf; const uchar* lptr0 = left.ptr() + lofs; @@ -607,6 +627,13 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, const int TABSZ = 256; uchar tab[TABSZ]; +#if CV_SIMD + int v_seq[v_int32::nlanes]; + for (int i = 0; i < v_int32::nlanes; ++i) + v_seq[i] = i; + v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes); +#endif + sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN); hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); @@ -628,22 +655,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, { int lval = lptr[0]; d = 0; -#if CV_SIMD128 +#if CV_SIMD { - v_uint8x16 lv = v_setall_u8((uchar)lval); + v_uint8 lv = vx_setall_u8((uchar)lval); - for( ; d <= ndisp - 16; d += 16 ) + for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes ) { - v_uint8x16 rv = v_load(rptr + d); - v_int32x4 hsad_0 = v_load(hsad + d); - v_int32x4 hsad_1 = v_load(hsad + d + 4); - v_int32x4 hsad_2 = v_load(hsad + d + 8); - v_int32x4 hsad_3 = v_load(hsad + d + 12); - v_uint8x16 diff = v_absdiff(lv, rv); + v_uint8 rv = vx_load(rptr + d); + v_int32 hsad_0 = vx_load(hsad + d); + v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes); + v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes); + v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes); + v_uint8 diff = v_absdiff(lv, rv); v_store(cbuf + d, diff); - v_uint16x8 diff0, diff1; - v_uint32x4 diff00, diff01, diff10, diff11; + v_uint16 diff0, diff1; + v_uint32 diff00, diff01, diff10, diff11; v_expand(diff, diff0, diff1); v_expand(diff0, diff00, diff01); v_expand(diff1, diff10, diff11); @@ -654,9 +681,9 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, hsad_3 += v_reinterpret_as_s32(diff11); v_store(hsad + d, hsad_0); - v_store(hsad + d + 4, 
hsad_1); - v_store(hsad + d + 8, hsad_2); - v_store(hsad + d + 12, hsad_3); + v_store(hsad + d + v_int32::nlanes, hsad_1); + v_store(hsad + d + 2*v_int32::nlanes, hsad_2); + v_store(hsad + d + 3*v_int32::nlanes, hsad_3); } } #endif @@ -696,22 +723,22 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, { int lval = lptr[0]; d = 0; -#if CV_SIMD128 +#if CV_SIMD { - v_uint8x16 lv = v_setall_u8((uchar)lval); - for( ; d <= ndisp - 16; d += 16 ) + v_uint8 lv = vx_setall_u8((uchar)lval); + for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes ) { - v_uint8x16 rv = v_load(rptr + d); - v_int32x4 hsad_0 = v_load(hsad + d); - v_int32x4 hsad_1 = v_load(hsad + d + 4); - v_int32x4 hsad_2 = v_load(hsad + d + 8); - v_int32x4 hsad_3 = v_load(hsad + d + 12); - v_uint8x16 cbs = v_load(cbuf_sub + d); - v_uint8x16 diff = v_absdiff(lv, rv); + v_uint8 rv = vx_load(rptr + d); + v_int32 hsad_0 = vx_load(hsad + d); + v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes); + v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes); + v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes); + v_uint8 cbs = vx_load(cbuf_sub + d); + v_uint8 diff = v_absdiff(lv, rv); v_store(cbuf + d, diff); - v_uint16x8 diff0, diff1, cbs0, cbs1; - v_int32x4 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11; + v_uint16 diff0, diff1, cbs0, cbs1; + v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11; v_expand(diff, diff0, diff1); v_expand(cbs, cbs0, cbs1); v_expand(v_reinterpret_as_s16(diff0), diff00, diff01); @@ -719,19 +746,19 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01); v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11); - v_int32x4 diff_0 = diff00 - cbs00; - v_int32x4 diff_1 = diff01 - cbs01; - v_int32x4 diff_2 = diff10 - cbs10; - v_int32x4 diff_3 = diff11 - cbs11; + v_int32 diff_0 = diff00 - cbs00; + v_int32 diff_1 = diff01 - cbs01; + v_int32 diff_2 = diff10 - cbs10; + v_int32 diff_3 = diff11 - cbs11; 
hsad_0 += diff_0; hsad_1 += diff_1; hsad_2 += diff_2; hsad_3 += diff_3; v_store(hsad + d, hsad_0); - v_store(hsad + d + 4, hsad_1); - v_store(hsad + d + 8, hsad_2); - v_store(hsad + d + 12, hsad_3); + v_store(hsad + d + v_int32::nlanes, hsad_1); + v_store(hsad + d + 2*v_int32::nlanes, hsad_2); + v_store(hsad + d + 3*v_int32::nlanes, hsad_3); } } #endif @@ -758,18 +785,18 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp ) { d = 0; -#if CV_SIMD128 +#if CV_SIMD { - for( d = 0; d <= ndisp-8; d += 8 ) + for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes ) { - v_int32x4 s0 = v_load(sad + d); - v_int32x4 s1 = v_load(sad + d + 4); - v_int32x4 t0 = v_load(hsad + d); - v_int32x4 t1 = v_load(hsad + d + 4); + v_int32 s0 = vx_load(sad + d); + v_int32 s1 = vx_load(sad + d + v_int32::nlanes); + v_int32 t0 = vx_load(hsad + d); + v_int32 t1 = vx_load(hsad + d + v_int32::nlanes); s0 += t0; s1 += t1; v_store(sad + d, s0); - v_store(sad + d + 4, s1); + v_store(sad + d + v_int32::nlanes, s1); } } #endif @@ -787,50 +814,31 @@ findStereoCorrespondenceBM( const Mat& left, const Mat& right, hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; d = 0; -#if CV_SIMD128 +#if CV_SIMD { - v_int32x4 d0_4 = v_int32x4(0, 1, 2, 3); - v_int32x4 dd_4 = v_setall_s32(4); - v_int32x4 minsad4 = v_setall_s32(INT_MAX); - v_int32x4 mind4 = v_setall_s32(0), d4 = d0_4; + v_int32 minsad4 = vx_setall_s32(INT_MAX); + v_int32 mind4 = vx_setall_s32(0), d4 = d0_4; - for( ; d <= ndisp - 8; d += 8 ) + for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes ) { - v_int32x4 u0 = v_load(hsad_sub + d); - v_int32x4 u1 = v_load(hsad + d); - - v_int32x4 v0 = v_load(hsad_sub + d + 4); - v_int32x4 v1 = v_load(hsad + d + 4); - - v_int32x4 usad4 = v_load(sad + d); - v_int32x4 vsad4 = v_load(sad + d + 4); - - u1 -= u0; - v1 -= v0; - usad4 += u1; - vsad4 += v1; - - v_store(sad + d, usad4); - v_store(sad 
+ d + 4, vsad4); - - v_int32x4 mask = minsad4 > usad4; - minsad4 = v_min(minsad4, usad4); - mind4 = v_select(mask, d4, mind4); + v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d); + v_store(sad + d, sad4); + mind4 = v_select(minsad4 > sad4, d4, mind4); + minsad4 = v_min(minsad4, sad4); d4 += dd_4; - mask = minsad4 > vsad4; - minsad4 = v_min(minsad4, vsad4); - mind4 = v_select(mask, d4, mind4); + sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes); + v_store(sad + d + v_int32::nlanes, sad4); + mind4 = v_select(minsad4 > sad4, d4, mind4); + minsad4 = v_min(minsad4, sad4); d4 += dd_4; } - int CV_DECL_ALIGNED(16) minsad_buf[4], mind_buf[4]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes]; v_store(minsad_buf, minsad4); v_store(mind_buf, mind4); - if(minsad_buf[0] < minsad || (minsad == minsad_buf[0] && mind_buf[0] < mind)) { minsad = minsad_buf[0]; mind = mind_buf[0]; } - if(minsad_buf[1] < minsad || (minsad == minsad_buf[1] && mind_buf[1] < mind)) { minsad = minsad_buf[1]; mind = mind_buf[1]; } - if(minsad_buf[2] < minsad || (minsad == minsad_buf[2] && mind_buf[2] < mind)) { minsad = minsad_buf[2]; mind = mind_buf[2]; } - if(minsad_buf[3] < minsad || (minsad == minsad_buf[3] && mind_buf[3] < mind)) { minsad = minsad_buf[3]; mind = mind_buf[3]; } + for (int i = 0; i < v_int32::nlanes; ++i) + if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; } } #endif for( ; d < ndisp; d++ ) @@ -1027,7 +1035,7 @@ struct FindStereoCorrespInvoker : public ParallelLoopBody Mat disp_i = disp->rowRange(row0, row1); Mat cost_i = state->disp12MaxDiff >= 0 ? 
cost->rowRange(row0, row1) : Mat(); -#if CV_SIMD128 +#if CV_SIMD if (useShorts) { if( disp_i.type() == CV_16S) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 8f6c982c72..fbd6f470cd 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1012,6 +1012,54 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd) OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd) +/** Reverse **/ +inline v_uint8x32 v_reverse(const v_uint8x32 &a) +{ + static const __m256i perm = _mm256_setr_epi8( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i vec = _mm256_shuffle_epi8(a.val, perm); + return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1)); +} + +inline v_int8x32 v_reverse(const v_int8x32 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x16 v_reverse(const v_uint16x16 &a) +{ + static const __m256i perm = _mm256_setr_epi8( + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m256i vec = _mm256_shuffle_epi8(a.val, perm); + return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1)); +} + +inline v_int16x16 v_reverse(const v_int16x16 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x8 v_reverse(const v_uint32x8 &a) +{ + static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0); + return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm)); +} + +inline v_int32x8 v_reverse(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x8 v_reverse(const v_float32x8 &a) +{ return 
v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x4 v_reverse(const v_uint64x4 &a) +{ + return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3))); +} + +inline v_int64x4 v_reverse(const v_int64x4 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x4 v_reverse(const v_float64x4 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + ////////// Reduce and mask ///////// /** Reduce **/ diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index 844c546e38..2c31a8d014 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -1068,6 +1068,79 @@ OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64) OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd) +/** Reverse **/ +inline v_uint8x64 v_reverse(const v_uint8x64 &a) +{ +#if CV_AVX_512VBMI + static const __m512i perm = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f, + 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f, + 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint8x64(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int8x64 v_reverse(const v_int8x64 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x32 v_reverse(const v_uint16x32 &a) +{ +#if CV_AVX_512VBMI + 
static const __m512i perm = _mm512_set_epi32( + 0x00000001, 0x00020003, 0x00040005, 0x00060007, + 0x00080009, 0x000a000b, 0x000c000d, 0x000e000f, + 0x00100011, 0x00120013, 0x00140015, 0x00160017, + 0x00180019, 0x001a001b, 0x001c001d, 0x001e001f); + return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint16x32(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int16x32 v_reverse(const v_int16x32 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x16 v_reverse(const v_uint32x16 &a) +{ + static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15); + return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val)); +} + +inline v_int32x16 v_reverse(const v_int32x16 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x16 v_reverse(const v_float32x16 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x8 v_reverse(const v_uint64x8 &a) +{ + static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val)); +} + +inline v_int64x8 v_reverse(const v_int64x8 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x8 v_reverse(const v_float64x8 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + ////////// Reduce ///////// /** Reduce **/ diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 39de0b5a09..031420e9fc 100644 --- 
a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -112,6 +112,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u, @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high +- Reverse: @ref v_reverse - Extract: @ref v_extract @@ -215,6 +216,7 @@ Regular integers: |cvt_flt32 | | | | | | x | |cvt_flt64 | | | | | | x | |transpose4x4 | | | | | x | x | +|reverse | x | x | x | x | x | x | Big integers: @@ -224,6 +226,7 @@ Big integers: |add, sub | x | x | |shift | x | x | |logical | x | x | +|reverse | x | x | |extract | x | x | |rotate (lanes) | x | x | |cvt_flt64 | | x | @@ -250,6 +253,7 @@ Floating point: |transpose4x4 | x | | |extract | x | x | |rotate (lanes) | x | x | +|reverse | x | x | @{ */ @@ -1724,6 +1728,23 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, } } +/** @brief Vector reverse order + +Reverse the order of the vector +Scheme: +@code + REG {A1 ... An} ==> REG {An ... A1} +@endcode +For all types. 
*/ +template +inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = a.s[n-i-1]; + return c; +} + /** @brief Vector extract Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 5ece9c131e..4dbdfef49d 100755 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -906,6 +906,57 @@ OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64) OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32) OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64) + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val)); + return c; +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val)); + return c; +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + v_uint32x4 c; + c.val[0] = a.val[3]; + c.val[1] = a.val[2]; + c.val[2] = a.val[1]; + c.val[3] = a.val[0]; + return c; +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + v_uint64x2 c; + c.val[0] = a.val[1]; + c.val[1] = a.val[0]; + return c; +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return 
v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + #define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \ inline unsigned short v_reduce_##func(const v_uint16x8& a) \ { \ diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 3e8321aca3..abbd635fac 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1585,6 +1585,52 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32) OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64) #endif +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + uint8x16_t vec = vrev64q_u8(a.val); + return v_uint8x16(vextq_u8(vec, vec, 8)); +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + uint16x8_t vec = vrev64q_u16(a.val); + return v_uint16x8(vextq_u16(vec, vec, 4)); +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + uint32x4_t vec = vrev64q_u32(a.val); + return v_uint32x4(vextq_u32(vec, vec, 2)); +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + uint64x2_t vec = a.val; + uint64x1_t vec_lo = vget_low_u64(vec); + uint64x1_t vec_hi = vget_high_u64(vec); + return v_uint64x2(vcombine_u64(vec_hi, vec_lo)); +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +#if CV_SIMD128_64F +inline v_float64x2 
v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } +#endif + #define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \ template \ inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index c4de1195b5..e7370504ef 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1220,14 +1220,23 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) -#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \ +#if CV_SSE4_1 +#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ +{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } +{ return ~(a == b); } +#else +#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \ + return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return ~(a == b); } +#endif -OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) -OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) +OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) +OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2) inline v_float32x4 v_not_nan(const v_float32x4& a) { return v_float32x4(_mm_cmpord_ps(a.val, a.val)); } @@ -1914,6 +1923,59 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) 
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ +#if CV_SSSE3 + static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return v_uint8x16(_mm_shuffle_epi8(a.val, perm)); +#else + uchar CV_DECL_ALIGNED(32) d[16]; + v_store_aligned(d, a); + return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]); +#endif +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ +#if CV_SSSE3 + static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + return v_uint16x8(_mm_shuffle_epi8(a.val, perm)); +#else + __m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)); + r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1)); + r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1)); + return v_uint16x8(r); +#endif +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3))); +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2))); +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return 
v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + template inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 0d65ca5e7a..5b4a0d4137 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -678,6 +678,53 @@ OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2) OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2) OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2) +/* Reverse */ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + vec_uchar16 vec = (vec_uchar16)a.val; + return v_uint8x16(vec_perm(vec, vec, perm)); +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1}; + vec_uchar16 vec = (vec_uchar16)a.val; + return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm))); +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + vec_uchar16 vec = (vec_uchar16)a.val; + return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm))); +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + vec_uchar16 vec = (vec_uchar16)a.val; + return 
v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm))); +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + /* Extract */ template inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index f2da617cfe..4b8cd61dd2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -21,6 +21,18 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046) +// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673) +#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4 +#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4 +#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2 +#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2 +#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4 +#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4 +#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2 +#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2 +#endif // COMPATIBILITY: <1.38.46 + ///////// Types /////////// struct v_uint8x16 @@ -3111,6 +3123,38 @@ OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float) OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double) +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ 
return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \ inline scalartype v_reduce_sum(const _Tpvec& a) \ { \ @@ -3400,25 +3444,25 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) inline v_int32x4 v_round(const v_float32x4& a) { v128_t h = wasm_f32x4_splat(0.5); - return v_int32x4(wasm_trunc_saturate_i32x4_f32x4(wasm_f32x4_add(a.val, h))); + return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h))); } inline v_int32x4 v_floor(const v_float32x4& a) { - v128_t a1 = wasm_trunc_saturate_i32x4_f32x4(a.val); - v128_t mask = wasm_f32x4_lt(a.val, wasm_convert_f32x4_i32x4(a1)); + v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val); + v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1)); return v_int32x4(wasm_i32x4_add(a1, mask)); } inline v_int32x4 
v_ceil(const v_float32x4& a) { - v128_t a1 = wasm_trunc_saturate_i32x4_f32x4(a.val); - v128_t mask = wasm_f32x4_gt(a.val, wasm_convert_f32x4_i32x4(a1)); + v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val); + v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1)); return v_int32x4(wasm_i32x4_sub(a1, mask)); } inline v_int32x4 v_trunc(const v_float32x4& a) -{ return v_int32x4(wasm_trunc_saturate_i32x4_f32x4(a.val)); } +{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); } #define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc, _Tpvec, _Tpnvec, _Tp, _Tpn) \ inline _Tpnvec func(const _Tpvec& a) \ @@ -3924,7 +3968,7 @@ OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, inline v_float32x4 v_cvt_f32(const v_int32x4& a) { - return v_float32x4(wasm_convert_f32x4_i32x4(a.val)); + return v_float32x4(wasm_f32x4_convert_i32x4(a.val)); } inline v_float32x4 v_cvt_f32(const v_float64x2& a) @@ -3943,7 +3987,7 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a) { #ifdef __wasm_unimplemented_simd128__ v128_t p = v128_cvti32x4_i64x2(a.val); - return v_float64x2(wasm_convert_f64x2_i64x2(p)); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); #else fallback::v_int32x4 a_(a); return fallback::v_cvt_f64(a_); @@ -3954,7 +3998,7 @@ inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) { #ifdef __wasm_unimplemented_simd128__ v128_t p = v128_cvti32x4_i64x2_high(a.val); - return v_float64x2(wasm_convert_f64x2_i64x2(p)); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); #else fallback::v_int32x4 a_(a); return fallback::v_cvt_f64_high(a_); @@ -3976,7 +4020,7 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) inline v_float64x2 v_cvt_f64(const v_int64x2& a) { #ifdef __wasm_unimplemented_simd128__ - return v_float64x2(wasm_convert_f64x2_i64x2(a.val)); + return v_float64x2(wasm_f64x2_convert_i64x2(a.val)); #else fallback::v_int64x2 a_(a); return fallback::v_cvt_f64(a_); diff --git a/modules/core/src/alloc.cpp b/modules/core/src/alloc.cpp index 
8384f6dd53..98012998fc 100644 --- a/modules/core/src/alloc.cpp +++ b/modules/core/src/alloc.cpp @@ -112,6 +112,13 @@ bool isAlignedAllocationEnabled() } return useMemalign; } +// do not use variable directly, details: https://github.com/opencv/opencv/issues/15691 +static const bool g_force_initialization_memalign_flag +#if defined __GNUC__ + __attribute__((unused)) +#endif + = isAlignedAllocationEnabled(); + #endif #ifdef OPENCV_ALLOC_ENABLE_STATISTICS diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index fb721e2d63..c1478de763 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -711,6 +711,13 @@ static bool ipp_flip(Mat &src, Mat &dst, int flip_mode) #ifdef HAVE_IPP_IW CV_INSTRUMENT_REGION_IPP(); + // Details: https://github.com/opencv/opencv/issues/12943 + if (flip_mode <= 0 /* swap rows */ + && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42 + && (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/ + ) + return false; + IppiAxis ippMode; if(flip_mode < 0) ippMode = ippAxsBoth; diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp index 4c01c08850..6994564127 100644 --- a/modules/core/src/count_non_zero.simd.hpp +++ b/modules/core/src/count_non_zero.simd.hpp @@ -179,7 +179,25 @@ static int countNonZero32f( const float* src, int len ) static int countNonZero64f( const double* src, int len ) { - return countNonZero_(src, len); + int nz = 0, i = 0; +#if CV_SIMD_64F + v_int64 sum1 = vx_setzero_s64(); + v_int64 sum2 = vx_setzero_s64(); + v_float64 zero = vx_setzero_f64(); + int step = v_float64::nlanes * 2; + int len0 = len & -step; + + for(i = 0; i < len0; i += step ) + { + sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero); + sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero); + } + + // N.B the value is incremented by -1 (0xF...F) for each value + nz = i + (int)v_reduce_sum(sum1 + sum2); + v_cleanup(); +#endif + return nz + countNonZero_(src + 
i, len - i); } CountNonZeroFunc getCountNonZeroTab(int depth) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index fcb6b93a3c..bd1e24722c 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1115,6 +1115,22 @@ template struct TheTest return *this; } + TheTest & test_reverse() + { + Data dataA; + R a = dataA; + + Data resB = v_reverse(a); + + for (int i = 0; i < R::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]); + } + + return *this; + } + template TheTest & test_extract() { @@ -1426,6 +1442,50 @@ template struct TheTest return *this; } #endif + +#if CV_SIMD_64F + TheTest & test_cmp64() + { + Data dataA, dataB; + R a = dataA, b = dataB; + + for (int i = 0; i < R::nlanes; ++i) + { + dataA[i] = dataB[i]; + } + dataA[0]++; + + a = dataA, b = dataB; + + Data resC = (a == b); + Data resD = (a != b); + + for (int i = 0; i < R::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); + EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0); + } + + for (int i = 0; i < R::nlanes; ++i) + { + dataA[i] = dataB[i] = (LaneType)-1; + } + + a = dataA, b = dataB; + + resC = (a == b); + resD = (a != b); + + for (int i = 0; i < R::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); + EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0); + } + return *this; + } +#endif }; @@ -1459,6 +1519,7 @@ void test_hal_intrin_uint8() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() .test_pack_b() .test_unpack() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() ; @@ -1497,6 +1558,7 @@ void test_hal_intrin_int8() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_unpack() + .test_reverse() 
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() ; @@ -1529,6 +1591,7 @@ void test_hal_intrin_uint16() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>() .test_unpack() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() ; @@ -1561,6 +1624,7 @@ void test_hal_intrin_int16() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_unpack() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>() .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>() ; @@ -1590,6 +1654,7 @@ void test_hal_intrin_uint32() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_unpack() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() .test_transpose() @@ -1619,6 +1684,7 @@ void test_hal_intrin_int32() .test_mask() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_unpack() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() .test_float_cvt32() @@ -1635,8 +1701,12 @@ void test_hal_intrin_uint64() TheTest() .test_loadstore() .test_addsub() +#if CV_SIMD_64F + .test_cmp64() +#endif .test_shift<1>().test_shift<8>() .test_logic() + .test_reverse() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() ; @@ -1648,8 +1718,12 @@ void test_hal_intrin_int64() TheTest() .test_loadstore() .test_addsub() +#if CV_SIMD_64F + .test_cmp64() +#endif .test_shift<1>().test_shift<8>() .test_logic() + .test_reverse() .test_extract<0>().test_extract<1>() 
.test_rotate<0>().test_rotate<1>() .test_cvt64_double() @@ -1680,6 +1754,7 @@ void test_hal_intrin_float32() .test_matmul() .test_transpose() .test_reduce_sum4() + .test_reverse() .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>() .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>() ; @@ -1709,6 +1784,7 @@ void test_hal_intrin_float64() .test_unpack() .test_float_math() .test_float_cvt32() + .test_reverse() .test_extract<0>().test_extract<1>() .test_rotate<0>().test_rotate<1>() ; diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 885ff0e43f..d2328771d9 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -2025,4 +2025,17 @@ TEST(Core_Eigen, eigen2cv_check_Mat_type) } #endif // HAVE_EIGEN +TEST(Mat, regression_12943) // memory usage: ~4.5 Gb +{ + applyTestTag(CV_TEST_TAG_MEMORY_6GB); + + const int width = 0x8000; + const int height = 0x10001; + + cv::Mat src(height, width, CV_8UC1, Scalar::all(128)); + + cv::Mat dst; + cv::flip(src, dst, 0); +} + }} // namespace diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp index b96cf3d6a6..3b372f93d5 100644 --- a/modules/dnn/include/opencv2/dnn/version.hpp +++ b/modules/dnn/include/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. 
-#define OPENCV_DNN_API_VERSION 20190902 +#define OPENCV_DNN_API_VERSION 20191024 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION) diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index 54a53fd867..3a90081e17 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -128,7 +128,7 @@ namespace cv { void setConvolution(int kernel, int pad, int stride, - int filters_num, int channels_num, int use_batch_normalize, int use_relu) + int filters_num, int channels_num, int use_batch_normalize) { cv::dnn::LayerParams conv_param = getParamConvolution(kernel, pad, stride, filters_num); @@ -168,27 +168,29 @@ namespace cv { net->layers.push_back(lp); } - if (use_relu) - { - cv::dnn::LayerParams activation_param; - activation_param.set("negative_slope", 0.1f); - activation_param.name = "ReLU-name"; - activation_param.type = "ReLU"; - - darknet::LayerParameter lp; - std::string layer_name = cv::format("relu_%d", layer_id); - lp.layer_name = layer_name; - lp.layer_type = activation_param.type; - lp.layerParams = activation_param; - lp.bottom_indexes.push_back(last_layer); - last_layer = layer_name; - net->layers.push_back(lp); - } - layer_id++; fused_layer_names.push_back(last_layer); } + void setReLU() + { + cv::dnn::LayerParams activation_param; + activation_param.set("negative_slope", 0.1f); + activation_param.name = "ReLU-name"; + activation_param.type = "ReLU"; + + darknet::LayerParameter lp; + std::string layer_name = cv::format("relu_%d", layer_id); + lp.layer_name = layer_name; + lp.layer_type = activation_param.type; + lp.layerParams = activation_param; + lp.bottom_indexes.push_back(last_layer); + last_layer = layer_name; + net->layers.push_back(lp); + + fused_layer_names.back() = last_layer; + } + void setMaxpool(size_t kernel, size_t pad, size_t stride) { cv::dnn::LayerParams 
maxpool_param; @@ -409,12 +411,19 @@ namespace cv { fused_layer_names.push_back(last_layer); } - void setShortcut(int from) + void setShortcut(int from, float alpha) { cv::dnn::LayerParams shortcut_param; shortcut_param.name = "Shortcut-name"; shortcut_param.type = "Eltwise"; + if (alpha != 1) + { + std::vector coeffs(2, 1); + coeffs[0] = alpha; + shortcut_param.set("coeff", DictValue::arrayReal(&coeffs[0], coeffs.size())); + } + shortcut_param.set("op", "sum"); darknet::LayerParameter lp; @@ -422,8 +431,8 @@ namespace cv { lp.layer_name = layer_name; lp.layer_type = shortcut_param.type; lp.layerParams = shortcut_param; - lp.bottom_indexes.push_back(fused_layer_names.at(from)); lp.bottom_indexes.push_back(last_layer); + lp.bottom_indexes.push_back(fused_layer_names.at(from)); last_layer = layer_name; net->layers.push_back(lp); @@ -548,10 +557,7 @@ namespace cv { int pad = getParam(layer_params, "pad", 0); int stride = getParam(layer_params, "stride", 1); int filters = getParam(layer_params, "filters", -1); - std::string activation = getParam(layer_params, "activation", "linear"); bool batch_normalize = getParam(layer_params, "batch_normalize", 0) == 1; - if(activation != "linear" && activation != "leaky") - CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation); int flipped = getParam(layer_params, "flipped", 0); if (flipped == 1) CV_Error(cv::Error::StsNotImplemented, "Transpose the convolutional weights is not implemented"); @@ -563,7 +569,7 @@ namespace cv { CV_Assert(current_channels > 0); setParams.setConvolution(kernel_size, pad, stride, filters, current_channels, - batch_normalize, activation == "leaky"); + batch_normalize); current_channels = filters; } @@ -593,7 +599,7 @@ namespace cv { current_channels = 0; for (size_t k = 0; k < layers_vec.size(); ++k) { - layers_vec[k] = layers_vec[k] > 0 ? layers_vec[k] : (layers_vec[k] + layers_counter); + layers_vec[k] = layers_vec[k] >= 0 ? 
layers_vec[k] : (layers_vec[k] + layers_counter); current_channels += net->out_channels_vec[layers_vec[k]]; } @@ -631,13 +637,15 @@ namespace cv { else if (layer_type == "shortcut") { std::string bottom_layer = getParam(layer_params, "from", ""); + float alpha = getParam(layer_params, "alpha", 1); + float beta = getParam(layer_params, "beta", 0); + if (beta != 0) + CV_Error(Error::StsNotImplemented, "Non-zero beta"); CV_Assert(!bottom_layer.empty()); int from = std::atoi(bottom_layer.c_str()); - from += layers_counter; - current_channels = net->out_channels_vec[from]; - - setParams.setShortcut(from); + from = from < 0 ? from + layers_counter : from; + setParams.setShortcut(from, alpha); } else if (layer_type == "upsample") { @@ -667,6 +675,15 @@ namespace cv { else { CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type); } + + std::string activation = getParam(layer_params, "activation", "linear"); + if (activation == "leaky") + { + setParams.setReLU(); + } + else if (activation != "linear") + CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation); + net->out_channels_vec[layers_counter] = current_channels; } @@ -710,7 +727,6 @@ namespace cv { { int kernel_size = getParam(layer_params, "size", -1); int filters = getParam(layer_params, "filters", -1); - std::string activation = getParam(layer_params, "activation", "linear"); bool use_batch_normalize = getParam(layer_params, "batch_normalize", 0) == 1; CV_Assert(kernel_size > 0 && filters > 0); @@ -754,14 +770,16 @@ namespace cv { bn_blobs.push_back(biasData_mat); setParams.setLayerBlobs(cv_layers_counter, bn_blobs); } - - if(activation == "leaky") - ++cv_layers_counter; } if (layer_type == "region" || layer_type == "yolo") { ++cv_layers_counter; // For permute. 
} + + std::string activation = getParam(layer_params, "activation", "linear"); + if(activation == "leaky") + ++cv_layers_counter; // For ReLU + current_channels = net->out_channels_vec[darknet_layers_counter]; } return true; diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 47acc07063..3903298a1d 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -103,6 +103,37 @@ public: static BackendRegistry impl; return impl; } + + static inline bool checkIETarget(int target) + { +#ifndef HAVE_INF_ENGINE + return false; +#else + cv::dnn::Net net; + cv::dnn::LayerParams lp; + lp.set("kernel_size", 1); + lp.set("num_output", 1); + lp.set("bias_term", false); + lp.type = "Convolution"; + lp.name = "testLayer"; + lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1))); + net.addLayerToPrev(lp.name, lp.type, lp); + net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE); + net.setPreferableTarget(target); + static int inpDims[] = {1, 2, 3, 4}; + net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0))); + try + { + net.forward(); + } + catch(...) + { + return false; + } + return true; +#endif + } + private: BackendRegistry() { @@ -154,35 +185,6 @@ private: } #endif } - static inline bool checkIETarget(int target) - { -#ifndef HAVE_INF_ENGINE - return false; -#else - cv::dnn::Net net; - cv::dnn::LayerParams lp; - lp.set("kernel_size", 1); - lp.set("num_output", 1); - lp.set("bias_term", false); - lp.type = "Convolution"; - lp.name = "testLayer"; - lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1))); - net.addLayerToPrev(lp.name, lp.type, lp); - net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE); - net.setPreferableTarget(target); - static int inpDims[] = {1, 2, 3, 4}; - net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0))); - try - { - net.forward(); - } - catch(...) - { - return false; - } - return true; -#endif - } BackendsList backends; }; @@ -1689,6 +1691,9 @@ struct Net::Impl // backend. 
Split a whole model on several Inference Engine networks if // some of layers are not implemented. + bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU || + BackendRegistry::checkIETarget(DNN_TARGET_CPU); + // Set of all input and output blobs wrappers for current network. std::map > netBlobsWrappers; for (it = layers.begin(); it != layers.end(); ++it) @@ -1702,7 +1707,8 @@ struct Net::Impl if (!fused && !layer->supportBackend(preferableBackend)) { bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 && - INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2); + INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2) && + supportsCPUFallback; // TODO: there is a bug in Myriad plugin with custom layers shape infer. if (preferableTarget == DNN_TARGET_MYRIAD) { diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index cccef29374..c67b198d03 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -70,6 +70,7 @@ public: MAX = 2, } op; std::vector coeffs; + bool variableChannels; EltwiseLayerImpl(const LayerParams& params) { @@ -105,7 +106,7 @@ public: return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_CUDA || backendId == DNN_BACKEND_HALIDE || - (backendId == DNN_BACKEND_INFERENCE_ENGINE && + (backendId == DNN_BACKEND_INFERENCE_ENGINE && !variableChannels && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())); } @@ -115,33 +116,57 @@ public: std::vector &internals) const CV_OVERRIDE { CV_Assert(inputs.size() >= 2); + CV_Assert(inputs[0].size() >= 2); CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size()); CV_Assert(op == SUM || coeffs.size() == 0); + int dims = inputs[0].size(); + // Number of channels in output shape is determined by the first input tensor. 
+ int numChannels = inputs[0][1]; for (int i = 1; i < inputs.size(); i++) { - CV_Assert(inputs[0] == inputs[i]); + CV_Assert(inputs[0][0] == inputs[i][0]); + + // It's allowed for channels axis to be different. + for (int j = 2; j < dims; j++) + CV_Assert(inputs[0][j] == inputs[i][j]); } outputs.assign(1, inputs[0]); - + outputs[0][1] = numChannels; return false; } + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + variableChannels = false; + for (int i = 1; i < inputs.size(); ++i) + { + if (inputs[i].size[1] != inputs[0].size[1]) + { + variableChannels = true; + break; + } + } + } + + class EltwiseInvoker : public ParallelLoopBody { public: - const Mat* srcs; + std::vector srcs; int nsrcs; Mat* dst; - const std::vector* coeffs; + std::vector coeffs; EltwiseOp op; int nstripes; const ActivationLayer* activ; int channels; size_t planeSize; - EltwiseInvoker() : srcs(0), nsrcs(0), dst(0), coeffs(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {} + EltwiseInvoker() : nsrcs(0), dst(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {} static void run(const Mat* srcs, int nsrcs, Mat& dst, const std::vector& coeffs, EltwiseOp op, @@ -150,15 +175,23 @@ public: CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_32FC1, ""); CV_Assert(dst.isContinuous()); CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs); + EltwiseInvoker p; + p.srcs.resize(nsrcs); + p.coeffs = coeffs; for( int i = 0; i < nsrcs; i++ ) { - CV_Assert(srcs[i].size == dst.size && - srcs[i].type() == dst.type() && + p.srcs[i] = srcs + i; + CV_Assert(srcs[i].type() == dst.type() && srcs[i].isContinuous()); + // Sort srcs and coefficients in the order by number of channels + for( int j = i; j >= 1 && p.srcs[j - 1]->size[1] < p.srcs[j]->size[1]; j-- ) + { + std::swap(p.srcs[j - 1], p.srcs[j]); + if (!p.coeffs.empty()) + std::swap(p.coeffs[j - 1], 
p.coeffs[j]); + } } - EltwiseInvoker p; - p.srcs = srcs; p.nsrcs = nsrcs; p.dst = &dst; p.op = op; @@ -180,7 +213,8 @@ public: break; } } - p.coeffs = simpleCoeffs ? 0 : &coeffs; + if (simpleCoeffs) + p.coeffs.clear(); p.activ = activ; parallel_for_(Range(0, nstripes), p, nstripes); @@ -192,8 +226,8 @@ public: size_t stripeSize = (total + nstripes - 1)/nstripes; size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(r.end*stripeSize, total); - int c, j, k, n = nsrcs; - const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0; + int c, j, k, n; + const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0; float* dstptr0 = dst->ptr(); int blockSize0 = 1 << 12, blockSize; @@ -208,14 +242,35 @@ public: for( c = 0; c < channels; c++ ) { size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize; - const float* srcptr0 = srcs[0].ptr() + globalDelta; + const float* srcptr0 = srcs[0]->ptr() + globalDelta; float* dstptr = dstptr0 + globalDelta; - if( op == PROD ) + // This code assumes that srcs are sorted in descending order by channels. 
+ for (n = 1; n < nsrcs && c < srcs[n]->size[1]; ++n) {} + + if (n == 1) + { + if( !coeffsptr ) + { + for( j = 0; j < blockSize; j++ ) + { + dstptr[j] = srcptr0[j]; + } + } + else + { + float c0 = coeffsptr[0]; + for( j = 0; j < blockSize; j++ ) + { + dstptr[j] = c0*srcptr0[j]; + } + } + } + else if( op == PROD ) { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k].ptr() + globalDelta; + const float* srcptr1 = srcs[k]->ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = srcptr0[j]*srcptr1[j]; @@ -227,7 +282,7 @@ public: { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k].ptr() + globalDelta; + const float* srcptr1 = srcs[k]->ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = std::max(srcptr0[j], srcptr1[j]); @@ -239,7 +294,7 @@ public: { for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k].ptr() + globalDelta; + const float* srcptr1 = srcs[k]->ptr() + globalDelta; for( j = 0; j < blockSize; j++ ) { dstptr[j] = srcptr0[j] + srcptr1[j]; @@ -252,7 +307,7 @@ public: float c0 = coeffsptr[0]; for( k = 1; k < n; k++ ) { - const float* srcptr1 = srcs[k].ptr() + globalDelta; + const float* srcptr1 = srcs[k]->ptr() + globalDelta; float c1 = coeffsptr[k]; for( j = 0; j < blockSize; j++ ) { @@ -279,7 +334,7 @@ public: std::vector inputs; std::vector outputs; - if (inputs_.depth() == CV_16S && op != SUM) + if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels) return false; inputs_.getUMatVector(inputs); diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 26637ebbe6..61e3cad9a8 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -390,12 +390,6 @@ TEST_P(Test_Darknet_nets, YOLOv3) { applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB)); -#if defined(INF_ENGINE_RELEASE) - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD - && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); -#endif - // batchId, classId, confidence, left, top, right, bottom Mat ref = (Mat_(9, 7) << 0, 7, 0.952983f, 0.614622f, 0.150257f, 0.901369f, 0.289251f, // a truck 0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.74626f, // a bicycle @@ -413,23 +407,35 @@ TEST_P(Test_Darknet_nets, YOLOv3) std::string config_file = "yolov3.cfg"; std::string weights_file = "yolov3.weights"; +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + { + scoreDiff = 0.04; + iouDiff = 0.2; + } +#endif + { SCOPED_TRACE("batch size 1"); testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff); } -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000) - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL) // Test with 'batch size 2' is disabled for DLIE/OpenCL target -#endif - -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000) +#if defined(INF_ENGINE_RELEASE) if (backend == DNN_BACKEND_INFERENCE_ENGINE) { - if (target == DNN_TARGET_OPENCL) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_2019R2); - if (target == DNN_TARGET_OPENCL_FP16) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_2019R2); + if (INF_ENGINE_VER_MAJOR_LE(2018050000) && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_2018R5); + else if (INF_ENGINE_VER_MAJOR_EQ(2019020000)) + { + if (target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, 
CV_TEST_TAG_DNN_SKIP_IE_2019R2); + if (target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_2019R2); + } + else if (target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); } #endif @@ -444,6 +450,9 @@ INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets, dnnBackendsAndTargets()); TEST_P(Test_Darknet_layers, shortcut) { testDarknetLayer("shortcut"); + testDarknetLayer("shortcut_leaky"); + testDarknetLayer("shortcut_unequal"); + testDarknetLayer("shortcut_unequal_2"); } TEST_P(Test_Darknet_layers, upsample) diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 43e54b952a..e6bc350520 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -1493,4 +1493,62 @@ TEST(Layer_Test_Convolution, relu_fusion) normAssert(input, output); } +typedef testing::TestWithParam > > Layer_Test_Eltwise_unequal; +TEST_P(Layer_Test_Eltwise_unequal, Accuracy) +{ + bool weighted = get<0>(GetParam()); + int backendId = get<0>(get<1>(GetParam())); + int targetId = get<1>(get<1>(GetParam())); + + if (backendId == DNN_BACKEND_OPENCV && targetId == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + Net net; + LayerParams lp; + lp.type = "Eltwise"; + lp.name = "testLayer"; + + const int inpShapes[][4] = {{1, 4, 2, 2}, {1, 5, 2, 2}, {1, 3, 2, 2}}; + std::vector inpNames(3); + std::vector inputs(3); + size_t numOutValues = 1*4*2*2; // By the first input + + std::vector weights(3, 1); + if (weighted) + { + for (int i = 0; i < inputs.size(); ++i) + randu(Mat(1, 1, CV_32F, &weights[i]), -1, 1); + lp.set("coeff", DictValue::arrayReal(&weights[0], weights.size())); + } + + int eltwiseId = net.addLayer(lp.name, lp.type, lp); + for (int i = 0; i < inputs.size(); ++i) + { + inputs[i].create(4, inpShapes[i], CV_32F); + randu(inputs[i], 0, 255); + inpNames[i] = 
format("input_%d", i); + net.connect(0, i, eltwiseId, i); + } + Mat ref(1, numOutValues, CV_32F, Scalar(0)); + + net.setInputsNames(inpNames); + for (int i = 0; i < inputs.size(); ++i) + { + net.setInput(inputs[i], inpNames[i]); + if (numOutValues >= inputs[i].total()) + ref.colRange(0, inputs[i].total()) += weights[i] * inputs[i].reshape(1, 1); + else + ref += weights[i] * inputs[i].reshape(1, 1).colRange(0, numOutValues); + } + + net.setPreferableBackend(backendId); + net.setPreferableTarget(targetId); + Mat out = net.forward(); + normAssert(out.reshape(1, 1), ref); +} +INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Eltwise_unequal, Combine( + testing::Bool(), + dnnBackendsAndTargets() +)); + }} // namespace diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index a5e78a38fa..3045215717 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -394,7 +394,9 @@ enum ConnectedComponentsTypes { CC_STAT_WIDTH = 2, //!< The horizontal size of the bounding box CC_STAT_HEIGHT = 3, //!< The vertical size of the bounding box CC_STAT_AREA = 4, //!< The total area (in pixels) of the connected component - CC_STAT_MAX = 5 +#ifndef CV_DOXYGEN + CC_STAT_MAX = 5 //!< Max enumeration value. Used internally only for memory allocation +#endif }; //! connected components algorithm @@ -4008,7 +4010,23 @@ without self-intersections. Otherwise, the function output is undefined. */ CV_EXPORTS_W bool isContourConvex( InputArray contour ); -//! finds intersection of two convex polygons +/** @example samples/cpp/intersectExample.cpp +Examples of how intersectConvexConvex works +*/ + +/** @brief Finds intersection of two convex polygons + +@param _p1 First polygon +@param _p2 Second polygon +@param _p12 Output polygon describing the intersecting area +@param handleNested When true, an intersection is found if one of the polygons is fully enclosed in the other. 
+When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge +of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested. + +@returns Absolute value of area of intersecting polygon + +@note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't. + */ CV_EXPORTS_W float intersectConvexConvex( InputArray _p1, InputArray _p2, OutputArray _p12, bool handleNested = true ); diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 7bca93de87..6b0e6d6fbe 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -2624,11 +2624,127 @@ void accW_simd_(const uchar* src, float* dst, const uchar* mask, int len, int cn v_dst10 = v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha); v_dst11 = v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha); - v_store(dst + x, v_dst00); - v_store(dst + x + step, v_dst01); + v_store(dst + x , v_dst00); + v_store(dst + x + step , v_dst01); v_store(dst + x + step * 2, v_dst10); v_store(dst + x + step * 3, v_dst11); } + } else { + const v_float32 zero = vx_setall_f32((float)0); + int size = len * cn; + + if ( cn == 1 ){ + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint8 v_src = vx_load(src + x); + v_uint8 v_mask = vx_load(mask + x); + + v_uint16 v_m0, v_m1; + v_expand(v_mask, v_m0, v_m1); + v_uint32 v_m00, v_m01, v_m10, v_m11; + v_expand(v_m0, v_m00, v_m01); + v_expand(v_m1, v_m10, v_m11); + + v_float32 v_mf00, v_mf01, v_mf10, v_mf11; + v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00)); + v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01)); + v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10)); + v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11)); + + v_uint16 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_uint32 v_src00, v_src01, v_src10, v_src11; + 
v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + + v_float32 v_dst00 = vx_load(dst + x); + v_float32 v_dst01 = vx_load(dst + x + step); + v_float32 v_dst10 = vx_load(dst + x + step * 2); + v_float32 v_dst11 = vx_load(dst + x + step * 3); + + v_mf00 = v_mf00 != zero; + v_mf01 = v_mf01 != zero; + v_mf10 = v_mf10 != zero; + v_mf11 = v_mf11 != zero; + + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01); + v_dst10 = v_select(v_mf10, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); + v_dst11 = v_select(v_mf11, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); + + v_store(dst + x , v_dst00); + v_store(dst + x + step , v_dst01); + v_store(dst + x + step * 2, v_dst10); + v_store(dst + x + step * 3, v_dst11); + } + } else if ( cn == 3 ) + { + for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth ) + { + v_uint8 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + + v_uint16 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + + v_uint32 v_src000, v_src001, v_src010, v_src011, v_src100, v_src101, v_src110, v_src111, v_src200, v_src201, v_src210, v_src211; + v_expand(v_src00, v_src000, v_src001); + v_expand(v_src01, v_src010, v_src011); + v_expand(v_src10, v_src100, v_src101); + v_expand(v_src11, v_src110, v_src111); + v_expand(v_src20, v_src200, v_src201); + v_expand(v_src21, v_src210, v_src211); + + v_float32 v_dst00, v_dst01, v_dst02, v_dst03, v_dst10, v_dst11, v_dst12, v_dst13; + v_float32 v_dst20, v_dst21, v_dst22, v_dst23; + v_load_deinterleave(dst + x * cn , v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, 
v_dst21); + v_load_deinterleave(dst + (x + 2 * step) * cn, v_dst02, v_dst12, v_dst22); + v_load_deinterleave(dst + (x + 3 * step) * cn, v_dst03, v_dst13, v_dst23); + + v_uint8 v_mask = vx_load(mask + x); + + v_uint16 v_m0, v_m1; + v_expand(v_mask, v_m0, v_m1); + v_uint32 v_m00, v_m01, v_m10, v_m11; + v_expand(v_m0, v_m00, v_m01); + v_expand(v_m1, v_m10, v_m11); + + v_float32 v_mf00, v_mf01, v_mf10, v_mf11; + v_mf00 = v_cvt_f32(v_reinterpret_as_s32(v_m00)); + v_mf01 = v_cvt_f32(v_reinterpret_as_s32(v_m01)); + v_mf10 = v_cvt_f32(v_reinterpret_as_s32(v_m10)); + v_mf11 = v_cvt_f32(v_reinterpret_as_s32(v_m11)); + + v_mf00 = v_mf00 != zero; + v_mf01 = v_mf01 != zero; + v_mf10 = v_mf10 != zero; + v_mf11 = v_mf11 != zero; + + v_dst00 = v_select(v_mf00, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src000)) * v_alpha), v_dst00); + v_dst01 = v_select(v_mf01, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src001)) * v_alpha), v_dst01); + v_dst02 = v_select(v_mf10, v_fma(v_dst02, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src010)) * v_alpha), v_dst02); + v_dst03 = v_select(v_mf11, v_fma(v_dst03, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src011)) * v_alpha), v_dst03); + + v_dst10 = v_select(v_mf00, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src100)) * v_alpha), v_dst10); + v_dst11 = v_select(v_mf01, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src101)) * v_alpha), v_dst11); + v_dst12 = v_select(v_mf10, v_fma(v_dst12, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src110)) * v_alpha), v_dst12); + v_dst13 = v_select(v_mf11, v_fma(v_dst13, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src111)) * v_alpha), v_dst13); + + v_dst20 = v_select(v_mf00, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src200)) * v_alpha), v_dst20); + v_dst21 = v_select(v_mf01, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src201)) * v_alpha), v_dst21); + v_dst22 = v_select(v_mf10, v_fma(v_dst22, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src210)) * v_alpha), 
v_dst22); + v_dst23 = v_select(v_mf11, v_fma(v_dst23, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src211)) * v_alpha), v_dst23); + + v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); + v_store_interleave(dst + ( x + step * 2 ) * cn, v_dst02, v_dst12, v_dst22); + v_store_interleave(dst + ( x + step * 3 ) * cn, v_dst03, v_dst13, v_dst23); + } + } } #endif // CV_SIMD accW_general_(src, dst, mask, len, cn, alpha, x); @@ -2657,9 +2773,81 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_dst0 = v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int0)) * v_alpha); v_dst1 = v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_int1)) * v_alpha); - v_store(dst + x, v_dst0); + v_store(dst + x , v_dst0); v_store(dst + x + step, v_dst1); } + } else { + const v_float32 zero = vx_setall_f32((float)0); + int size = len * cn; + if ( cn == 1 ) + { + for (; x <= size - cVectorWidth; x += cVectorWidth) + { + v_uint16 v_src = vx_load(src + x); + v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x)); + + v_uint32 v_m0, v_m1; + v_expand(v_mask, v_m0, v_m1); + + v_float32 v_mf0, v_mf1; + v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0)); + v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1)); + + v_uint32 v_src0, v_src1; + v_expand(v_src, v_src0, v_src1); + + v_float32 v_dst0 = vx_load(dst + x); + v_float32 v_dst1 = vx_load(dst + x + step); + + v_mf0 = v_mf0 != zero; + v_mf1 = v_mf1 != zero; + + v_dst0 = v_select(v_mf0, v_fma(v_dst0, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src0)) * v_alpha), v_dst0); + v_dst1 = v_select(v_mf1, v_fma(v_dst1, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src1)) * v_alpha), v_dst1); + + v_store(dst + x , v_dst0); + v_store(dst + x + step, v_dst1); + } + } else if ( cn == 3 ) + { + for (; x*cn <= size - cVectorWidth*cn; x += cVectorWidth ) + { + v_uint16 v_src0, v_src1, v_src2; + v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); + 
+ v_uint16 v_mask = v_reinterpret_as_u16(vx_load_expand(mask + x)); + + v_uint32 v_m0, v_m1; + v_expand(v_mask, v_m0, v_m1); + + v_uint32 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_src0, v_src00, v_src01); + v_expand(v_src1, v_src10, v_src11); + v_expand(v_src2, v_src20, v_src21); + + v_float32 v_dst00, v_dst01, v_dst02, v_dst10, v_dst11, v_dst20, v_dst21; + v_load_deinterleave(dst + x * cn , v_dst00, v_dst10, v_dst20); + v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); + + v_float32 v_mf0, v_mf1; + v_mf0 = v_cvt_f32(v_reinterpret_as_s32(v_m0)); + v_mf1 = v_cvt_f32(v_reinterpret_as_s32(v_m1)); + + v_mf0 = v_mf0 != zero; + v_mf1 = v_mf1 != zero; + + v_dst00 = v_select(v_mf0, v_fma(v_dst00, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src00)) * v_alpha), v_dst00); + v_dst10 = v_select(v_mf0, v_fma(v_dst10, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src10)) * v_alpha), v_dst10); + v_dst20 = v_select(v_mf0, v_fma(v_dst20, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src20)) * v_alpha), v_dst20); + + v_dst01 = v_select(v_mf1, v_fma(v_dst01, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src01)) * v_alpha), v_dst01); + v_dst11 = v_select(v_mf1, v_fma(v_dst11, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src11)) * v_alpha), v_dst11); + v_dst21 = v_select(v_mf1, v_fma(v_dst21, v_beta, v_cvt_f32(v_reinterpret_as_s32(v_src21)) * v_alpha), v_dst21); + + v_store_interleave(dst + x * cn , v_dst00, v_dst10, v_dst20); + v_store_interleave(dst + ( x + step ) * cn, v_dst01, v_dst11, v_dst21); + } + } } #endif // CV_SIMD accW_general_(src, dst, mask, len, cn, alpha, x); diff --git a/modules/java/jar/CMakeLists.txt b/modules/java/jar/CMakeLists.txt index 2cce25a76b..33817bcc62 100644 --- a/modules/java/jar/CMakeLists.txt +++ b/modules/java/jar/CMakeLists.txt @@ -27,6 +27,13 @@ endif() set(OPENCV_JAVADOC_DESTINATION "${OpenCV_BINARY_DIR}/doc/doxygen/html/javadoc" CACHE STRING "") +# Old Javadoc URL looks like this: https://docs.oracle.com/javase/6/docs/api/ 
+# New Javadoc URL looks like this: https://docs.oracle.com/en/java/javase/11/docs/api/ +set(OPENCV_JAVADOC_LINK_URL "" CACHE STRING "See details in modules/java/jar/CMakeLists.txt") +if(OPENCV_JAVADOC_LINK_URL) + set(CMAKE_CONFIG_OPENCV_JAVADOC_LINK "link=\"${OPENCV_JAVADOC_LINK_URL}\"") +endif() + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OPENCV_JAVA_DIR}/build.xml" @ONLY) list(APPEND depends "${OPENCV_JAVA_DIR}/build.xml") diff --git a/modules/java/jar/build.xml.in b/modules/java/jar/build.xml.in index bf2830186e..732b398576 100644 --- a/modules/java/jar/build.xml.in +++ b/modules/java/jar/build.xml.in @@ -42,7 +42,7 @@ bottom="Generated on ${timestamp} / OpenCV @OPENCV_VCSVERSION@" failonerror="true" encoding="UTF-8" charset="UTF-8" docencoding="UTF-8" - link="https://docs.oracle.com/javase/6/docs/api/" + @CMAKE_CONFIG_OPENCV_JAVADOC_LINK@ additionalparam="--allow-script-in-comments" >
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py index 892215b9a1..7114bea3af 100644 --- a/modules/python/test/test_misc.py +++ b/modules/python/test/test_misc.py @@ -96,7 +96,7 @@ class SamplesFindFile(NewOpenCVTests): def test_MissingFileException(self): try: - res = cv.samples.findFile('non_existed.file', True) + _res = cv.samples.findFile('non_existed.file', True) self.assertEqual("Dead code", 0) except cv.error as _e: pass diff --git a/modules/ts/src/ts_tags.cpp b/modules/ts/src/ts_tags.cpp index 4571e4462d..4b775722c1 100644 --- a/modules/ts/src/ts_tags.cpp +++ b/modules/ts/src/ts_tags.cpp @@ -46,7 +46,8 @@ static std::vector& getTestTagsSkipList() #if OPENCV_32BIT_CONFIGURATION testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_2GB); #else - testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_6GB); + if (!cvtest::runBigDataTests) + testSkipWithTags.push_back(CV_TEST_TAG_MEMORY_6GB); #endif testSkipWithTags.push_back(CV_TEST_TAG_VERYLONG); #if defined(_DEBUG) diff --git a/samples/cpp/intersectExample.cpp b/samples/cpp/intersectExample.cpp new file mode 100644 index 0000000000..a8a897241f --- /dev/null +++ b/samples/cpp/intersectExample.cpp @@ -0,0 +1,161 @@ +/* + * Author: Steve Nicholson + * + * A program that illustrates intersectConvexConvex in various scenarios + */ + +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" + +using namespace cv; +using namespace std; + +// Create a vector of points describing a rectangle with the given corners +static vector makeRectangle(Point topLeft, Point bottomRight) +{ + vector rectangle; + rectangle.push_back(topLeft); + rectangle.push_back(Point(bottomRight.x, topLeft.y)); + rectangle.push_back(bottomRight); + rectangle.push_back(Point(topLeft.x, bottomRight.y)); + return rectangle; +} + +static vector makeTriangle(Point point1, Point point2, Point point3) +{ + vector triangle; + triangle.push_back(point1); + triangle.push_back(point2); + triangle.push_back(point3); + return triangle; 
+} + +// Run intersectConvexConvex on two polygons then draw the polygons and their intersection (if there is one) +// Return the area of the intersection +static float drawIntersection(Mat &image, vector polygon1, vector polygon2, bool handleNested = true) +{ + vector intersectionPolygon; + + vector > polygons; + polygons.push_back(polygon1); + polygons.push_back(polygon2); + + float intersectArea = intersectConvexConvex(polygon1, polygon2, intersectionPolygon, handleNested); + + if (intersectArea > 0) + { + Scalar fillColor(200, 200, 200); + // If the input is invalid, draw the intersection in red + if (!isContourConvex(polygon1) || !isContourConvex(polygon2)) + { + fillColor = Scalar(0, 0, 255); + } + vector > pp; + pp.push_back(intersectionPolygon); + fillPoly(image, pp, fillColor); + } + polylines(image, polygons, true, Scalar(0, 0, 0)); + + return intersectArea; +} + +static void drawDescription(Mat &image, int intersectionArea, string description, Point origin) +{ + const size_t bufSize=1024; + char caption[bufSize]; + snprintf(caption, bufSize, "Intersection area: %d%s", intersectionArea, description.c_str()); + putText(image, caption, origin, FONT_HERSHEY_SIMPLEX, 0.6, Scalar(0, 0, 0)); +} + +static void intersectConvexExample() +{ + Mat image(610, 550, CV_8UC3, Scalar(255, 255, 255)); + float intersectionArea; + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 10), Point(50, 50)), + makeRectangle(Point(20, 20), Point(60, 60))); + + drawDescription(image, (int)intersectionArea, "", Point(70, 40)); + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 70), Point(35, 95)), + makeRectangle(Point(35, 95), Point(60, 120))); + + drawDescription(image, (int)intersectionArea, "", Point(70, 100)); + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 130), Point(60, 180)), + makeRectangle(Point(20, 140), Point(50, 170)), + true); + + drawDescription(image, (int)intersectionArea, " (handleNested true)", 
Point(70, 160)); + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 190), Point(60, 240)), + makeRectangle(Point(20, 200), Point(50, 230)), + false); + + drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 220)); + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 250), Point(60, 300)), + makeRectangle(Point(20, 250), Point(50, 290)), + true); + + drawDescription(image, (int)intersectionArea, " (handleNested true)", Point(70, 280)); + + // These rectangles share an edge so handleNested can be false and an intersection is still found + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 310), Point(60, 360)), + makeRectangle(Point(20, 310), Point(50, 350)), + false); + + drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 340)); + + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 370), Point(60, 420)), + makeRectangle(Point(20, 371), Point(50, 410)), + false); + + drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 400)); + + // A vertex of the triangle lies on an edge of the rectangle so handleNested can be false and an intersection is still found + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 430), Point(60, 480)), + makeTriangle(Point(35, 430), Point(20, 470), Point(50, 470)), + false); + + drawDescription(image, (int)intersectionArea, " (handleNested false)", Point(70, 460)); + + // Show intersection of overlapping rectangle and triangle + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 490), Point(40, 540)), + makeTriangle(Point(25, 500), Point(25, 530), Point(60, 515)), + false); + + drawDescription(image, (int)intersectionArea, "", Point(70, 520)); + + // This concave polygon is invalid input to intersectConvexConvex so it returns an invalid intersection + vector notConvex; + notConvex.push_back(Point(25, 560)); + notConvex.push_back(Point(25, 590)); 
+ notConvex.push_back(Point(45, 580)); + notConvex.push_back(Point(60, 600)); + notConvex.push_back(Point(60, 550)); + notConvex.push_back(Point(45, 570)); + intersectionArea = drawIntersection(image, + makeRectangle(Point(10, 550), Point(50, 600)), + notConvex, + false); + + drawDescription(image, (int)intersectionArea, " (invalid input: not convex)", Point(70, 580)); + + imshow("Intersections", image); + waitKey(0); +} + +int main() +{ + intersectConvexExample(); +} diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp index e94c68cd6d..a556217f8b 100644 --- a/samples/cpp/stitching_detailed.cpp +++ b/samples/cpp/stitching_detailed.cpp @@ -116,8 +116,10 @@ double compose_megapix = -1; float conf_thresh = 1.f; #ifdef HAVE_OPENCV_XFEATURES2D string features_type = "surf"; +float match_conf = 0.65f; #else string features_type = "orb"; +float match_conf = 0.3f; #endif string matcher_type = "homography"; string estimator_type = "homography"; @@ -132,7 +134,6 @@ int expos_comp_type = ExposureCompensator::GAIN_BLOCKS; int expos_comp_nr_feeds = 1; int expos_comp_nr_filtering = 2; int expos_comp_block_size = 32; -float match_conf = 0.3f; string seam_find_type = "gc_color"; int blend_type = Blender::MULTI_BAND; int timelapse_type = Timelapser::AS_IS; @@ -196,7 +197,7 @@ static int parseCmdArgs(int argc, char** argv) else if (string(argv[i]) == "--features") { features_type = argv[i + 1]; - if (features_type == "orb") + if (string(features_type) == "orb") match_conf = 0.3f; i++; } diff --git a/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp b/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp index 73aa3c9819..aa5fce2091 100644 --- a/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp +++ b/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp @@ -14,9 +14,9 @@ using namespace cv; const char* keys = "{ help h| | Print help message. }" - "{ input1 | | Path to input image 1. 
}" - "{ input2 | | Path to input image 2. }" - "{ input3 | | Path to input image 3. }"; + "{ @input1 | | Path to input image 1. }" + "{ @input2 | | Path to input image 2. }" + "{ @input3 | | Path to input image 3. }"; /** * @function main diff --git a/samples/dnn/fast_neural_style.py b/samples/dnn/fast_neural_style.py index 6afd166be5..912c2f0832 100644 --- a/samples/dnn/fast_neural_style.py +++ b/samples/dnn/fast_neural_style.py @@ -14,7 +14,7 @@ parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of args = parser.parse_args() net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model)) -net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV); +net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) if args.input: cap = cv.VideoCapture(args.input) diff --git a/samples/dnn/mobilenet_ssd_accuracy.py b/samples/dnn/mobilenet_ssd_accuracy.py index 58395acbdf..23fb06b921 100644 --- a/samples/dnn/mobilenet_ssd_accuracy.py +++ b/samples/dnn/mobilenet_ssd_accuracy.py @@ -27,7 +27,7 @@ args = parser.parse_args() ### Get OpenCV predictions ##################################################### net = cv.dnn.readNetFromTensorflow(cv.samples.findFile(args.weights), cv.samples.findFile(args.prototxt)) -net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV); +net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV) detections = [] for imgName in os.listdir(args.images): diff --git a/samples/dnn/text_detection.py b/samples/dnn/text_detection.py index 9f7f159a54..9ea4c10190 100644 --- a/samples/dnn/text_detection.py +++ b/samples/dnn/text_detection.py @@ -134,7 +134,7 @@ def main(): for j in range(4): p1 = (vertices[j][0], vertices[j][1]) p2 = (vertices[(j + 1) % 4][0], vertices[(j + 1) % 4][1]) - cv.line(frame, p1, p2, (0, 255, 0), 1); + cv.line(frame, p1, p2, (0, 255, 0), 1) # Put efficiency information cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0)) diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py 
index b46b1d492c..5aa1d30e39 100644 --- a/samples/dnn/tf_text_graph_common.py +++ b/samples/dnn/tf_text_graph_common.py @@ -21,7 +21,7 @@ def tokenize(s): elif token: tokens.append(token) token = "" - isString = (symbol == '\"' or symbol == '\'') ^ isString; + isString = (symbol == '\"' or symbol == '\'') ^ isString elif symbol == '{' or symbol == '}' or symbol == '[' or symbol == ']': if token: diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py index beaca3f4e4..e6017b227e 100644 --- a/samples/dnn/tf_text_graph_ssd.py +++ b/samples/dnn/tf_text_graph_ssd.py @@ -122,7 +122,7 @@ def createSSDGraph(modelPath, configPath, outputPath): print('Input image size: %dx%d' % (image_width, image_height)) # Read the graph. - inpNames = ['image_tensor'] + _inpNames = ['image_tensor'] outNames = ['num_detections', 'detection_scores', 'detection_boxes', 'detection_classes'] writeTextGraph(modelPath, outputPath, outNames) diff --git a/samples/python/browse.py b/samples/python/browse.py index 14bd05a05d..edc791f9cd 100755 --- a/samples/python/browse.py +++ b/samples/python/browse.py @@ -45,7 +45,7 @@ def main(): small = img - for i in xrange(3): + for _i in xrange(3): small = cv.pyrDown(small) def onmouse(event, x, y, flags, param): diff --git a/samples/python/calibrate.py b/samples/python/calibrate.py index 2378d8bf1a..bca430b5a5 100755 --- a/samples/python/calibrate.py +++ b/samples/python/calibrate.py @@ -97,7 +97,7 @@ def main(): obj_points.append(pattern_points) # calculate camera distortion - rms, camera_matrix, dist_coefs, rvecs, tvecs = cv.calibrateCamera(obj_points, img_points, (w, h), None, None) + rms, camera_matrix, dist_coefs, _rvecs, _tvecs = cv.calibrateCamera(obj_points, img_points, (w, h), None, None) print("\nRMS:", rms) print("camera matrix:\n", camera_matrix) @@ -106,7 +106,7 @@ def main(): # undistort the image with the calibration print('') for fn in img_names if debug_dir else []: - path, name, ext = splitfn(fn) + _path, name, 
_ext = splitfn(fn) img_found = os.path.join(debug_dir, name + '_chess.png') outfile = os.path.join(debug_dir, name + '_undistorted.png') diff --git a/samples/python/camera_calibration_show_extrinsics.py b/samples/python/camera_calibration_show_extrinsics.py index 610138bc7b..0118b5b913 100755 --- a/samples/python/camera_calibration_show_extrinsics.py +++ b/samples/python/camera_calibration_show_extrinsics.py @@ -184,7 +184,7 @@ def main(): extrinsics = fs.getNode('extrinsic_parameters').mat() import matplotlib.pyplot as plt - from mpl_toolkits.mplot3d import Axes3D + from mpl_toolkits.mplot3d import Axes3D # pylint: disable=unused-variable fig = plt.figure() ax = fig.gca(projection='3d') diff --git a/samples/python/color_histogram.py b/samples/python/color_histogram.py index 0422d7282c..a1924bab8b 100755 --- a/samples/python/color_histogram.py +++ b/samples/python/color_histogram.py @@ -46,7 +46,7 @@ class App(): cam = video.create_capture(fn, fallback='synth:bg=baboon.jpg:class=chess:noise=0.05') while True: - flag, frame = cam.read() + _flag, frame = cam.read() cv.imshow('camera', frame) small = cv.pyrDown(frame) diff --git a/samples/python/edge.py b/samples/python/edge.py index ba04adecfe..e85c2f6288 100755 --- a/samples/python/edge.py +++ b/samples/python/edge.py @@ -38,7 +38,7 @@ def main(): cap = video.create_capture(fn) while True: - flag, img = cap.read() + _flag, img = cap.read() gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) thrs1 = cv.getTrackbarPos('thrs1', 'edge') thrs2 = cv.getTrackbarPos('thrs2', 'edge') diff --git a/samples/python/facedetect.py b/samples/python/facedetect.py index 1050cc5aff..488c92d5e5 100755 --- a/samples/python/facedetect.py +++ b/samples/python/facedetect.py @@ -48,7 +48,7 @@ def main(): cam = create_capture(video_src, fallback='synth:bg={}:noise=0.05'.format(cv.samples.findFile('samples/data/lena.jpg'))) while True: - ret, img = cam.read() + _ret, img = cam.read() gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) gray = 
cv.equalizeHist(gray) diff --git a/samples/python/fitline.py b/samples/python/fitline.py index 6705f39abb..db695cbb2b 100755 --- a/samples/python/fitline.py +++ b/samples/python/fitline.py @@ -88,6 +88,7 @@ def main(): update() ch = cv.waitKey(0) if ch == ord('f'): + global cur_func_name if PY3: cur_func_name = next(dist_func_names) else: diff --git a/samples/python/houghcircles.py b/samples/python/houghcircles.py index b8d3a1a019..416309aab0 100755 --- a/samples/python/houghcircles.py +++ b/samples/python/houghcircles.py @@ -30,7 +30,7 @@ def main(): circles = cv.HoughCircles(img, cv.HOUGH_GRADIENT, 1, 10, np.array([]), 100, 30, 1, 30) if circles is not None: # Check if circles have been found and only then iterate over these and add them to the image - a, b, c = circles.shape + _a, b, _c = circles.shape for i in range(b): cv.circle(cimg, (circles[0][i][0], circles[0][i][1]), circles[0][i][2], (0, 0, 255), 3, cv.LINE_AA) cv.circle(cimg, (circles[0][i][0], circles[0][i][1]), 2, (0, 255, 0), 3, cv.LINE_AA) # draw center of circle diff --git a/samples/python/houghlines.py b/samples/python/houghlines.py index 7c99cf2ae9..022b680f56 100755 --- a/samples/python/houghlines.py +++ b/samples/python/houghlines.py @@ -29,14 +29,14 @@ def main(): if True: # HoughLinesP lines = cv.HoughLinesP(dst, 1, math.pi/180.0, 40, np.array([]), 50, 10) - a,b,c = lines.shape + a, b, _c = lines.shape for i in range(a): cv.line(cdst, (lines[i][0][0], lines[i][0][1]), (lines[i][0][2], lines[i][0][3]), (0, 0, 255), 3, cv.LINE_AA) else: # HoughLines lines = cv.HoughLines(dst, 1, math.pi/180.0, 50, np.array([]), 0, 0) if lines is not None: - a,b,c = lines.shape + a, b, _c = lines.shape for i in range(a): rho = lines[i][0][0] theta = lines[i][0][1] diff --git a/samples/python/kmeans.py b/samples/python/kmeans.py index d7fcbe8083..1b1c9d6a04 100755 --- a/samples/python/kmeans.py +++ b/samples/python/kmeans.py @@ -33,7 +33,7 @@ def main(): points, _ = make_gaussians(cluster_n, img_size) term_crit = 
(cv.TERM_CRITERIA_EPS, 30, 0.1) - ret, labels, centers = cv.kmeans(points, cluster_n, None, term_crit, 10, 0) + _ret, labels, _centers = cv.kmeans(points, cluster_n, None, term_crit, 10, 0) img = np.zeros((img_size, img_size, 3), np.uint8) for (x, y), label in zip(np.int32(points), labels.ravel()): diff --git a/samples/python/lappyr.py b/samples/python/lappyr.py index 2ee73ecb1d..2835b98d13 100755 --- a/samples/python/lappyr.py +++ b/samples/python/lappyr.py @@ -60,7 +60,7 @@ def main(): cv.createTrackbar('%d'%i, 'level control', 5, 50, nothing) while True: - ret, frame = cap.read() + _ret, frame = cap.read() pyr = build_lappyr(frame, leveln) for i in xrange(leveln): diff --git a/samples/python/opt_flow.py b/samples/python/opt_flow.py index c4515582e7..76a0ac2caf 100755 --- a/samples/python/opt_flow.py +++ b/samples/python/opt_flow.py @@ -64,14 +64,14 @@ def main(): fn = 0 cam = video.create_capture(fn) - ret, prev = cam.read() + _ret, prev = cam.read() prevgray = cv.cvtColor(prev, cv.COLOR_BGR2GRAY) show_hsv = False show_glitch = False cur_glitch = prev.copy() while True: - ret, img = cam.read() + _ret, img = cam.read() gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) flow = cv.calcOpticalFlowFarneback(prevgray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0) prevgray = gray diff --git a/samples/python/peopledetect.py b/samples/python/peopledetect.py index d2a7fdeee5..bdd49cab6f 100755 --- a/samples/python/peopledetect.py +++ b/samples/python/peopledetect.py @@ -51,7 +51,7 @@ def main(): print('loading error') continue - found, w = hog.detectMultiScale(img, winStride=(8,8), padding=(32,32), scale=1.05) + found, _w = hog.detectMultiScale(img, winStride=(8,8), padding=(32,32), scale=1.05) found_filtered = [] for ri, r in enumerate(found): for qi, q in enumerate(found): diff --git a/samples/python/stereo_match.py b/samples/python/stereo_match.py index 969ea11dbb..4d5875b814 100755 --- a/samples/python/stereo_match.py +++ b/samples/python/stereo_match.py @@ -69,8 +69,8 @@ def main(): 
out_points = points[mask] out_colors = colors[mask] out_fn = 'out.ply' - write_ply('out.ply', out_points, out_colors) - print('%s saved' % 'out.ply') + write_ply(out_fn, out_points, out_colors) + print('%s saved' % out_fn) cv.imshow('left', imgL) cv.imshow('disparity', (disp-min_disp)/num_disp) diff --git a/samples/python/turing.py b/samples/python/turing.py index 27dbe02ad3..dc920d1295 100755 --- a/samples/python/turing.py +++ b/samples/python/turing.py @@ -32,7 +32,7 @@ def main(): w, h = 512, 512 - args, args_list = getopt.getopt(sys.argv[1:], 'o:', []) + args, _args_list = getopt.getopt(sys.argv[1:], 'o:', []) args = dict(args) out = None if '-o' in args: diff --git a/samples/python/tutorial_code/core/mat_operations/mat_operations.py b/samples/python/tutorial_code/core/mat_operations/mat_operations.py index e9ec03699d..f237074fb6 100644 --- a/samples/python/tutorial_code/core/mat_operations/mat_operations.py +++ b/samples/python/tutorial_code/core/mat_operations/mat_operations.py @@ -25,13 +25,13 @@ def access_pixel(): y = 0 x = 0 ## [Pixel access 1] - intensity = img[y,x] + _intensity = img[y,x] ## [Pixel access 1] ## [Pixel access 3] - blue = img[y,x,0] - green = img[y,x,1] - red = img[y,x,2] + _blue = img[y,x,0] + _green = img[y,x,1] + _red = img[y,x,2] ## [Pixel access 3] ## [Pixel access 5] @@ -42,12 +42,12 @@ def reference_counting(): # Memory management and reference counting ## [Reference counting 2] img = cv.imread('image.jpg') - img1 = np.copy(img) + _img1 = np.copy(img) ## [Reference counting 2] ## [Reference counting 3] img = cv.imread('image.jpg') - sobelx = cv.Sobel(img, cv.CV_32F, 1, 0); + _sobelx = cv.Sobel(img, cv.CV_32F, 1, 0) ## [Reference counting 3] def primitive_operations(): @@ -57,17 +57,17 @@ def primitive_operations(): ## [Set image to black] ## [Select ROI] - smallImg = img[10:110,10:110] + _smallImg = img[10:110,10:110] ## [Select ROI] ## [BGR to Gray] img = cv.imread('image.jpg') - grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY) + _grey 
= cv.cvtColor(img, cv.COLOR_BGR2GRAY) ## [BGR to Gray] src = np.ones((4,4), np.uint8) ## [Convert to CV_32F] - dst = src.astype(np.float32) + _dst = src.astype(np.float32) ## [Convert to CV_32F] def visualize_images(): diff --git a/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py index b3f316396a..127a0f4325 100644 --- a/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py +++ b/samples/python/tutorial_code/imgProc/changing_contrast_brightness_image/changing_contrast_brightness_image.py @@ -25,8 +25,8 @@ def gammaCorrection(): res = cv.LUT(img_original, lookUpTable) ## [changing-contrast-brightness-gamma-correction] - img_gamma_corrected = cv.hconcat([img_original, res]); - cv.imshow("Gamma correction", img_gamma_corrected); + img_gamma_corrected = cv.hconcat([img_original, res]) + cv.imshow("Gamma correction", img_gamma_corrected) def on_linear_transform_alpha_trackbar(val): global alpha diff --git a/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py b/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py index c83f7980f5..64fd07b174 100644 --- a/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py +++ b/samples/python/tutorial_code/ml/introduction_to_pca/introduction_to_pca.py @@ -85,13 +85,13 @@ contours, _ = cv.findContours(bw, cv.RETR_LIST, cv.CHAIN_APPROX_NONE) for i, c in enumerate(contours): # Calculate the area of each contour - area = cv.contourArea(c); + area = cv.contourArea(c) # Ignore contours that are too small or too large if area < 1e2 or 1e5 < area: continue # Draw each contour only for visualisation purposes - cv.drawContours(src, contours, i, (0, 0, 255), 2); + cv.drawContours(src, contours, i, (0, 0, 255), 2) # Find the orientation of each shape 
getOrientation(c, src) ## [contours] diff --git a/samples/python/video_threaded.py b/samples/python/video_threaded.py index 4886db3d80..cbc73d296b 100755 --- a/samples/python/video_threaded.py +++ b/samples/python/video_threaded.py @@ -70,7 +70,7 @@ def main(): draw_str(res, (20, 60), "frame interval : %.1f ms" % (frame_interval.value*1000)) cv.imshow('threaded video', res) if len(pending) < threadn: - ret, frame = cap.read() + _ret, frame = cap.read() t = clock() frame_interval.update(t - last_frame_time) last_frame_time = t diff --git a/samples/python/video_v4l2.py b/samples/python/video_v4l2.py index 68f22699b1..61b1e35804 100644 --- a/samples/python/video_v4l2.py +++ b/samples/python/video_v4l2.py @@ -42,7 +42,7 @@ def main(): cv.createTrackbar("Focus", "Video", focus, 100, lambda v: cap.set(cv.CAP_PROP_FOCUS, v / 100)) while True: - status, img = cap.read() + _status, img = cap.read() fourcc = decode_fourcc(cap.get(cv.CAP_PROP_FOURCC))