From 42b1d049996779de0b8d5de2de1e671c11b687dd Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 12 Jul 2019 01:34:19 +0300 Subject: [PATCH 01/13] StereoSGBM algorithm updated to use wide universal intrinsics --- modules/calib3d/src/stereosgbm.cpp | 1712 ++++++++++++++-------------- 1 file changed, 872 insertions(+), 840 deletions(-) diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 88b28ff598..3b721ccf66 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -61,7 +61,9 @@ typedef uchar PixType; typedef short CostType; typedef short DispType; -enum { NR = 16, NR2 = NR/2 }; +// NR - the number of directions. the loop on x that computes Lr assumes that NR == 8. +// if you change NR, please, modify the loop as well. +enum { NR = 8, NR2 = NR/2 }; struct StereoSGBMParams @@ -110,6 +112,41 @@ struct StereoSGBMParams int mode; }; +#if CV_SIMD +#if CV_SIMD_WIDTH == 16 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); } +#elif CV_SIMD_WIDTH == 32 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +#elif CV_SIMD_WIDTH == 64 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } +#else +struct vseq_s16 +{ + short data[v_int16::nlanes]; + vseq_s16() + { + for (int i = 0; i < v_int16::nlanes; i++) + data[i] = i; + } +}; +static inline v_int16 vx_setseq_s16() +{ + static vseq_s16 vseq; + return vx_load(vseq.data); +} +#endif +// define some additional reduce operations: +static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos) +{ + min_val = v_reduce_min(val); + v_int16 v_mask = (vx_setall_s16(min_val) == val); + min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask)); +} +#endif + static const int DEFAULT_RIGHT_BORDER = -1; /* For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD), @@ -128,7 +165,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, { int x, c, width = img1.cols, cn = img1.channels(); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; + int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated //That is needs of parallel algorithm xrange_min = (xrange_min < 0) ? 0: xrange_min; @@ -191,7 +228,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) ); - buffer -= width-1-maxX2; + buffer -= width-maxX2; cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width ) @@ -201,7 +238,9 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, // precompute // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and - for( x = width-1-maxX2; x < width-1- minX2; x++ ) + // to process values from [minX2, maxX2) we should check memory location (width - 1 - maxX2, width - 1 - minX2] + // so iterate through [width - maxX2, width - minX2) + for( x = width-maxX2; x < width-minX2; x++ ) { int v = prow2[x]; int vl = x > 0 ? 
(v + prow2[x-1])/2 : v; @@ -220,43 +259,38 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, int u0 = std::min(ul, ur); u0 = std::min(u0, u); int u1 = std::max(ul, ur); u1 = std::max(u1, u); - #if CV_SIMD128 - if (true) + int d = minD; + #if CV_SIMD + v_uint8 _u = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0); + v_uint8 _u1 = vx_setall_u8((uchar)u1); + + for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes ) { - v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0); - v_uint8x16 _u1 = v_setall_u8((uchar)u1); + v_uint8 _v = vx_load(prow2 + width-x-1 + d); + v_uint8 _v0 = vx_load(buffer + width-x-1 + d); + v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2); + v_uint8 c0 = v_max(_u - _v1, _v0 - _u); + v_uint8 c1 = v_max(_v - _u1, _u0 - _v); + v_uint8 diff = v_min(c0, c1); - for( int d = minD; d < maxD; d += 16 ) - { - v_uint8x16 _v = v_load(prow2 + width-x-1 + d); - v_uint8x16 _v0 = v_load(buffer + width-x-1 + d); - v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2); - v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u); - v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v); - v_uint8x16 diff = v_min(c0, c1); + v_int16 _c0 = vx_load_aligned(cost + x*D + d); + v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes); - v_int16x8 _c0 = v_load_aligned(cost + x*D + d); - v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8); - - v_uint16x8 diff1,diff2; - v_expand(diff,diff1,diff2); - v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); - v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); - } + v_uint16 diff1,diff2; + v_expand(diff,diff1,diff2); + v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); + v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); } - else #endif + for( ; d < maxD; d++ ) { - for( int d = minD; d < maxD; d++ ) - { - int v = prow2[width-x-1 + d]; - int v0 = buffer[width-x-1 + d]; - int v1 = buffer[width-x-1 + d + width2]; - int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); - int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); + int v = prow2[width-x-1 + d]; + int v0 = buffer[width-x-1 + d]; + int v1 = buffer[width-x-1 + d + width2]; + int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); + int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); - cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); - } + cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); } } } @@ -287,23 +321,6 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params, Mat& buffer ) { -#if CV_SIMD128 - // maxDisparity is supposed to multiple of 16, so we can forget doing else - static const uchar LSBTab[] = - { - 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 
3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 - }; - static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -#endif - - const int ALIGN = 16; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); const CostType MAX_COST = SHRT_MAX; @@ -318,6 +335,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int D = maxD - minD, width1 = maxX1 - minX1; + int Da = (int)alignSize(D, v_int16::nlanes); + int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2; bool fullDP = params.mode == StereoSGBM::MODE_HH; @@ -334,42 +353,33 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, return; } - CV_Assert( D % 16 == 0 ); - - // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8. - // if you change NR, please, modify the loop as well. - int D2 = D+16, NRD2 = NR2*D2; - - // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: - // for 8-way dynamic programming we need the current row and - // the previous row, i.e. 2 rows in total - const int NLR = 2; - const int LrBorder = NLR - 1; - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) // we keep pixel difference cost (C) and the summary cost over NR directions (S). // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) - size_t costBufSize = width1*D; + size_t costBufSize = width1*Da; size_t CSBufSize = costBufSize*(fullDP ? height : 1); - size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2; + size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra; int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] + // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: + // for 8-way dynamic programming we need the current row and + // the previous row, i.e. 
2 rows in total + size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff - CSBufSize*2*sizeof(CostType) + // C, S - width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost - width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2 + ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[] + width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2 + width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) buffer.reserveBuffer(totalBufSize); // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); + CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); CostType* Sbuf = Cbuf + CSBufSize; CostType* hsumBuf = Sbuf + CSBufSize; CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows; - CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; + CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes); DispType* disp2ptr = (DispType*)(disp2cost + width); PixType* tempBuf = (PixType*)(disp2ptr + width); @@ -392,19 +402,19 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, x1 = width1-1; x2 = -1; dx = -1; } - CostType *Lr[NLR]={0}, *minLr[NLR]={0}; + CostType *Lr[2]={0}, *minLr[2]={0}; - for( k = 0; k < NLR; k++ ) + for( k = 0; k < 2; k++ ) { // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, // and will occasionally use negative indices with the arrays // we need to shift Lr[k] pointers by 1, to give the space for d=-1. // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8; - memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) ); - minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder; - memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) ); + // thus we shift the pointers by SIMD vector size + Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra; + memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) ); + minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2; + memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) ); } for( int y = y1; y != y2; y += dy ) @@ -426,83 +436,124 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, { calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero ); - memset(hsumAdd, 0, D*sizeof(CostType)); - for( x = 0; x <= SW2*D; x += D ) + memset(hsumAdd, 0, Da*sizeof(CostType)); +#if CV_SIMD + v_int16 h_scale = vx_setall_s16((short)SW2 + 1); + for( d = 0; d < Da; d += v_int16::nlanes ) { - int scale = x == 0 ? 
SW2 + 1 : 1; - for( d = 0; d < D; d++ ) - hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale); + v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale; + for( x = Da; x <= SW2*Da; x += Da ) + v_hsumAdd += vx_load_aligned(pixDiff + x + d); + v_store_aligned(hsumAdd + d, v_hsumAdd); } +#else + for (d = 0; d < D; d++) + { + hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1)); + for( x = Da; x <= SW2*Da; x += Da ) + hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]); + } +#endif if( y > 0 ) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; - for( x = D; x < width1*D; x += D ) - { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d)); +#else + for (d = 0; d < D; d++) + C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]); +#endif - #if CV_SIMD128 - if (true) + for( x = Da; x < width1*Da; x += Da ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) { - for( d = 0; d < D; d += 8 ) - { - v_int16x8 hv = v_load(hsumAdd + x - D + d); - v_int16x8 Cx = v_load(Cprev + x + d); - v_int16x8 psub = v_load(pixSub + d); - v_int16x8 padd = v_load(pixAdd + d); - hv = (hv - psub + padd); - psub = v_load(hsumSub + x + d); - Cx = Cx - psub + hv; - v_store(hsumAdd + x + d, hv); - v_store(C + x + d, Cx); - } + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv); } - else - #endif +#else + for( d = 0; d < D; d++ ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); - } + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); } +#endif } } else { - for( x = D; x < width1*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale); +#else + int scale = k == 0 ? 
SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] * scale); +#endif + for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == 0 ) + else { - int scale = k == 0 ? SH2 + 1 : 1; - for( x = 0; x < width1*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); + if( y > 0 ) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; + const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; +#if CV_SIMD + for (x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); +#else + for (x = 0; x < width1*Da; x++) + C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else + { +#if CV_SIMD + for (x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for (x = 0; x < width1*Da; x++) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } + } // also, clear the S buffer - for( k = 0; k < width1*D; k++ ) - S[k] = 0; + memset(S, 0, width1*Da * sizeof(CostType)); } - // clear the left and the right borders - memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) ); - memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) ); - memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) ); - memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) ); - /* [formula 13 in the paper] compute L_r(p, d) = C(p, d) + @@ -515,7 +566,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, 0: r=(-dx, 0) 1: r=(-1, -dy) 2: r=(0, -dy) - 3: r=(1, -dy) + 3: r=(1, -dy) !!!Note that only directions 0 to 3 are processed 4: r=(-2, -dy) 5: r=(-1, -dy*2) 6: r=(1, -dy*2) @@ -524,135 +575,139 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = x1; x != x2; x += dx ) { - int xm = x*NR2, xd = xm*D2; + int xm = x*NR2, xd = xm*Dlra; int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2; int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2; - CostType* Lr_p0 = Lr[0] + xd - dx*NRD2; - CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2; - CostType* Lr_p2 = Lr[1] + xd + D2*2; - CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3; + CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra; + CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra; + CostType* Lr_p2 = Lr[1] + xd + Dlra*2; + CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3; Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] = Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + const CostType* Cp = C + x*Da; + CostType* Sp 
= S + x*Da; - #if CV_SIMD128 - if (true) + CostType* minL = minLr[0] + xm; + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta0 = vx_setall_s16((short)delta0); + v_int16 _delta1 = vx_setall_s16((short)delta1); + v_int16 _delta2 = vx_setall_s16((short)delta2); + v_int16 _delta3 = vx_setall_s16((short)delta3); + v_int16 _minL0 = vx_setall_s16((short)MAX_COST); + v_int16 _minL1 = vx_setall_s16((short)MAX_COST); + v_int16 _minL2 = vx_setall_s16((short)MAX_COST); + v_int16 _minL3 = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 Spd = vx_load_aligned(Sp + d); + v_int16 L; - v_int16x8 _delta0 = v_setall_s16((short)delta0); - v_int16x8 _delta1 = v_setall_s16((short)delta1); - v_int16x8 _delta2 = v_setall_s16((short)delta2); - v_int16x8 _delta3 = v_setall_s16((short)delta3); - v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd; + v_store_aligned(Lr_p + d, L); + _minL0 = v_min(_minL0, L); + Spd += L; - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L0, L1, L2, L3; + L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd; + v_store_aligned(Lr_p + d + Dlra, L); + _minL1 = v_min(_minL1, L); + Spd += L; - L0 = v_load(Lr_p0 + d); - L1 = v_load(Lr_p1 + d); - L2 = v_load(Lr_p2 + d); - L3 = v_load(Lr_p3 + d); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd; + v_store_aligned(Lr_p + d + Dlra*2, L); + _minL2 = v_min(_minL2, L); + Spd += L; - L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1)); - L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1)); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd; + v_store_aligned(Lr_p + d + Dlra*3, L); + _minL3 = v_min(_minL3, L); + Spd += L; - L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); - L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); - - L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); - L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); - - L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); - L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); - - L0 = v_min(L0, _delta0); - L0 = ((L0 - _delta0) + Cpd); - - L1 = v_min(L1, _delta1); - L1 = ((L1 - _delta1) + Cpd); - - L2 = v_min(L2, _delta2); - L2 = ((L2 - _delta2) + Cpd); - - L3 = v_min(L3, _delta3); - L3 = ((L3 - _delta3) + Cpd); - - v_store(Lr_p + d, L0); - v_store(Lr_p + d + D2, L1); - v_store(Lr_p + d + D2*2, L2); - v_store(Lr_p + d + D2*3, L3); - - // Get minimum from in L0-L3 - v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; - v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... - v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... - v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... - v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... - v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... 
- v_int16x8 t0 = v_min(t0123L, t0123H); - _minL0 = v_min(_minL0, t0); - - v_int16x8 Sval = v_load(Sp + d); - - L0 = L0 + L1; - L2 = L2 + L3; - Sval = Sval + L0; - Sval = Sval + L2; - - v_store(Sp + d, Sval); - } - - v_int32x4 minL, minH; - v_expand(_minL0, minL, minH); - v_pack_store(&minLr[0][xm], v_min(minL, minH)); + v_store_aligned(Sp + d, Spd); } - else - #endif + +#if CV_SIMD_WIDTH > 32 + minL[0] = v_reduce_min(_minL0); + minL[1] = v_reduce_min(_minL1); + minL[2] = v_reduce_min(_minL2); + minL[3] = v_reduce_min(_minL3); +#else + // Get minimum for L0-L3 + v_int16 t0, t1, t2, t3; + v_zip(_minL0, _minL2, t0, t2); + v_zip(_minL1, _minL3, t1, t3); + v_zip(v_min(t0, t2), v_min(t1, t3), t0, t1); + t0 = v_min(t0, t1); + t0 = v_min(t0, v_rotate_right<4>(t0)); +#if CV_SIMD_WIDTH == 32 + CostType buf[v_int16::nlanes]; + v_store_low(buf, v_min(t0, v_rotate_right<8>(t0))); + minL[0] = buf[0]; + minL[1] = buf[1]; + minL[2] = buf[2]; + minL[3] = buf[3]; +#else + v_store_low(minL, t0); +#endif +#endif +#else + minL[0] = MAX_COST; + minL[1] = MAX_COST; + minL[2] = MAX_COST; + minL[3] = MAX_COST; +#endif + for( ; d < D; d++ ) { - int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST; + int Cpd = Cp[d], L; + int Spd = Sp[d]; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L0, L1, L2, L3; + L = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0; + Lr_p[d] = (CostType)L; + minL[0] = std::min(minL[0], (CostType)L); + Spd += L; - L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; - L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1; - L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2; - L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3; + L = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1; + Lr_p[d + Dlra] = (CostType)L; + minL[1] = std::min(minL[1], (CostType)L); + Spd += L; - Lr_p[d] = (CostType)L0; - minL0 = std::min(minL0, L0); + L = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2; + Lr_p[d + Dlra*2] = (CostType)L; + minL[2] = std::min(minL[2], (CostType)L); + Spd += L; - Lr_p[d + D2] = (CostType)L1; - minL1 = std::min(minL1, L1); + L = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3; + Lr_p[d + Dlra*3] = (CostType)L; + minL[3] = std::min(minL[3], (CostType)L); + Spd += L; - Lr_p[d + D2*2] = (CostType)L2; - minL2 = std::min(minL2, L2); - - Lr_p[d + D2*3] = (CostType)L3; - minL3 = std::min(minL3, L3); - - Sp[d] = saturate_cast(Sp[d] + L0 + L1 + L2 + L3); - } - minLr[0][xm] = (CostType)minL0; - minLr[0][xm+1] = (CostType)minL1; - minLr[0][xm+2] = (CostType)minL2; - minLr[0][xm+3] = (CostType)minL3; + Sp[d] = saturate_cast(Spd); } } if( pass == npasses ) { - for( x = 0; x < width; x++ ) + x = 0; +#if CV_SIMD + v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED); + v_int16 v_max_cost = vx_setall_s16(MAX_COST); + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes ) + { + v_store(disp1ptr + x, v_inv_dist); + v_store(disp2ptr + x, v_inv_dist); + v_store(disp2cost + x, v_max_cost); + } +#endif + for( ; x < width; x++ ) { disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp2cost[x] = MAX_COST; @@ -660,127 +715,81 @@ 
static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = width1 - 1; x >= 0; x-- ) { - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; + CostType* Sp = S + x*Da; + CostType minS = MAX_COST; + short bestDisp = -1; if( npasses == 1 ) { - int xm = x*NR2, xd = xm*D2; + int xm = x*NR2, xd = xm*Dlra; - int minL0 = MAX_COST; - int delta0 = minLr[0][xm + NR2] + P2; - CostType* Lr_p0 = Lr[0] + xd + NRD2; + CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra; Lr_p0[-1] = Lr_p0[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; + const CostType* Cp = C + x*Da; - #if CV_SIMD128 - if (true) + d = 0; + int delta0 = minLr[0][xm + NR2] + P2; + int minL0 = MAX_COST; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + v_int16 _delta0 = vx_setall_s16((short)delta0); + + v_int16 _minL0 = vx_setall_s16((short)MAX_COST); + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); - v_int16x8 _delta0 = v_setall_s16((short)delta0); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd; - v_int16x8 _minL0 = v_setall_s16((short)minL0); - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); + v_store_aligned(Lr_p + d, L0); + _minL0 = v_min(_minL0, L0); + L0 += vx_load_aligned(Sp + d); + v_store_aligned(Sp + d, L0); - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L0 = v_load(Lr_p0 + d); - - L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1); - L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1); - L0 = v_min(L0, _delta0); - L0 = L0 - _delta0 + Cpd; - - v_store(Lr_p + d, L0); - _minL0 = v_min(_minL0, L0); - L0 = L0 + v_load(Sp + d); - v_store(Sp + d, L0); - - v_int16x8 mask = _minS > L0; - _minS = v_min(_minS, L0); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 += _8; - } - short bestDispBuf[8]; - v_store(bestDispBuf, _bestDisp); - - v_int32x4 min32L, min32H; - v_expand(_minL0, min32L, min32H); - minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H)); - - v_expand(_minS, min32L, min32H); - minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H)); - - v_int16x8 ss = v_setall_s16((short)minS); - v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS); - v_uint16x8 minBit = minMask & v_LSB; - - v_uint32x4 minBitL, minBitH; - v_expand(minBit, minBitL, minBitH); - - int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH); - bestDisp = bestDispBuf[LSBTab[idx]]; + _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp); + _minS = v_min(_minS, L0); } - else - #endif + minL0 = (CostType)v_reduce_min(_minL0); + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; + + Lr_p[d] = (CostType)L0; + minL0 = std::min(minL0, L0); + + CostType Sval = Sp[d] = saturate_cast(Sp[d] + L0); + if( Sval < minS ) { - int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; - - Lr_p[d] = (CostType)L0; - minL0 = std::min(minL0, L0); - - int Sval = Sp[d] = saturate_cast(Sp[d] + L0); - if( Sval < minS ) - { - minS = Sval; - bestDisp = d; - } + minS = Sval; + bestDisp = 
(short)d; } - minLr[0][xm] = (CostType)minL0; } + minLr[0][xm] = (CostType)minL0; } else { - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes ) { - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); - - for( d = 0; d < D; d+= 8 ) - { - v_int16x8 L0 = v_load(Sp + d); - v_int16x8 mask = L0 < _minS; - _minS = v_min( L0, _minS ); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 = _d8 + _8; - } - v_int32x4 _d0, _d1; - v_expand(_minS, _d0, _d1); - minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); - v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; - - _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - v_expand(_bestDisp, _d0, _d1); - bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); + v_int16 L0 = vx_load_aligned(Sp + d); + _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp); + _minS = v_min( L0, _minS ); } - else - #endif + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int Sval = Sp[d]; + if( Sval < minS ) { - int Sval = Sp[d]; - if( Sval < minS ) - { - minS = Sval; - bestDisp = d; - } + minS = (CostType)Sval; + bestDisp = (short)d; } } } @@ -853,12 +862,13 @@ struct CalcVerticalSums: public ParallelLoopBody width = img1.cols; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); D = maxD - minD; + Da = (int)alignSize(D, v_int16::nlanes); + Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - D2 = D + 16; - costBufSize = width1*D; + costBufSize = width1*Da; CSBufSize = costBufSize*height; minLrSize = width1; - LrSize = minLrSize*D2; + LrSize = minLrSize*Dlra; hsumBufNRows = SH2*2 + 2; Cbuf = alignedBuf; Sbuf = Cbuf + CSBufSize; @@ -868,20 +878,19 @@ struct CalcVerticalSums: public ParallelLoopBody void operator()(const Range& range) const CV_OVERRIDE { static const CostType MAX_COST = SHRT_MAX; - static const int ALIGN = 16; static const int TAB_OFS = 256*4; static const int npasses = 2; int x1 = range.start, x2 = range.end, k; - size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D; - size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size - width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf + size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da; + size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size + width*(4*img1.channels()+2)*sizeof(PixType); //tempBuf Mat auxBuff; auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN); + CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH); PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize); // Simplification of index calculation - pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D; + pixDiff -= (x1>SW2 ? 
(x1 - SW2): 0)*Da; for( int pass = 1; pass <= npasses; pass++ ) { @@ -896,18 +905,18 @@ struct CalcVerticalSums: public ParallelLoopBody y1 = height-1; y2 = -1; dy = -1; } - CostType *Lr[NLR]={0}, *minLr[NLR]={0}; + CostType *Lr[2]={0}, *minLr[2]={0}; - for( k = 0; k < NLR; k++ ) + for( k = 0; k < 2; k++ ) { // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, // and will occasionally use negative indices with the arrays // we need to shift Lr[k] pointers by 1, to give the space for d=-1. // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8; - memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) ); - minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k; + // thus we shift the pointers by SIMD vector size + Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k; + memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) ); + minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k; memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) ); } @@ -929,78 +938,115 @@ struct CalcVerticalSums: public ParallelLoopBody { calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2); - memset(hsumAdd + x1*D, 0, D*sizeof(CostType)); - for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D ) + memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType)); + for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da ) { - int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x); + int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x); +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) + v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d)); +#else for( d = 0; d < D; d++ ) - hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]); + hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]); +#endif } if( y > 0 ) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* Cprev = C - costBufSize; - +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) + v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d)); +#else for( d = 0; d < D; d++ ) - C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]); - - for( x = (x1+1)*D; x < x2*D; x += D ) + C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]); +#endif + for( x = (x1+1)*Da; x < x2*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); - #if CV_SIMD128 - if (true) +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) { - for( d = 0; d < D; d += 8 ) - { - v_int16x8 hv = v_load(hsumAdd + x - D + d); - v_int16x8 Cx = v_load(Cprev + x + d); - v_int16x8 psub = v_load(pixSub + d); - v_int16x8 padd = v_load(pixAdd + d); - hv = (hv - psub + padd); - psub = v_load(hsumSub + x + d); - Cx = Cx - psub + hv; - v_store(hsumAdd + x + d, hv); - v_store(C + x + d, Cx); - } + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + 
d) + vx_load_aligned(pixAdd + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv); } - else - #endif +#else + for( d = 0; d < D; d++ ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); - } + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); } +#endif } } else { - for( x = (x1+1)*D; x < x2*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale); +#else + int scale = k == 0 ? SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[x1*Da + d] = (CostType)(C[x1*Da + d] + hsumAdd[x1*Da + d] * scale); +#endif + for( x = (x1+1)*Da; x < x2*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == 0 ) + else { - int scale = k == 0 ? 
SH2 + 1 : 1; - for( x = x1*D; x < x2*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); +/* if (y > 0) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; + const CostType* Cprev = C - costBufSize; + +#if CV_SIMD + for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) + v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = x1*Da; x < x2*Da; x++ ) + C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else*/ + if(y == 0) + { +#if CV_SIMD + for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = x1*Da; x < x2*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } } // also, clear the S buffer - for( k = x1*D; k < x2*D; k++ ) - S[k] = 0; + memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType)); } // [formula 13 in the paper] @@ -1015,7 +1061,7 @@ struct CalcVerticalSums: public ParallelLoopBody for( x = x1; x != x2; x++ ) { - int xd = x*D2; + int xd = x*Dlra; int delta = minLr[1][x] + P2; @@ -1024,64 +1070,39 @@ struct CalcVerticalSums: public ParallelLoopBody Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; - #if CV_SIMD128 - if (true) + CostType& minL = minLr[0][x]; + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta = vx_setall_s16((short)delta); + v_int16 _minL = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); - - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_store(Sp + d, Sval); - } - - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr[0][x] = (CostType)v_reduce_min(min12); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); } - else - #endif + minL = v_reduce_min(_minL); +#else + minL = MAX_COST; +#endif + for( ; d < D; d++ ) { - int minL = MAX_COST; + int Cpd = Cp[d], L; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + Lr_p[d] = (CostType)L; + minL = std::min(minL, (CostType)L); - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } - minLr[0][x] = (CostType)minL; + Sp[d] = saturate_cast(Sp[d] + L); } } @@ -1091,7 +1112,6 @@ struct CalcVerticalSums: public ParallelLoopBody } } } - static const int NLR = 2; const Mat& img1; const Mat& img2; CostType* Cbuf; @@ -1100,8 +1120,7 @@ struct 
CalcVerticalSums: public ParallelLoopBody PixType* clipTab; int minD; int maxD; - int D; - int D2; + int D, Da, Dlra; int SH2; int SW2; int width; @@ -1135,11 +1154,12 @@ struct CalcHorizontalSums: public ParallelLoopBody INVALID_DISP = minD - 1; INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; D = maxD - minD; + Da = (int)alignSize(D, v_int16::nlanes); + Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - costBufSize = width1*D; + costBufSize = width1*Da; CSBufSize = costBufSize*height; - D2 = D + 16; - LrSize = 2 * D2; + LrSize = 2 * Dlra; Cbuf = alignedBuf; Sbuf = Cbuf + CSBufSize; } @@ -1147,11 +1167,11 @@ struct CalcHorizontalSums: public ParallelLoopBody void operator()(const Range& range) const CV_OVERRIDE { int y1 = range.start, y2 = range.end; - size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32; + size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)); Mat auxBuff; auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8; + CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes; CostType* disp2cost = Lr + LrSize; DispType* disp2ptr = (DispType*)(disp2cost + width); @@ -1164,15 +1184,26 @@ struct CalcHorizontalSums: public ParallelLoopBody CostType* C = Cbuf + y*costBufSize; CostType* S = Sbuf + y*costBufSize; - for( x = 0; x < width; x++ ) + x = 0; +#if CV_SIMD + v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED); + v_int16 v_max_cost = vx_setall_s16(MAX_COST); + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + { + v_store(disp1ptr + x, v_inv_dist); + v_store(disp2ptr + x, v_inv_dist); + v_store(disp2cost + x, v_max_cost); + } +#endif + for( ; x < width; x++ ) { disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp2cost[x] = MAX_COST; } // clear buffers - memset( Lr - 8, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST; + memset( Lr, 0, LrSize*sizeof(CostType) ); + Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; minLr = 0; // [formula 13 in the paper] @@ -1189,70 +1220,43 @@ struct CalcHorizontalSums: public ParallelLoopBody { int delta = minLr + P2; - CostType* Lr_ppr = Lr + ((x&1)? 0 : D2); + CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra); - CostType* Lr_p = Lr + ((x&1)? D2 :0); - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + CostType* Lr_p = Lr + ((x&1)? 
Dlra :0); + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta = vx_setall_s16((short)delta); + v_int16 _minL = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes) { - v_int16x8 _P1 = v_setall_s16((short)P1); - - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_store(Sp + d, Sval); - } - - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr = (CostType)v_reduce_min(min12); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); } - else - #endif + minLr = v_reduce_min(_minL); +#else + minLr = MAX_COST; +#endif + for( ; d < D; d++ ) { - minLr = MAX_COST; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minLr = (CostType)std::min((int)minLr, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } + int Cpd = Cp[d], L; + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + Lr_p[d] = (CostType)L; + minLr = std::min(minLr, (CostType)L); + Sp[d] = saturate_cast(Sp[d] + L); } } - memset( Lr - 8, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST; + memset( Lr, 0, LrSize*sizeof(CostType) ); + Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; minLr = 0; @@ -1260,88 +1264,55 @@ struct CalcHorizontalSums: public ParallelLoopBody { int delta = minLr + P2; - CostType* Lr_ppr = Lr + ((x&1)? 0 :D2); + CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra); - CostType* Lr_p = Lr + ((x&1)? D2 :0); - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; + CostType* Lr_p = Lr + ((x&1)? 
Dlra :0); + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; + CostType minS = MAX_COST; + short bestDisp = -1; minLr = MAX_COST; - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + v_int16 _delta = vx_setall_s16((short)delta); + + v_int16 _minL = vx_setall_s16((short)MAX_COST); + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + L += vx_load_aligned(Sp + d); + v_store_aligned(Sp + d, L); - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); - - for( d = 0; d < D; d+= 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_int16x8 mask = Sval < _minS; - _minS = v_min( Sval, _minS ); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 = _d8 + _8; - - v_store(Sp + d, Sval); - } - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr = (CostType)v_reduce_min(min12); - - v_int32x4 _d0, _d1; - v_expand(_minS, _d0, _d1); - minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); - v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; - - _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - v_expand(_bestDisp, _d0, _d1); - bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); + _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp); + _minS = v_min( L, _minS ); } - else - #endif + minLr = v_reduce_min(_minL); + + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int Cpd = Cp[d], L; + + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + + Lr_p[d] = (CostType)L; + minLr = std::min(minLr, (CostType)L); + + Sp[d] = saturate_cast(Sp[d] + L); + if( Sp[d] < minS ) { - int Cpd = Cp[d], L; - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minLr = (CostType)std::min((int)minLr, L); - - Sp[d] = saturate_cast(Sp[d] + L); - if( Sp[d] < minS ) - { - minS = Sp[d]; - bestDisp = d; - } + minS = Sp[d]; + bestDisp = (short)d; } } + //Some postprocessing procedures and saving for( d = 0; d < D; d++ ) { @@ -1392,7 +1363,6 @@ struct CalcHorizontalSums: public ParallelLoopBody static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; static const int DISP_SCALE = (1 << DISP_SHIFT); static const CostType MAX_COST = SHRT_MAX; - static const int ALIGN = 16; const Mat& img1; const Mat& img2; Mat& disp1; @@ -1400,8 +1370,7 @@ struct CalcHorizontalSums: public ParallelLoopBody CostType* Sbuf; int minD; int maxD; - int D; - int D2; + int D, Da, Dlra; int width; int width1; int height; @@ -1435,7 
+1404,6 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params, Mat& buffer ) { - const int ALIGN = 16; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); int minD = params.minDisparity, maxD = minD + params.numDisparities; @@ -1445,7 +1413,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; + int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; + int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int SH2 = SADWindowSize.height/2; int INVALID_DISP = minD - 1; int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; @@ -1461,25 +1430,20 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, return; } - CV_Assert( D % 16 == 0 ); - - int D2 = D+16; + // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) + // we keep pixel difference cost (C) and the summary cost over 4 directions (S). + // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: // for dynamic programming we need the current row and // the previous row, i.e. 2 rows in total - const int NLR = 2; - - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) - // we keep pixel difference cost (C) and the summary cost over 4 directions (S). - // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) size_t costBufSize = width1*D; size_t CSBufSize = costBufSize*height; - size_t minLrSize = width1 , LrSize = minLrSize*D2; + size_t minLrSize = width1 , LrSize = minLrSize*Dlra; int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] - costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf - CSBufSize*2*sizeof(CostType) + 1024; // C, S + size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S + costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf + ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[] if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) @@ -1488,7 +1452,7 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, } // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); + CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); // add P2 to every C(x,y). 
it saves a few operations in the inner loops for(k = 0; k < (int)CSBufSize; k++ ) @@ -1501,7 +1465,7 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, ////////////////////////////////////////////////////////////////////////////////////////////////////// -void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, +void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, PixType*& tmpBuf, CostType*& horPassCostVolume, CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, @@ -1517,7 +1481,7 @@ struct SGBM3WayMainLoop : public ParallelLoopBody int stripe_overlap; int width,height; - int minD, maxD, D; + int minD, maxD, D, Da; int minX1, maxX1, width1; int SW2, SH2; @@ -1528,10 +1492,13 @@ struct SGBM3WayMainLoop : public ParallelLoopBody int TAB_OFS, ftzero; PixType* clipTab; - +#if CV_SIMD + short idx_row[v_int16::nlanes]; +#endif SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap); void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const; void operator () (const Range& range) const CV_OVERRIDE; + template void impl(const Range& range) const; }; SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap): @@ -1544,7 +1511,7 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli width = img1->cols; height = img1->rows; minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD; minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1; - CV_Assert( D % 16 == 0 ); + Da = (int)alignSize(D, v_int16::nlanes); SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1; @@ -1552,22 +1519,26 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; disp12MaxDiff = params.disp12MaxDiff > 0 ? 
params.disp12MaxDiff : 1; - costBufSize = width1*D; + costBufSize = width1*Da; hsumBufNRows = SH2*2 + 2; TAB_OFS = 256*4; ftzero = std::max(params.preFilterCap, 15) | 1; +#if CV_SIMD + for(short i = 0; i < v_int16::nlanes; ++i) + idx_row[i] = i; +#endif } -void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, +void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, PixType*& tmpBuf, CostType*& horPassCostVolume, CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, CostType*& disp2CostBuf, short*& disp2Buf) { // allocating all the required memory: - int costVolumeLineSize = width1*D; + int costVolumeLineSize = width1*Da; int width1_ext = width1+2; - int costVolumeLineSize_ext = width1_ext*D; + int costVolumeLineSize_ext = width1_ext*Da; int hsumBufNRows = SH2*2 + 2; // main buffer to store matching costs for the current line: @@ -1576,49 +1547,55 @@ void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, in // auxiliary buffers for the raw matching cost computation: int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType); int pixDiffSize = costVolumeLineSize*sizeof(CostType); - int tmpBufSize = width*16*num_ch*sizeof(PixType); + int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType); // auxiliary buffers for the matching cost aggregation: int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation + int rightPassBufSize = Da * sizeof(CostType); // additional small buffer for the right-to-left pass int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line - int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass // buffers for the pseudo-LRC check: int disp2CostBufSize = width*sizeof(CostType); int disp2BufSize = width*sizeof(short); // sum up the sizes of all the buffers: - size_t totalBufSize = curCostVolumeLineSize + + size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize + hsumBufSize + pixDiffSize + - tmpBufSize + horPassCostVolumeSize + vertPassCostVolumeSize + - vertPassMinSize + rightPassBufSize + + vertPassMinSize + disp2CostBufSize + disp2BufSize + - 16; //to compensate for the alignPtr shifts + tmpBufSize; if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) buffer.reserveBuffer(totalBufSize); // set up all the pointers: - curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16); + curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); hsumBuf = curCostVolumeLine + costVolumeLineSize; pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows; - tmpBuf = (PixType*)(pixDiff + costVolumeLineSize); - horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch); + horPassCostVolume = pixDiff + costVolumeLineSize; vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext; rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext; - vertPassMin = rightPassBuf + D; + vertPassMin = rightPassBuf + Da; + disp2CostBuf = vertPassMin + width1_ext; disp2Buf = disp2CostBuf + width; + tmpBuf = (PixType*)(disp2Buf + width); // initialize memory: memset(buffer.ptr(),0,totalBufSize); - for(int i=0;i src_start_idx ) { const CostType* hsumSub = hsumBuf + 
(std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; - for( x = D; x < width1*D; x += D ) - { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - -#if CV_SIMD128 - if (true) - { - v_int16x8 hv_reg; - for( d = 0; d < D; d+=8 ) - { - hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d)); - v_store_aligned(hsumAdd+x+d,hv_reg); - v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d))); - } - } - else +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d)); +#else + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]); #endif + + for( x = Da; x < width1*Da; x += Da ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + v_int16 hv_reg; + for( d = 0; d < Da; d+=v_int16::nlanes ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]); - } + hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d); + v_store_aligned(hsumAdd+x+d,hv_reg); + v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d)); } +#else + for( d = 0; d < D; d++ ) + { + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]); + } +#endif } } else { - for( x = D; x < width1*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale); +#else + int scale = k == src_start_idx ? SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] * scale); +#endif + for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - - for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else + for (d = 0; d < D; d++) + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == src_start_idx ) + else { - int scale = k == src_start_idx ? 
SH2 + 1 : 1; - for( x = 0; x < width1*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); + if( y > src_start_idx ) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; +#if CV_SIMD + for( x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x)); +#else + for( x = 0; x < width1*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else + { +#if CV_SIMD + for( x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = 0; x < width1*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } } } -#if CV_SIMD128 -// define some additional reduce operations: -inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val) -{ - v_int16x8 v_min = v_setall_s16(min_val); - v_int16x8 v_mask = v_min == val; - v_int16x8 v_pos = (pos & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - - return v_reduce_min(v_pos); -} -#endif - // performing SGM cost accumulation from left to right (result is stored in leftBuf) and // in-place cost accumulation from top to bottom (result is stored in topBuf) +template inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs, CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2) { -#if CV_SIMD128 - if (true) + int i = 0; +#if CV_SIMD + int Da = (int)alignSize(D, v_int16::nlanes); + v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); + + v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast(leftMinCost+P2)); + v_int16 leftMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_leftBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_leftBuf = vx_load_aligned(leftBuf_prev); + + v_int16 topMinCostP2_reg = vx_setall_s16(cv::saturate_cast(topMinCost+P2)); + v_int16 topMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_topBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_topBuf = vx_load_aligned(topBuf); + + v_int16 src2; + v_int16 src_shifted_left,src_shifted_right; + v_int16 res; + + for(;i(P1)); - - v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast(leftMinCost+P2)); - v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev); - - v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast(topMinCost+P2)); - v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_topBuf = v_load_aligned(topBuf); - - v_int16x8 src2; - v_int16x8 src_shifted_left,src_shifted_right; - v_int16x8 res; - - for(int i=0;i (src0_leftBuf,src1_leftBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg; - - // process and save current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); - leftMinCost_new_reg = v_min(leftMinCost_new_reg,res); - v_store_aligned(leftBuf+i, res); - - //update src buffers: - src0_leftBuf = src1_leftBuf; - src1_leftBuf = src2; - - //process topBuf: - //lookahead load: - src2 = v_load_aligned(topBuf+i+8); - - //get shifted versions of the current block and add P1: - src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg; - - // process and save 
current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); - topMinCost_new_reg = v_min(topMinCost_new_reg,res); - v_store_aligned(topBuf+i, res); - - //update src buffers: - src0_topBuf = src1_topBuf; - src1_topBuf = src2; - } - - // a bit different processing for the last cycle of the loop: //process leftBuf: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg; + //lookahead load: + src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes); - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); - leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res)); - v_store_aligned(leftBuf+D-8, res); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf); + src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 ); + + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost_new_reg = v_min(leftMinCost_new_reg,res); + v_store_aligned(leftBuf+i, res); + + //update src buffers: + src0_leftBuf = src1_leftBuf; + src1_leftBuf = src2; //process topBuf: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg; + //lookahead load: + src2 = vx_load_aligned(topBuf+i+v_int16::nlanes); - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf); + src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 ); + + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + topMinCost_new_reg = v_min(topMinCost_new_reg,res); + v_store_aligned(topBuf+i, res); + + //update src buffers: + src0_topBuf = src1_topBuf; + src1_topBuf = src2; + } + + // a bit different processing for the last cycle of the loop: + if(x_nlanes) + { + src2 = vx_setall_s16(SHRT_MAX); + //process leftBuf: + src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf); + src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 ); + + res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res)); + v_store_aligned(leftBuf+Da-v_int16::nlanes, res); + + //process topBuf: + src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf); + src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 ); + + res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res)); - v_store_aligned(topBuf+D-8, res); + v_store_aligned(topBuf+Da-v_int16::nlanes, res); } else -#endif + { + CostType leftMinCost_new = v_reduce_min(leftMinCost_new_reg); + CostType topMinCost_new = v_reduce_min(topMinCost_new_reg); + 
CostType leftBuf_prev_i_minus_1 = i > 0 ? leftBuf_prev[i-1] : SHRT_MAX; + CostType topBuf_i_minus_1 = i > 0 ? topBuf[i-1] : SHRT_MAX; +#else { CostType leftMinCost_new = SHRT_MAX; CostType topMinCost_new = SHRT_MAX; - int leftMinCost_P2 = leftMinCost + P2; - int topMinCost_P2 = topMinCost + P2; CostType leftBuf_prev_i_minus_1 = SHRT_MAX; CostType topBuf_i_minus_1 = SHRT_MAX; +#endif + int leftMinCost_P2 = leftMinCost + P2; + int topMinCost_P2 = topMinCost + P2; CostType tmp; - - for(int i=0;i(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2); leftBuf_prev_i_minus_1 = leftBuf_prev[i]; @@ -1825,83 +1846,86 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the // optimal disparity value with minimum accumulated cost +template inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs, - CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost) + CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost) { -#if CV_SIMD128 - if (true) + int i = 0; +#if CV_SIMD + int Da = (int)alignSize(D, v_int16::nlanes); + v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); + + v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast(rightMinCost+P2)); + v_int16 rightMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_rightBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_rightBuf = vx_load(rightBuf); + + v_int16 src2; + v_int16 src_shifted_left,src_shifted_right; + v_int16 res; + + v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX); + v_int16 min_sum_pos_reg = vx_setall_s16(0); + + for(;i(P1)); + //lookahead load: + src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes); - v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast(rightMinCost+P2)); - v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_rightBuf = v_load(rightBuf); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf); + src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 ); - v_int16x8 src2; - v_int16x8 src_shifted_left,src_shifted_right; - v_int16x8 res; + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + rightMinCost_new_reg = v_min(rightMinCost_new_reg,res); + v_store_aligned(rightBuf+i, res); - v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX); - v_int16x8 min_sum_pos_reg = v_setall_s16(0); - v_int16x8 loop_idx(0,1,2,3,4,5,6,7); - v_int16x8 eight_reg = v_setall_s16(8); + // compute and save total cost: + res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i); + v_store_aligned(leftBuf+i, res); - for(int i=0;i (src0_rightBuf,src1_rightBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg; + //update src: + src0_rightBuf = src1_rightBuf; + src1_rightBuf = src2; + } - // process and save current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); - rightMinCost_new_reg = 
v_min(rightMinCost_new_reg,res); - v_store_aligned(rightBuf+i, res); + // a bit different processing for the last cycle of the loop: + if(x_nlanes) + { + src2 = vx_setall_s16(SHRT_MAX); + src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf); + src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 ); - // compute and save total cost: - res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i); - v_store_aligned(leftBuf+i, res); - - // track disparity value with the minimum cost: - min_sum_cost_reg = v_min(min_sum_cost_reg,res); - min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); - loop_idx = loop_idx+eight_reg; - - //update src: - src0_rightBuf = src1_rightBuf; - src1_rightBuf = src2; - } - - // a bit different processing for the last cycle of the loop: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg; - - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res)); - v_store_aligned(rightBuf+D-8, res); + v_store_aligned(rightBuf+D-v_int16::nlanes, res); - res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8); - v_store_aligned(leftBuf+D-8, res); + res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes); + v_store_aligned(leftBuf+D-v_int16::nlanes, res); min_sum_cost_reg = v_min(min_sum_cost_reg,res); - min_cost = v_reduce_min(min_sum_cost_reg); - min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); - optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost); + min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)D-v_int16::nlanes) - min_sum_pos_reg)); + min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp); } else -#endif + { + CostType rightMinCost_new = v_reduce_min(rightMinCost_new_reg); + CostType rightBuf_i_minus_1 = i > 0 ? 
rightBuf[i] : SHRT_MAX; + min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp); +#else { CostType rightMinCost_new = SHRT_MAX; - int rightMinCost_P2 = rightMinCost + P2; CostType rightBuf_i_minus_1 = SHRT_MAX; - CostType tmp; min_cost = SHRT_MAX; - - for(int i=0;i<D-1;i++) + for(;i<D-1;i++) { rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2); @@ -1910,7 +1934,7 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]); if(leftBuf[i]<min_cost) { optimal_disp = (short)i; min_cost = leftBuf[i]; } } leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]); if(leftBuf[D-1]<min_cost) { optimal_disp = (short)(D-1); min_cost = leftBuf[D-1]; } } +void SGBM3WayMainLoop::operator () (const Range& range) const +{ + if (D == Da) impl<true>(range); + else impl<false>(range); +} +template<bool x_nlanes> +void SGBM3WayMainLoop::impl(const Range& range) const { // force separate processing of stripes: if(range.end>range.start+1) @@ -1958,7 +1988,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const PixType* tmpBuf; CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf; short* disp2Buf; - getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2, + getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2, curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume, vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf); @@ -1975,73 +2005,75 @@ void SGBM3WayMainLoop::operator () (const Range& range) const disp2Buf[x] = (short)INVALID_DISP_SCALED; disp2CostBuf[x] = SHRT_MAX; } - CostType* C = curCostVolumeLine - D; + CostType* C = curCostVolumeLine - Da; CostType prev_min, min_cost; - int d, best_d; + int d; + short best_d; d = best_d = 0; // forward pass prev_min=0; - for (int x=D;x<(1+width1)*D;x+=D) - accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2); + for (int x=Da;x<(1+width1)*Da;x+=Da) + accumulateCostsLeftTop<x_nlanes>(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2); //backward pass - memset(rightPassBuf,0,D*sizeof(CostType)); + memset(rightPassBuf,0,Da*sizeof(CostType)); prev_min=0; - for (int x=width1*D;x>=D;x-=D) + for (int x=width1*Da;x>=Da;x-=Da) { - accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); + accumulateCostsRight<x_nlanes>(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); if(uniquenessRatio>0) { -#if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + horPassCostVolume+=x; + int thresh = (100*min_cost)/(100-uniquenessRatio); + v_int16 thresh_reg = vx_setall_s16((short)(thresh+1)); + v_int16 d1 = vx_setall_s16((short)(best_d-1)); + v_int16 d2 = vx_setall_s16((short)(best_d+1)); + v_int16 eight_reg = vx_setall_s16(v_int16::nlanes); + v_int16 cur_d = vx_load(idx_row); + v_int16 mask; + + for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes ) { - horPassCostVolume+=x; - int thresh = (100*min_cost)/(100-uniquenessRatio); - v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1)); - v_int16x8 d1 = v_setall_s16((short)(best_d-1)); - v_int16x8 d2 = v_setall_s16((short)(best_d+1)); - v_int16x8 eight_reg = v_setall_s16(8); - v_int16x8 cur_d(0,1,2,3,4,5,6,7); - v_int16x8 mask,cost1,cost2; - - for( d = 0; d < D; d+=16 ) - { - cost1 = v_load_aligned(horPassCostVolume+d); - cost2 = v_load_aligned(horPassCostVolume+d+8); - - mask = cost1 < thresh_reg; - mask = mask & ( (cur_d<d1) | (cur_d>d2) ); - if( v_check_any(mask) ) - break; - - cur_d = cur_d+eight_reg; - - mask = cost2 < thresh_reg; - mask = mask & ( (cur_d<d1) | (cur_d>d2) ); -
if( v_check_any(mask) ) - break; - - cur_d = cur_d+eight_reg; - } - horPassCostVolume-=x; + mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) ); + cur_d = cur_d+eight_reg; + if( v_check_any(mask) ) + break; + mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) ); + cur_d = cur_d+eight_reg; + if( v_check_any(mask) ) + break; } - else -#endif + if( d <= D - 2*v_int16::nlanes ) { - for( d = 0; d < D; d++ ) + horPassCostVolume-=x; + continue; + } + if( d <= D - v_int16::nlanes ) + { + if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) ) { - if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 ) - break; + horPassCostVolume-=x; + continue; } + d+=v_int16::nlanes; + } + horPassCostVolume-=x; +#endif + for( ; d < D; d++ ) + { + if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 ) + break; } if( d < D ) continue; } d = best_d; - int _x2 = x/D - 1 + minX1 - d - minD; + int _x2 = x/Da - 1 + minX1 - d - minD; if( _x2>=0 && _x2<width && disp2CostBuf[_x2] > min_cost ) { disp2CostBuf[_x2] = min_cost; @@ -2059,7 +2091,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const else d *= DISP_SCALE; - disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE); + disp_row[(x/Da)-1 + minX1] = (DispType)(d + minD*DISP_SCALE); } for(int x = minX1; x < maxX1; x++ ) From a9d23a64792f0724601cb84908529e35eaf354b0 Mon Sep 17 00:00:00 2001 From: Igor Murzov Date: Fri, 25 Oct 2019 19:45:11 +0300 Subject: [PATCH 02/13] Fix wording in some tutorials --- .../basic_linear_transform.markdown | 6 +-- .../how_to_scan_images.markdown | 42 +++++++++---------- .../mat_mask_operations.markdown | 6 +-- .../mat_the_basic_image_container.markdown | 40 +++++++++--------- .../core/include/opencv2/core/saturate.hpp | 2 - 5 files changed, 47 insertions(+), 49 deletions(-) diff --git a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown index 803de71acb..26a5152e63 100644 --- a/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown +++ b/doc/tutorials/core/basic_linear_transform/basic_linear_transform.markdown @@ -150,7 +150,7 @@ We observe that @ref cv::Mat::zeros returns a Matlab-style zero initializer base Notice the following (**C++ code only**): - To access each pixel in the images we are using this syntax: *image.at\<Vec3b\>(y,x)[c]* - where *y* is the row, *x* is the column and *c* is R, G or B (0, 1 or 2). + where *y* is the row, *x* is the column and *c* is B, G or R (0, 1 or 2). - Since the operation \f$\alpha \cdot p(i,j) + \beta\f$ can give values out of range or not integers (if \f$\alpha\f$ is float), we use cv::saturate_cast to make sure the values are valid. @@ -220,12 +220,12 @@ gamma correction. ### Brightness and contrast adjustments Increasing (/ decreasing) the \f$\beta\f$ value will add (/ subtract) a constant value to every pixel. Pixel values outside of the [0 ; 255] -range will be saturated (i.e. a pixel value higher (/ lesser) than 255 (/ 0) will be clamp to 255 (/ 0)). +range will be saturated (i.e. a pixel value higher (/ lesser) than 255 (/ 0) will be clamped to 255 (/ 0)). ![In light gray, histogram of the original image, in dark gray when brightness = 80 in Gimp](images/Basic_Linear_Transform_Tutorial_hist_beta.png) The histogram represents for each color level the number of pixels with that color level.
A dark image will have many pixels with -low color value and thus the histogram will present a peak in his left part. When adding a constant bias, the histogram is shifted to the +low color value and thus the histogram will present a peak in its left part. When adding a constant bias, the histogram is shifted to the right as we have added a constant bias to all the pixels. The \f$\alpha\f$ parameter will modify how the levels spread. If \f$ \alpha < 1 \f$, the color levels will be compressed and the result diff --git a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown index 0140e14058..d41844c4f7 100644 --- a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown +++ b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown @@ -10,7 +10,7 @@ Goal We'll seek answers for the following questions: - How to go through each and every pixel of an image? -- How is OpenCV matrix values stored? +- How are OpenCV matrix values stored? - How to measure the performance of our algorithm? - What are lookup tables and why use them? @@ -45,13 +45,13 @@ operation. In case of the *uchar* system this is 256 to be exact. Therefore, for larger images it would be wise to calculate all possible values beforehand and during the assignment just make the assignment, by using a lookup table. Lookup tables are simple arrays (having one or more dimensions) that for a given input value variation holds the final output value. -Its strength lies that we do not need to make the calculation, we just need to read the result. +Its strength is that we do not need to make the calculation, we just need to read the result. -Our test case program (and the sample presented here) will do the following: read in a console line -argument image (that may be either color or gray scale - console line argument too) and apply the -reduction with the given console line argument integer value. In OpenCV, at the moment there are +Our test case program (and the code sample below) will do the following: read in an image passed +as a command line argument (it may be either color or grayscale) and apply the reduction +with the given command line argument integer value. In OpenCV, at the moment there are three major ways of going through an image pixel by pixel. To make things a little more interesting -will make the scanning for each image using all of these methods, and print out how long it took. +we'll make the scanning of the image using each of these methods, and print out how long it took. You can download the full source code [here ](https://github.com/opencv/opencv/tree/3.4/samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp) or look it up in @@ -59,7 +59,7 @@ the samples directory of OpenCV at the cpp tutorial code for the core section. I @code{.bash} how_to_scan_images imageName.jpg intValueToReduce [G] @endcode -The final argument is optional. If given the image will be loaded in gray scale format, otherwise +The final argument is optional. If given the image will be loaded in grayscale format, otherwise the BGR color space is used. The first thing is to calculate the lookup table. @snippet how_to_scan_images.cpp dividewith @@ -71,8 +71,8 @@ No OpenCV specific stuff here. Another issue is how do we measure time? Well OpenCV offers two simple functions to achieve this cv::getTickCount() and cv::getTickFrequency() . 
The first returns the number of ticks of your systems CPU from a certain event (like since you booted your system). The second returns how -many times your CPU emits a tick during a second. So to measure in seconds the number of time -elapsed between two operations is easy as: +many times your CPU emits a tick during a second. So, measuring amount of time elapsed between +two operations is as easy as: @code{.cpp} double t = (double)getTickCount(); // do something ... @@ -85,8 +85,8 @@ How is the image matrix stored in memory? ----------------------------------------- As you could already read in my @ref tutorial_mat_the_basic_image_container tutorial the size of the matrix -depends on the color system used. More accurately, it depends from the number of channels used. In -case of a gray scale image we have something like: +depends on the color system used. More accurately, it depends on the number of channels used. In +case of a grayscale image we have something like: ![](tutorial_how_matrix_stored_1.png) @@ -117,12 +117,12 @@ three channels so we need to pass through three times more items in each row. There's another way of this. The *data* data member of a *Mat* object returns the pointer to the first row, first column. If this pointer is null you have no valid input in that object. Checking this is the simplest method to check if your image loading was a success. In case the storage is -continuous we can use this to go through the whole data pointer. In case of a gray scale image this +continuous we can use this to go through the whole data pointer. In case of a grayscale image this would look like: @code{.cpp} uchar* p = I.data; -for( unsigned int i =0; i < ncol*nrows; ++i) +for( unsigned int i = 0; i < ncol*nrows; ++i) *p++ = table[*p]; @endcode You would get the same result. However, this code is a lot harder to read later on. It gets even @@ -135,7 +135,7 @@ The iterator (safe) method In case of the efficient way making sure that you pass through the right amount of *uchar* fields and to skip the gaps that may occur between the rows was your responsibility. The iterator method is -considered a safer way as it takes over these tasks from the user. All you need to do is ask the +considered a safer way as it takes over these tasks from the user. All you need to do is to ask the begin and the end of the image matrix and then just increase the begin iterator until you reach the end. To acquire the value *pointed* by the iterator use the \* operator (add it before it). @@ -152,17 +152,17 @@ On-the-fly address calculation with reference returning The final method isn't recommended for scanning. It was made to acquire or modify somehow random elements in the image. Its basic usage is to specify the row and column number of the item you want -to access. During our earlier scanning methods you could already observe that is important through +to access. During our earlier scanning methods you could already notice that it is important through what type we are looking at the image. It's no different here as you need to manually specify what -type to use at the automatic lookup. You can observe this in case of the gray scale images for the +type to use at the automatic lookup. 
You can observe this in case of the grayscale images for the following source code (the usage of the + cv::Mat::at() function): @snippet how_to_scan_images.cpp scan-random -The functions takes your input type and coordinates and calculates on the fly the address of the +The function takes your input type and coordinates and calculates the address of the queried item. Then returns a reference to that. This may be a constant when you *get* the value and -non-constant when you *set* the value. As a safety step in **debug mode only**\* there is performed -a check that your input coordinates are valid and does exist. If this isn't the case you'll get a +non-constant when you *set* the value. As a safety step in **debug mode only**\* there is a check +performed that your input coordinates are valid and do exist. If this isn't the case you'll get a nice output message of this on the standard error output stream. Compared to the efficient way in release mode the only difference in using this is that for every element of the image you'll get a new row pointer for what we use the C operator[] to acquire the column element. @@ -173,7 +173,7 @@ OpenCV has a cv::Mat_ data type. It's the same as Mat with the extra need that a you need to specify the data type through what to look at the data matrix, however in return you can use the operator() for fast access of items. To make things even better this is easily convertible from and to the usual cv::Mat data type. A sample usage of this you can see in case of the -color images of the upper function. Nevertheless, it's important to note that the same operation +color images of the function above. Nevertheless, it's important to note that the same operation (with the same runtime speed) could have been done with the cv::Mat::at function. It's just a less to write for the lazy programmer trick. @@ -195,7 +195,7 @@ Finally call the function (I is our input image and J the output one): Performance Difference ---------------------- -For the best result compile the program and run it on your own speed. To make the differences more +For the best result compile the program and run it yourself. To make the differences more clear, I've used a quite large (2560 X 1600) image. The performance presented here are for color images. For a more accurate value I've averaged the value I got from the call of the function for hundred times. diff --git a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown index bd74267f54..97e4052a94 100644 --- a/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown +++ b/doc/tutorials/core/mat-mask-operations/mat_mask_operations.markdown @@ -4,7 +4,7 @@ Mask operations on matrices {#tutorial_mat_mask_operations} @prev_tutorial{tutorial_how_to_scan_images} @next_tutorial{tutorial_mat_operations} -Mask operations on matrices are quite simple. The idea is that we recalculate each pixels value in +Mask operations on matrices are quite simple. The idea is that we recalculate each pixel's value in an image according to a mask matrix (also known as kernel). This mask holds values that will adjust how much influence neighboring pixels (and the current pixel) have on the new pixel value. From a mathematical point of view we make a weighted average, with our specified values. 
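To see what this "weighted average with our specified values" means in practice, here is a minimal sketch (illustrative only, not part of the patch) that applies the tutorial's contrast-enhancement mask with cv::filter2D, assuming `src` is an already-loaded 8-bit image:

@code{.cpp}
// The mask weights the center pixel by 5 and its four direct
// neighbours by -1, i.e. the contrast-enhancement kernel above.
cv::Mat kernel = (cv::Mat_<char>(3, 3) <<  0, -1,  0,
                                          -1,  5, -1,
                                           0, -1,  0);
cv::Mat dst;
cv::filter2D(src, dst, src.depth(), kernel);
@endcode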
@@ -12,7 +12,7 @@ mathematical point of view we make a weighted average, with our specified values Our test case ------------- -Let us consider the issue of an image contrast enhancement method. Basically we want to apply for +Let's consider the issue of an image contrast enhancement method. Basically we want to apply for every pixel of the image the following formula: \f[I(i,j) = 5*I(i,j) - [ I(i-1,j) + I(i+1,j) + I(i,j-1) + I(i,j+1)]\f]\f[\iff I(i,j)*M, \text{where } @@ -144,7 +144,7 @@ Then we apply the sum and put the new value in the Result matrix. The filter2D function --------------------- -Applying such filters are so common in image processing that in OpenCV there exist a function that +Applying such filters are so common in image processing that in OpenCV there is a function that will take care of applying the mask (also called a kernel in some places). For this you first need to define an object that holds the mask: diff --git a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown index 882b7a4a0b..f6a1a0a4fb 100644 --- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown +++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown @@ -61,7 +61,7 @@ The last thing we want to do is further decrease the speed of your program by ma copies of potentially *large* images. To tackle this issue OpenCV uses a reference counting system. The idea is that each *Mat* object has -its own header, however the matrix may be shared between two instance of them by having their matrix +its own header, however a matrix may be shared between two *Mat* objects by having their matrix pointers point to the same address. Moreover, the copy operators **will only copy the headers** and the pointer to the large matrix, not the data itself. @@ -74,32 +74,32 @@ Mat B(A); // Use the copy constructor C = A; // Assignment operator @endcode -All the above objects, in the end, point to the same single data matrix. Their headers are -different, however, and making a modification using any of them will affect all the other ones as -well. In practice the different objects just provide different access method to the same underlying -data. Nevertheless, their header parts are different. The real interesting part is that you can -create headers which refer to only a subsection of the full data. For example, to create a region of -interest (*ROI*) in an image you just create a new header with the new boundaries: +All the above objects, in the end, point to the same single data matrix and making a modification +using any of them will affect all the other ones as well. In practice the different objects just +provide different access methods to the same underlying data. Nevertheless, their header parts are +different. The real interesting part is that you can create headers which refer to only a subsection +of the full data. 
For example, to create a region of interest (*ROI*) in an image you just create +a new header with the new boundaries: @code{.cpp} Mat D (A, Rect(10, 10, 100, 100) ); // using a rectangle Mat E = A(Range::all(), Range(1,3)); // using row and column boundaries @endcode -Now you may ask if the matrix itself may belong to multiple *Mat* objects who takes responsibility +Now you may ask -- if the matrix itself may belong to multiple *Mat* objects who takes responsibility for cleaning it up when it's no longer needed. The short answer is: the last object that used it. This is handled by using a reference counting mechanism. Whenever somebody copies a header of a -*Mat* object, a counter is increased for the matrix. Whenever a header is cleaned this counter is -decreased. When the counter reaches zero the matrix too is freed. Sometimes you will want to copy -the matrix itself too, so OpenCV provides the @ref cv::Mat::clone() and @ref cv::Mat::copyTo() functions. +*Mat* object, a counter is increased for the matrix. Whenever a header is cleaned, this counter +is decreased. When the counter reaches zero the matrix is freed. Sometimes you will want to copy +the matrix itself too, so OpenCV provides @ref cv::Mat::clone() and @ref cv::Mat::copyTo() functions. @code{.cpp} Mat F = A.clone(); Mat G; A.copyTo(G); @endcode -Now modifying *F* or *G* will not affect the matrix pointed by the *Mat* header. What you need to +Now modifying *F* or *G* will not affect the matrix pointed by the *A*'s header. What you need to remember from all this is that: - Output image allocation for OpenCV functions is automatic (unless specified otherwise). -- You do not need to think about memory management with OpenCVs C++ interface. +- You do not need to think about memory management with OpenCV's C++ interface. - The assignment operator and the copy constructor only copies the header. - The underlying matrix of an image may be copied using the @ref cv::Mat::clone() and @ref cv::Mat::copyTo() functions. @@ -109,7 +109,7 @@ Storing methods This is about how you store the pixel values. You can select the color space and the data type used. The color space refers to how we combine color components in order to code a given color. The -simplest one is the gray scale where the colors at our disposal are black and white. The combination +simplest one is the grayscale where the colors at our disposal are black and white. The combination of these allows us to create many shades of gray. For *colorful* ways we have a lot more methods to choose from. Each of them breaks it down to three @@ -121,15 +121,15 @@ added. There are, however, many other color systems each with their own advantages: - RGB is the most common as our eyes use something similar, however keep in mind that OpenCV standard display - system composes colors using the BGR color space (a switch of the red and blue channel). + system composes colors using the BGR color space (red and blue channels are swapped places). - The HSV and HLS decompose colors into their hue, saturation and value/luminance components, which is a more natural way for us to describe colors. You might, for example, dismiss the last component, making your algorithm less sensible to the light conditions of the input image. - YCrCb is used by the popular JPEG image format. 
-- CIE L\*a\*b\* is a perceptually uniform color space, which comes handy if you need to measure +- CIE L\*a\*b\* is a perceptually uniform color space, which comes in handy if you need to measure the *distance* of a given color to another color. -Each of the building components has their own valid domains. This leads to the data type used. How +Each of the building components has its own valid domains. This leads to the data type used. How we store a component defines the control we have over its domain. The smallest data type possible is *char*, which means one byte or 8 bits. This may be unsigned (so can store values from 0 to 255) or signed (values from -127 to +127). Although in case of three components this already gives 16 @@ -165,8 +165,8 @@ object in multiple ways: CV_[The number of bits per item][Signed or Unsigned][Type Prefix]C[The channel number] @endcode For instance, *CV_8UC3* means we use unsigned char types that are 8 bit long and each pixel has - three of these to form the three channels. This are predefined for up to four channel numbers. The - @ref cv::Scalar is four element short vector. Specify this and you can initialize all matrix + three of these to form the three channels. There are types predefined for up to four channels. The + @ref cv::Scalar is four element short vector. Specify it and you can initialize all matrix points with a custom value. If you need more you can create the type with the upper macro, setting the channel number in parenthesis as you can see below. @@ -210,7 +210,7 @@ object in multiple ways: @note You can fill out a matrix with random values using the @ref cv::randu() function. You need to - give the lower and upper value for the random values: + give a lower and upper limit for the random values: @snippet mat_the_basic_image_container.cpp random diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp index 118599f8f9..36d312154f 100644 --- a/modules/core/include/opencv2/core/saturate.hpp +++ b/modules/core/include/opencv2/core/saturate.hpp @@ -74,8 +74,6 @@ namespace cv the floating-point value is first rounded to the nearest integer and then clipped if needed (when the target type is 8- or 16-bit). - This operation is used in the simplest or most complex image processing functions in OpenCV. - @param v Function parameter. 
@sa add, subtract, multiply, divide, Mat::convertTo */ From 79f792ad0585a94c178eb1b36b0a49360320f7be Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 31 Oct 2019 15:10:42 +0300 Subject: [PATCH 03/13] ts: do not block reporting of launched "DISABLED_" tests If tests are run through GTest option `--gtest_also_run_disabled_tests` --- modules/ts/misc/testlog_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/ts/misc/testlog_parser.py b/modules/ts/misc/testlog_parser.py index 6152f991af..2e9718be3e 100755 --- a/modules/ts/misc/testlog_parser.py +++ b/modules/ts/misc/testlog_parser.py @@ -30,7 +30,8 @@ class TestInfo(object): self.status = xmlnode.getAttribute("status") if self.name.startswith("DISABLED_"): - self.status = "disabled" + if self.status == 'notrun': + self.status = "disabled" self.fixture = self.fixture.replace("DISABLED_", "") self.name = self.name.replace("DISABLED_", "") self.properties = { From edc5518f6869fd99bc38f59ae78e9cd4a5a606d2 Mon Sep 17 00:00:00 2001 From: Dizhenin Vlad <39303687+SimpleVlad@users.noreply.github.com> Date: Thu, 31 Oct 2019 22:09:33 +0300 Subject: [PATCH 04/13] Merge pull request #15608 from SimpleVlad:3.4 * Add flags for build js * Add poi.json * Rebase whitelist into JSON file * Rework generator of white_list * Fix small typos * Transfer opencv_js.josn in opencv_js.config.py * Edit OPENCV_JS_WHITELIST * Write comment * Add description * Fix typos in desc * flag's append deleeted * Fix whitespace * variable deleted * fix comment on lines 229 and 235 --- modules/js/src/embindgen.py | 56 ++------------------------------ platforms/js/build_js.py | 14 ++++++++ platforms/js/opencv_js.config.py | 52 +++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 53 deletions(-) create mode 100644 platforms/js/opencv_js.config.py diff --git a/modules/js/src/embindgen.py b/modules/js/src/embindgen.py index ea5a939903..0ec4488946 100644 --- a/modules/js/src/embindgen.py +++ b/modules/js/src/embindgen.py @@ -93,58 +93,6 @@ ignore_list = ['locate', #int& 'meanShift' #Rect& ] -# Classes and methods whitelist -core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bitwise_or', 'bitwise_xor', 'cartToPolar',\ - 'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \ - 'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \ - 'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \ - 'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \ - 'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'], - 'Algorithm': []} - -imgproc = {'': ['Canny', 'GaussianBlur', 'Laplacian', 'HoughLines', 'HoughLinesP', 'HoughCircles', 'Scharr','Sobel', \ - 'adaptiveThreshold','approxPolyDP','arcLength','bilateralFilter','blur','boundingRect','boxFilter',\ - 'calcBackProject','calcHist','circle','compareHist','connectedComponents','connectedComponentsWithStats', \ - 'contourArea', 'convexHull', 'convexityDefects', 'cornerHarris','cornerMinEigenVal','createCLAHE', \ - 'createLineSegmentDetector','cvtColor','demosaicing','dilate', 'distanceTransform','distanceTransformWithLabels', \ - 'drawContours','ellipse','ellipse2Poly','equalizeHist','erode', 'filter2D', 'findContours','fitEllipse', \ - 'fitLine', 'floodFill','getAffineTransform', 'getPerspectiveTransform', 
'getRotationMatrix2D', 'getStructuringElement', \ - 'goodFeaturesToTrack','grabCut','initUndistortRectifyMap', 'integral','integral2', 'isContourConvex', 'line', \ - 'matchShapes', 'matchTemplate','medianBlur', 'minAreaRect', 'minEnclosingCircle', 'moments', 'morphologyEx', \ - 'pointPolygonTest', 'putText','pyrDown','pyrUp','rectangle','remap', 'resize','sepFilter2D','threshold', \ - 'undistort','warpAffine','warpPerspective','warpPolar','watershed', \ - 'fillPoly', 'fillConvexPoly'], - 'CLAHE': ['apply', 'collectGarbage', 'getClipLimit', 'getTilesGridSize', 'setClipLimit', 'setTilesGridSize']} - -objdetect = {'': ['groupRectangles'], - 'HOGDescriptor': ['load', 'HOGDescriptor', 'getDefaultPeopleDetector', 'getDaimlerPeopleDetector', 'setSVMDetector', 'detectMultiScale'], - 'CascadeClassifier': ['load', 'detectMultiScale2', 'CascadeClassifier', 'detectMultiScale3', 'empty', 'detectMultiScale']} - -video = {'': ['CamShift', 'calcOpticalFlowFarneback', 'calcOpticalFlowPyrLK', 'createBackgroundSubtractorMOG2', \ - 'findTransformECC', 'meanShift'], - 'BackgroundSubtractorMOG2': ['BackgroundSubtractorMOG2', 'apply'], - 'BackgroundSubtractor': ['apply', 'getBackgroundImage']} - -dnn = {'dnn_Net': ['setInput', 'forward'], - '': ['readNetFromCaffe', 'readNetFromTensorflow', 'readNetFromTorch', 'readNetFromDarknet', - 'readNetFromONNX', 'readNet', 'blobFromImage']} - -features2d = {'Feature2D': ['detect', 'compute', 'detectAndCompute', 'descriptorSize', 'descriptorType', 'defaultNorm', 'empty', 'getDefaultName'], - 'BRISK': ['create', 'getDefaultName'], - 'ORB': ['create', 'setMaxFeatures', 'setScaleFactor', 'setNLevels', 'setEdgeThreshold', 'setFirstLevel', 'setWTA_K', 'setScoreType', 'setPatchSize', 'getFastThreshold', 'getDefaultName'], - 'MSER': ['create', 'detectRegions', 'setDelta', 'getDelta', 'setMinArea', 'getMinArea', 'setMaxArea', 'getMaxArea', 'setPass2Only', 'getPass2Only', 'getDefaultName'], - 'FastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'], - 'AgastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'], - 'GFTTDetector': ['create', 'setMaxFeatures', 'getMaxFeatures', 'setQualityLevel', 'getQualityLevel', 'setMinDistance', 'getMinDistance', 'setBlockSize', 'getBlockSize', 'setHarrisDetector', 'getHarrisDetector', 'setK', 'getK', 'getDefaultName'], - # 'SimpleBlobDetector': ['create'], - 'KAZE': ['create', 'setExtended', 'getExtended', 'setUpright', 'getUpright', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'], - 'AKAZE': ['create', 'setDescriptorType', 'getDescriptorType', 'setDescriptorSize', 'getDescriptorSize', 'setDescriptorChannels', 'getDescriptorChannels', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'], - 'DescriptorMatcher': ['add', 'clear', 'empty', 'isMaskSupported', 'train', 'match', 'knnMatch', 'radiusMatch', 'clone', 'create'], - 'BFMatcher': ['isMaskSupported', 'create'], - '': ['drawKeypoints', 'drawMatches', 'drawMatchesKnn']} - -calib3d = {'': ['findHomography', 'estimateAffine2D', 'Rodrigues']} - def makeWhiteList(module_list): wl = {} for m in module_list: @@ -155,7 +103,9 @@ def makeWhiteList(module_list): wl[k] = m[k] return wl -white_list = 
makeWhiteList([core, imgproc, objdetect, video, dnn, features2d, calib3d]) +white_list = None +exec(open(os.environ["OPENCV_JS_WHITELIST"]).read()) +assert(white_list) # Features to be exported export_enums = False diff --git a/platforms/js/build_js.py b/platforms/js/build_js.py index 3a4612a89a..fbeb1e4fb3 100644 --- a/platforms/js/build_js.py +++ b/platforms/js/build_js.py @@ -138,6 +138,8 @@ class Builder: "-DBUILD_PACKAGE=OFF", "-DBUILD_TESTS=OFF", "-DBUILD_PERF_TESTS=OFF"] + if self.options.cmake_option: + cmd += self.options.cmake_option if self.options.build_doc: cmd.append("-DBUILD_DOCS=ON") else: @@ -178,6 +180,8 @@ class Builder: flags += "-s DISABLE_EXCEPTION_CATCHING=0 " if self.options.simd: flags += "-msimd128 " + if self.options.build_flags: + flags += self.options.build_flags return flags def config(self): @@ -221,12 +225,22 @@ if __name__ == "__main__": parser.add_argument('--skip_config', action="store_true", help="Skip cmake config") parser.add_argument('--config_only', action="store_true", help="Only do cmake config") parser.add_argument('--enable_exception', action="store_true", help="Enable exception handling") + # Use flag --cmake option="-D...=ON" only for one argument, if you would add more changes write new cmake_option flags + parser.add_argument('--cmake_option', action='append', help="Append CMake options") + # Use flag --build_flags="-s USE_PTHREADS=0 -Os" for one and more arguments as in the example + parser.add_argument('--build_flags', help="Append Emscripten build options") parser.add_argument('--build_wasm_intrin_test', default=False, action="store_true", help="Build WASM intrin tests") + # Write a path to modify file like argument of this flag + parser.add_argument('--config', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'opencv_js.config.py'), + help="Specify configuration file with own list of exported into JS functions") + args = parser.parse_args() log.basicConfig(format='%(message)s', level=log.DEBUG) log.debug("Args: %s", args) + os.environ["OPENCV_JS_WHITELIST"] = args.config + if args.emscripten_dir is None: log.info("Cannot get Emscripten path, please specify it either by EMSCRIPTEN environment variable or --emscripten_dir option.") sys.exit(-1) diff --git a/platforms/js/opencv_js.config.py b/platforms/js/opencv_js.config.py new file mode 100644 index 0000000000..bcbdcfdf96 --- /dev/null +++ b/platforms/js/opencv_js.config.py @@ -0,0 +1,52 @@ +core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bitwise_or', 'bitwise_xor', 'cartToPolar',\ + 'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \ + 'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \ + 'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \ + 'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \ + 'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'], + 'Algorithm': []} + +imgproc = {'': ['Canny', 'GaussianBlur', 'Laplacian', 'HoughLines', 'HoughLinesP', 'HoughCircles', 'Scharr','Sobel', \ + 'adaptiveThreshold','approxPolyDP','arcLength','bilateralFilter','blur','boundingRect','boxFilter',\ + 'calcBackProject','calcHist','circle','compareHist','connectedComponents','connectedComponentsWithStats', \ + 'contourArea', 'convexHull', 'convexityDefects', 
'cornerHarris','cornerMinEigenVal','createCLAHE', \ + 'createLineSegmentDetector','cvtColor','demosaicing','dilate', 'distanceTransform','distanceTransformWithLabels', \ + 'drawContours','ellipse','ellipse2Poly','equalizeHist','erode', 'filter2D', 'findContours','fitEllipse', \ + 'fitLine', 'floodFill','getAffineTransform', 'getPerspectiveTransform', 'getRotationMatrix2D', 'getStructuringElement', \ + 'goodFeaturesToTrack','grabCut','initUndistortRectifyMap', 'integral','integral2', 'isContourConvex', 'line', \ + 'matchShapes', 'matchTemplate','medianBlur', 'minAreaRect', 'minEnclosingCircle', 'moments', 'morphologyEx', \ + 'pointPolygonTest', 'putText','pyrDown','pyrUp','rectangle','remap', 'resize','sepFilter2D','threshold', \ + 'undistort','warpAffine','warpPerspective','warpPolar','watershed', \ + 'fillPoly', 'fillConvexPoly'], + 'CLAHE': ['apply', 'collectGarbage', 'getClipLimit', 'getTilesGridSize', 'setClipLimit', 'setTilesGridSize']} + +objdetect = {'': ['groupRectangles'], + 'HOGDescriptor': ['load', 'HOGDescriptor', 'getDefaultPeopleDetector', 'getDaimlerPeopleDetector', 'setSVMDetector', 'detectMultiScale'], + 'CascadeClassifier': ['load', 'detectMultiScale2', 'CascadeClassifier', 'detectMultiScale3', 'empty', 'detectMultiScale']} + +video = {'': ['CamShift', 'calcOpticalFlowFarneback', 'calcOpticalFlowPyrLK', 'createBackgroundSubtractorMOG2', \ + 'findTransformECC', 'meanShift'], + 'BackgroundSubtractorMOG2': ['BackgroundSubtractorMOG2', 'apply'], + 'BackgroundSubtractor': ['apply', 'getBackgroundImage']} + +dnn = {'dnn_Net': ['setInput', 'forward'], + '': ['readNetFromCaffe', 'readNetFromTensorflow', 'readNetFromTorch', 'readNetFromDarknet', + 'readNetFromONNX', 'readNet', 'blobFromImage']} + +features2d = {'Feature2D': ['detect', 'compute', 'detectAndCompute', 'descriptorSize', 'descriptorType', 'defaultNorm', 'empty', 'getDefaultName'], + 'BRISK': ['create', 'getDefaultName'], + 'ORB': ['create', 'setMaxFeatures', 'setScaleFactor', 'setNLevels', 'setEdgeThreshold', 'setFirstLevel', 'setWTA_K', 'setScoreType', 'setPatchSize', 'getFastThreshold', 'getDefaultName'], + 'MSER': ['create', 'detectRegions', 'setDelta', 'getDelta', 'setMinArea', 'getMinArea', 'setMaxArea', 'getMaxArea', 'setPass2Only', 'getPass2Only', 'getDefaultName'], + 'FastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'], + 'AgastFeatureDetector': ['create', 'setThreshold', 'getThreshold', 'setNonmaxSuppression', 'getNonmaxSuppression', 'setType', 'getType', 'getDefaultName'], + 'GFTTDetector': ['create', 'setMaxFeatures', 'getMaxFeatures', 'setQualityLevel', 'getQualityLevel', 'setMinDistance', 'getMinDistance', 'setBlockSize', 'getBlockSize', 'setHarrisDetector', 'getHarrisDetector', 'setK', 'getK', 'getDefaultName'], + # 'SimpleBlobDetector': ['create'], + 'KAZE': ['create', 'setExtended', 'getExtended', 'setUpright', 'getUpright', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'], + 'AKAZE': ['create', 'setDescriptorType', 'getDescriptorType', 'setDescriptorSize', 'getDescriptorSize', 'setDescriptorChannels', 'getDescriptorChannels', 'setThreshold', 'getThreshold', 'setNOctaves', 'getNOctaves', 'setNOctaveLayers', 'getNOctaveLayers', 'setDiffusivity', 'getDiffusivity', 'getDefaultName'], + 'DescriptorMatcher': ['add', 'clear', 'empty', 'isMaskSupported', 'train', 'match', 'knnMatch', 'radiusMatch', 
'clone', 'create'], + 'BFMatcher': ['isMaskSupported', 'create'], + '': ['drawKeypoints', 'drawMatches', 'drawMatchesKnn']} + +calib3d = {'': ['findHomography', 'estimateAffine2D', 'Rodrigues']} + +white_list = makeWhiteList([core, imgproc, objdetect, video, dnn, features2d, calib3d]) \ No newline at end of file From af433d0352f53af79affe0a083f5522b719e7730 Mon Sep 17 00:00:00 2001 From: Oleg Alexandrov Date: Thu, 31 Oct 2019 12:28:01 -0700 Subject: [PATCH 05/13] Merge pull request #15780 from oleg-alexandrov:master * Doc bugfix The documentation page StereoBinaryBM and StereoBinarySGBM says that it returns a disparity that is scaled multiplied by 16. This scaling must be undone before calling reprojectImageTo3D, otherwise the results are wrong. The function reprojectImageTo3D() could do this scaling internally, maybe, but at least the documentation must explain that this has to be done. * calib3d: update reprojectImageTo3D documentation * calib3d: add StereoBM/StereoSGBM into notes list --- modules/calib3d/include/opencv2/calib3d.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index 4ae44f7ec2..8b2d993ad1 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -2206,8 +2206,11 @@ CV_EXPORTS_W void validateDisparity( InputOutputArray disparity, InputArray cost /** @brief Reprojects a disparity image to 3D space. @param disparity Input single-channel 8-bit unsigned, 16-bit signed, 32-bit signed or 32-bit -floating-point disparity image. If 16-bit signed format is used, the values are assumed to have no -fractional bits. +floating-point disparity image. +The values of 8-bit / 16-bit signed formats are assumed to have no fractional bits. +If the disparity is 16-bit signed format as computed by +StereoBM/StereoSGBM/StereoBinaryBM/StereoBinarySGBM and may be other algorithms, +it should be divided by 16 (and scaled to float) before being used here. @param _3dImage Output 3-channel floating-point image of the same size as disparity . Each element of _3dImage(x,y) contains 3D coordinates of the point (x,y) computed from the disparity map. 
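Before the next patch, it may help to see the calling pattern the new wording describes. A minimal sketch (illustrative only; rectified input images and the Q matrix from cv::stereoRectify are assumed to be available, and the variable names are not part of the patch):

@code{.cpp}
cv::Ptr<cv::StereoSGBM> sgbm = cv::StereoSGBM::create(0, 128, 3);
cv::Mat disp, disp32f, xyz;
sgbm->compute(leftRectified, rightRectified, disp); // CV_16S, disparity values scaled by 16
disp.convertTo(disp32f, CV_32F, 1.0 / 16.0);        // undo the 4 fractional bits first
cv::reprojectImageTo3D(disp32f, xyz, Q, true);      // 3D points now use the true disparity
@endcode

The next patch applies exactly this convertTo step to the samples/cpp/stereo_match.cpp sample.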
From c2f2ea6b853fd0d1c43ad4f7691a757c638884ce Mon Sep 17 00:00:00 2001
From: CJ Smith
Date: Thu, 31 Oct 2019 15:29:04 -0400
Subject: [PATCH 06/13] Merge pull request #15789 from CJSmith-0141:15779-scale-bug-in-stereo-match-sample

* Changes disparity image to float representation

Signed-off-by: Connor James Smith

* samples: update disparity multiplier handling in stereo_match.cpp
---
 samples/cpp/stereo_match.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/samples/cpp/stereo_match.cpp b/samples/cpp/stereo_match.cpp
index 166a45086c..9194aa4943 100644
--- a/samples/cpp/stereo_match.cpp
+++ b/samples/cpp/stereo_match.cpp
@@ -247,10 +247,19 @@ int main(int argc, char** argv)
         //copyMakeBorder(img2, img2p, 0, 0, numberOfDisparities, 0, IPL_BORDER_REPLICATE);
     int64 t = getTickCount();
+    float disparity_multiplier = 1.0f;
     if( alg == STEREO_BM )
+    {
         bm->compute(img1, img2, disp);
+        if (disp.type() == CV_16S)
+            disparity_multiplier = 16.0f;
+    }
     else if( alg == STEREO_SGBM || alg == STEREO_HH || alg == STEREO_3WAY )
+    {
         sgbm->compute(img1, img2, disp);
+        if (disp.type() == CV_16S)
+            disparity_multiplier = 16.0f;
+    }
     t = getTickCount() - t;
     printf("Time elapsed: %fms\n", t*1000/getTickFrequency());
@@ -281,7 +290,9 @@ int main(int argc, char** argv)
         printf("storing the point cloud...");
         fflush(stdout);
         Mat xyz;
-        reprojectImageTo3D(disp, xyz, Q, true);
+        Mat floatDisp;
+        disp.convertTo(floatDisp, CV_32F, 1.0f / disparity_multiplier);
+        reprojectImageTo3D(floatDisp, xyz, Q, true);
         saveXYZ(point_cloud_filename.c_str(), xyz);
         printf("\n");
     }

From d2e02779c41253e1fac38904953e2ffe2a6761f5 Mon Sep 17 00:00:00 2001
From: Ciprian Alexandru Pitis <57091999+Cpitis@users.noreply.github.com>
Date: Thu, 31 Oct 2019 21:38:49 +0100
Subject: [PATCH 07/13] Merge pull request #15799 from Cpitis:feature/parallelization

Parallelize pyrDown & calcSharrDeriv

* ::pyrDown has been parallelized
* CalcSharrDeriv parallelized
* Fixed whitespace
* Set granularity based on number of threads enabled
* Granularity changed to cv::getNumThreads, so each thread should receive 1/n-sized stripes
* imgproc: move PyrDownInvoker::operator() implementation
* imgproc(pyramid): remove syloopboundary()
* video: SharrDerivInvoker replace 'Mat*' => 'Mat&' fields
---
 modules/imgproc/src/pyramids.cpp | 77 ++++++++++++++++++++++++--------
 modules/video/src/lkpyramid.cpp | 17 ++++---
 modules/video/src/lkpyramid.hpp | 12 +++++
 3 files changed, 83 insertions(+), 23 deletions(-)

diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index 6fb61bf6cd..ec4427f219 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -719,29 +719,45 @@ template <> int PyrUpVecV<float, float>(float** src, float** dst, int width)
 #endif
+template <class CastOp, class VecOp>
+struct PyrDownInvoker : ParallelLoopBody
+{
+    PyrDownInvoker(const Mat& src, const Mat& dst, int borderType, int **tabR, int **tabM, int **tabL)
+    {
+        _src = &src;
+        _dst = &dst;
+        _borderType = borderType;
+        _tabR = tabR;
+        _tabM = tabM;
+        _tabL = tabL;
+    }
+
+    void operator()(const Range& range) const CV_OVERRIDE;
+
+    int **_tabR;
+    int **_tabM;
+    int **_tabL;
+    const Mat *_src;
+    const Mat *_dst;
+    int _borderType;
+};
+
 template<class CastOp, class VecOp> void
 pyrDown_( const Mat& _src, Mat& _dst, int borderType )
 {
     const int PD_SZ = 5;
-    typedef typename CastOp::type1 WT;
-    typedef typename CastOp::rtype T;
-
     CV_Assert( !_src.empty() );
     Size ssize = _src.size(), dsize = _dst.size();
     int cn = _src.channels();
-    int bufstep = (int)alignSize(dsize.width*cn, 16);
-    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
-    WT* buf = alignPtr((WT*)_buf.data(), 16);
+    int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
     AutoBuffer<int> _tabM(dsize.width*cn);
     int* tabM = _tabM.data();
-    WT* rows[PD_SZ];
-    CastOp castOp;
     CV_Assert( ssize.width > 0 && ssize.height > 0 &&
                std::abs(dsize.width*2 - ssize.width) <= 2 &&
                std::abs(dsize.height*2 - ssize.height) <= 2 );
-    int sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
+    int width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
     for (int x = 0; x <= PD_SZ+1; x++)
     {
@@ -754,27 +770,51 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
         }
     }
+    for (int x = 0; x < dsize.width*cn; x++)
+        tabM[x] = (x/cn)*2*cn + x % cn;
+
+    int *tabLPtr = tabL;
+    int *tabRPtr = tabR;
+
+    cv::parallel_for_(Range(0,dsize.height), cv::PyrDownInvoker<CastOp, VecOp>(_src, _dst, borderType, &tabRPtr, &tabM, &tabLPtr), cv::getNumThreads());
+}
+
+template <class CastOp, class VecOp>
+void PyrDownInvoker<CastOp, VecOp>::operator()(const Range& range) const
+{
+    const int PD_SZ = 5;
+    typedef typename CastOp::type1 WT;
+    typedef typename CastOp::rtype T;
+    Size ssize = _src->size(), dsize = _dst->size();
+    int cn = _src->channels();
+    int bufstep = (int)alignSize(dsize.width*cn, 16);
+    AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
+    WT* buf = alignPtr((WT*)_buf.data(), 16);
+    WT* rows[PD_SZ];
+    CastOp castOp;
+
+    int sy0 = -PD_SZ/2, sy = range.start * 2 + sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
+
     ssize.width *= cn;
     dsize.width *= cn;
     width0 *= cn;
-    for (int x = 0; x < dsize.width; x++)
-        tabM[x] = (x/cn)*2*cn + x % cn;
-
-    for (int y = 0; y < dsize.height; y++)
+    for (int y = range.start; y < range.end; y++)
     {
-        T* dst = _dst.ptr<T>(y);
+        T* dst = (T*)_dst->ptr(y);
         WT *row0, *row1, *row2, *row3, *row4;
         // fill the ring buffer (horizontal convolution and decimation)
-        for( ; sy <= y*2 + 2; sy++ )
+        int sy_limit = y*2 + 2;
+        for( ; sy <= sy_limit; sy++ )
         {
             WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
-            int _sy = borderInterpolate(sy, ssize.height, borderType);
-            const T* src = _src.ptr<T>(_sy);
+            int _sy = borderInterpolate(sy, ssize.height, _borderType);
+            const T* src = _src->ptr<T>(_sy);
             do
             {
                 int x = 0;
+                const int* tabL = *_tabL;
                 for( ; x < cn; x++ )
                 {
                     row[x] = src[tabL[x+cn*2]]*6 + (src[tabL[x+cn]] + src[tabL[x+cn*3]])*4 +
@@ -832,13 +872,14 @@ pyrDown_( const Mat& _src, Mat& _dst, int borderType )
             {
                 for( ; x < width0; x++ )
                 {
-                    int sx = tabM[x];
+                    int sx = (*_tabM)[x];
                     row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
                              src[sx - cn*2] + src[sx + cn*2];
                 }
             }
             // tabR
+            const int* tabR = *_tabR;
             for (int x_ = 0; x < dsize.width; x++, x_++)
             {
                 row[x] = src[tabR[x_+cn*2]]*6 + (src[tabR[x_+cn]] + src[tabR[x_+cn*3]])*4 +
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 3e81f3be58..22c6874f59 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -56,9 +56,18 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
 {
     using namespace cv;
     using cv::detail::deriv_type;
-    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
+    int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth();
     CV_Assert(depth == CV_8U);
     dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));
+    parallel_for_(Range(0, rows), cv::detail::SharrDerivInvoker(src, dst), cv::getNumThreads());
+}
+
+}//namespace
+
+void cv::detail::SharrDerivInvoker::operator()(const Range& range) const
+{
+    using cv::detail::deriv_type;
+    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn;
 #ifdef HAVE_TEGRA_OPTIMIZATION
     if (tegra::useTegra() && tegra::calcSharrDeriv(src, dst))
@@ -73,12 +82,12 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
     v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
 #endif
-    for( y = 0; y < rows; y++ )
+    for( y = range.start; y < range.end; y++ )
     {
         const uchar* srow0 = src.ptr(y > 0 ? y-1 : rows > 1 ? 1 : 0);
         const uchar* srow1 = src.ptr(y);
         const uchar* srow2 = src.ptr(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
-        deriv_type* drow = dst.ptr<deriv_type>(y);
+        deriv_type* drow = (deriv_type *)dst.ptr(y);
         // do vertical convolution
         x = 0;
@@ -143,8 +152,6 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
     }
 }
-}//namespace
-
 cv::detail::LKTrackerInvoker::LKTrackerInvoker(
                       const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                       const Point2f* _prevPts, Point2f* _nextPts,
diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp
index 9e62d06b81..16b0da189e 100644
--- a/modules/video/src/lkpyramid.hpp
+++ b/modules/video/src/lkpyramid.hpp
@@ -7,6 +7,18 @@ namespace detail
     typedef short deriv_type;
+    struct SharrDerivInvoker : ParallelLoopBody
+    {
+        SharrDerivInvoker(const Mat& _src, const Mat& _dst)
+            : src(_src), dst(_dst)
+        { }
+
+        void operator()(const Range& range) const CV_OVERRIDE;
+
+        const Mat& src;
+        const Mat& dst;
+    };
+
     struct LKTrackerInvoker : ParallelLoopBody
     {
         LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,

From ed7e4273cdc207f3f67ffa2692b728cb10de21b5 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com>
Date: Fri, 1 Nov 2019 15:30:48 -0400
Subject: [PATCH 08/13] Merge pull request #15555 from ChipKerchner:flipVectorize

* Vectorize flipHoriz and flipVert functions.
* Change v_load_mirror_1 to use vec_revb for VSX
* Only use vec_revb in ISA3.0
* Removing vec_revb code since some of the older compilers don't fully support it.
* Use new v_reverse intrinsic and clean up code.
* Ensure there are no alignment issues with copies
---
 modules/core/src/copy.cpp | 217 +++++++++++++++++++++++++++++++++++---
 1 file changed, 204 insertions(+), 13 deletions(-)

diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index c1478de763..3f68a2555a 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -563,25 +563,206 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
+#if CV_SIMD128
+template <typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    typedef typename V::lane_type T;
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+    int width_1 = width & -v_uint8x16::nlanes;
+    int i, j;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+        {
+            V t0, t1;
+
+            t0 = v_load((T*)((uchar*)src + i));
+            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
+            t0 = v_reverse(t0);
+            t1 = v_reverse(t1);
+            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
+            v_store((T*)(dst + i), t1);
+        }
+        if (((size_t)src|(size_t)dst) % sizeof(T) == 0)
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                T t0, t1;
+
+                t0 = *((T*)((uchar*)src + i));
+                t1 = *((T*)((uchar*)src + j - sizeof(T)));
+                *((T*)(dst + j - sizeof(T))) = t0;
+                *((T*)(dst + i)) = t1;
+            }
+        }
+        else
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                for (int k = 0; k < (int)sizeof(T); k++)
+                {
+                    uchar t0, t1;
+
+                    t0 = *((uchar*)src + i + k);
+                    t1 = *((uchar*)src + j + k - sizeof(T));
+                    *(dst + j + k - sizeof(T)) = t0;
+                    *(dst + i + k) = t1;
+                }
+            }
+        }
+    }
+}
+
+template <typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    int end = (int)(size.width*esz);
+    int width = (end + 1)/2;
+
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
+        {
+            T1 t0, t1;
+            T2 t2, t3;
+
+            t0 = *((T1*)((uchar*)src + i));
+            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
+            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
+            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
+            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
+            *((T2*)(dst + j - sizeof(T2))) = t2;
+            *((T1*)(dst + i)) = t1;
+            *((T2*)(dst + i + sizeof(T1))) = t3;
+        }
+    }
+}
+#endif

 static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-    int i, j, limit = (int)(((size.width + 1)/2)*esz);
-    AutoBuffer<int> _tab(size.width*esz);
-    int* tab = _tab.data();
-
-    for( i = 0; i < size.width; i++ )
-        for( size_t k = 0; k < esz; k++ )
-            tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
-
-    for( ; size.height--; src += sstep, dst += dstep )
+#if CV_SIMD
+    if (esz == 2 * v_uint8x16::nlanes)
     {
-        for( i = 0; i < limit; i++ )
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
         {
-            j = tab[i];
-            uchar t0 = src[i], t1 = src[j];
-            dst[i] = t1; dst[j] = t0;
+            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            {
+#if CV_SIMD256
+                v_uint8x32 t0, t1;
+
+                t0 = v256_load((uchar*)src + i);
+                t1 = v256_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+#else
+                v_uint8x16 t0, t1, t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t2 = v_load((uchar*)src + j);
+                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                v_store(dst + j, t0);
+                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + i, t2);
+                v_store(dst + i + v_uint8x16::nlanes, t3);
+#endif
+            }
+        }
+    }
+    else if (esz == v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            {
+                v_uint8x16 t0, t1;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+            }
+        }
+    }
+    else if (esz == 8)
+    {
+        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 4)
+    {
+        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 2)
+    {
+        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 1)
+    {
+        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 24)
+    {
+        int end = (int)(size.width*esz);
+        int width = (end + 1)/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            {
+                v_uint8x16 t0, t1;
+                uint64_t t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
+                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
+                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
+                v_store(dst + i, t1);
+                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+            }
+        }
+    }
+    else if (esz == 12)
+    {
+        flipHoriz_double<uint64_t, uint32_t>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 6)
+    {
+        flipHoriz_double<uint32_t, uint16_t>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 3)
+    {
+        flipHoriz_double<uint16_t, uchar>(src, sstep, dst, dstep, size, esz);
+    }
+    else
+#endif
+    {
+        int i, j, limit = (int)(((size.width + 1)/2)*esz);
+        AutoBuffer<int> _tab(size.width*esz);
+        int* tab = _tab.data();
+
+        for( i = 0; i < size.width; i++ )
+            for( size_t k = 0; k < esz; k++ )
+                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( i = 0; i < limit; i++ )
+            {
+                j = tab[i];
+                uchar t0 = src[i], t1 = src[j];
+                dst[i] = t1; dst[j] = t0;
+            }
         }
     }
 }
@@ -597,6 +778,16 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
           dst0 += dstep, dst1 -= dstep )
     {
         int i = 0;
+#if CV_SIMD
+        for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
+        {
+            v_int32 t0 = vx_load((int*)(src0 + i));
+            v_int32 t1 = vx_load((int*)(src1 + i));
+            vx_store((int*)(dst0 + i), t1);
+            vx_store((int*)(dst1 + i), t0);
+        }
+#endif
+
         if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
         {
             for( ; i <= size.width - 16; i += 16 )

From 4e156a162f6cfd3ed9243d4bdc4328b4fa13a023 Mon Sep 17 00:00:00 2001
From: yuriyluxriot
Date: Fri, 1 Nov 2019 21:33:12 +0200
Subject: [PATCH 09/13] Merge pull request #15812 from yuriyluxriot:fls_replaces_tls

* Use FlsAlloc/FlsFree/FlsGetValue/FlsSetValue instead of TlsAlloc/TlsFree/TlsGetValue/TlsSetValue to implement TLS value cleanup when a thread has been terminated on Windows Vista and above
* Fix 32-bit build
* Fixed calling convention of cleanup callback
* WINAPI changed to NTAPI
* Use proper guard macro
---
 modules/core/src/system.cpp | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index b39173de0d..47cb63ee87 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -131,6 +131,10 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 #if (_WIN32_WINNT >= 0x0602)
   #include <synchapi.h>
 #endif
+#if ((_WIN32_WINNT >= 0x0600) && !defined(CV_DISABLE_FLS)) || defined(CV_FORCE_FLS)
+  #include <fibersapi.h>
+  #define CV_USE_FLS
+#endif
 #undef small
 #undef min
 #undef max
@@ -142,7 +146,7 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 #ifndef __cplusplus_winrt
 #include <windows.storage.h>
 #pragma comment(lib, "runtimeobject.lib")
-#endif
+#endif // WINRT

 std::wstring GetTempPathWinRT()
 {
@@ -1422,24 +1426,43 @@ void TlsAbstraction::SetData(void *pData)
     tlsData = pData;
 }
 #else //WINRT
+#ifdef CV_USE_FLS
+static void NTAPI opencv_fls_destructor(void* pData);
+#endif // CV_USE_FLS
 TlsAbstraction::TlsAbstraction()
 {
+#ifndef CV_USE_FLS
     tlsKey = TlsAlloc();
+#else // CV_USE_FLS
+    tlsKey = FlsAlloc(opencv_fls_destructor);
+#endif // CV_USE_FLS
     CV_Assert(tlsKey != TLS_OUT_OF_INDEXES);
 }
 TlsAbstraction::~TlsAbstraction()
 {
+#ifndef CV_USE_FLS
     TlsFree(tlsKey);
+#else // CV_USE_FLS
+    FlsFree(tlsKey);
+#endif // CV_USE_FLS
 }
 void* TlsAbstraction::GetData() const
 {
+#ifndef CV_USE_FLS
     return TlsGetValue(tlsKey);
+#else // CV_USE_FLS
+    return FlsGetValue(tlsKey);
+#endif // CV_USE_FLS
 }
 void TlsAbstraction::SetData(void *pData)
 {
+#ifndef CV_USE_FLS
     CV_Assert(TlsSetValue(tlsKey, pData) == TRUE);
+#else // CV_USE_FLS
+    CV_Assert(FlsSetValue(tlsKey, pData) == TRUE);
+#endif // CV_USE_FLS
 }
-#endif
+#endif // WINRT
 #else // _WIN32
 static void opencv_tls_destructor(void* pData);
 TlsAbstraction::TlsAbstraction()
@@ -1674,7 +1697,14 @@ static void opencv_tls_destructor(void* pData)
 {
     getTlsStorage().releaseThread(pData);
 }
-#endif
+#else // _WIN32
+#ifdef CV_USE_FLS
+static void WINAPI opencv_fls_destructor(void* pData)
+{
+    getTlsStorage().releaseThread(pData);
+}
+#endif // CV_USE_FLS
+#endif // _WIN32
 } // namespace details
 using namespace details;

From d56535afce595e52823cb7d6223c1242bba8beff Mon Sep 17 00:00:00 2001
From: Oleg Alexandrov
Date: Fri, 1 Nov 2019 12:34:11 -0700
Subject: [PATCH 10/13] Merge pull request #15820 from oleg-alexandrov:patch-1

Clarify stereoRectify() doc

The function stereoRectify() takes as input a coordinate transform between two cameras.
It is ambiguous in which direction it goes. I clarified that it goes from the second
camera to the first.
---
 modules/calib3d/include/opencv2/calib3d.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 8b2d993ad1..c4ca492a4d 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -1543,8 +1543,8 @@ CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints,
 @param cameraMatrix2 Second camera matrix.
 @param distCoeffs2 Second camera distortion parameters.
 @param imageSize Size of the image used for stereo calibration.
-@param R Rotation matrix between the coordinate systems of the first and the second cameras.
-@param T Translation vector between coordinate systems of the cameras.
+@param R Rotation matrix from the coordinate system of the second camera to the first.
+@param T Translation vector from the coordinate system of the second camera to the first.
 @param R1 Output 3x3 rectification transform (rotation matrix) for the first camera.
@param R2 Output 3x3 rectification transform (rotation matrix) for the second camera. @param P1 Output 3x4 projection matrix in the new (rectified) coordinate systems for the first From e65b51ca3c7995d8837b29e556a77db85ef07f05 Mon Sep 17 00:00:00 2001 From: Gael Colas Date: Fri, 1 Nov 2019 12:37:34 -0700 Subject: [PATCH 11/13] Merge pull request #15821 from ColasGael:colasg-viz-color Fix wrong definition of viz::Color::navy() --- modules/viz/include/opencv2/viz/types.hpp | 76 ++++++++++++----------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/modules/viz/include/opencv2/viz/types.hpp b/modules/viz/include/opencv2/viz/types.hpp index 62068ac898..1a7bde292c 100644 --- a/modules/viz/include/opencv2/viz/types.hpp +++ b/modules/viz/include/opencv2/viz/types.hpp @@ -75,42 +75,42 @@ namespace cv static Color black(); static Color blue(); static Color green(); - static Color cyan(); - static Color red(); - static Color magenta(); + static Color cyan(); static Color yellow(); + static Color magenta(); static Color white(); static Color gray(); + static Color silver(); static Color mlab(); static Color navy(); - static Color olive(); static Color maroon(); static Color teal(); - static Color rose(); + static Color olive(); + static Color purple(); static Color azure(); + static Color chartreuse(); + static Color rose(); + static Color lime(); static Color gold(); - static Color brown(); static Color orange(); - static Color chartreuse(); static Color orange_red(); - static Color purple(); static Color indigo(); - static Color pink(); - static Color cherry(); - static Color bluberry(); - static Color raspberry(); - static Color silver(); - static Color violet(); + static Color brown(); static Color apricot(); - static Color turquoise(); - static Color celestial_blue(); + static Color pink(); + static Color raspberry(); + static Color cherry(); + static Color violet(); static Color amethyst(); + static Color bluberry(); + static Color celestial_blue(); + static Color turquoise(); static Color not_set(); }; @@ -343,42 +343,44 @@ inline cv::viz::Color::Color(const Scalar& color) : Scalar(color) {} inline cv::viz::Color::operator cv::Vec3b() const { return cv::Vec3d(val); } inline cv::viz::Color cv::viz::Color::black() { return Color( 0, 0, 0); } -inline cv::viz::Color cv::viz::Color::green() { return Color( 0, 255, 0); } inline cv::viz::Color cv::viz::Color::blue() { return Color(255, 0, 0); } -inline cv::viz::Color cv::viz::Color::cyan() { return Color(255, 255, 0); } +inline cv::viz::Color cv::viz::Color::green() { return Color( 0, 255, 0); } inline cv::viz::Color cv::viz::Color::red() { return Color( 0, 0, 255); } +inline cv::viz::Color cv::viz::Color::cyan() { return Color(255, 255, 0); } inline cv::viz::Color cv::viz::Color::yellow() { return Color( 0, 255, 255); } inline cv::viz::Color cv::viz::Color::magenta() { return Color(255, 0, 255); } inline cv::viz::Color cv::viz::Color::white() { return Color(255, 255, 255); } + inline cv::viz::Color cv::viz::Color::gray() { return Color(128, 128, 128); } +inline cv::viz::Color cv::viz::Color::silver() { return Color(192, 192, 192); } inline cv::viz::Color cv::viz::Color::mlab() { return Color(255, 128, 128); } -inline cv::viz::Color cv::viz::Color::navy() { return Color(0, 0, 128); } -inline cv::viz::Color cv::viz::Color::olive() { return Color(0, 128, 128); } -inline cv::viz::Color cv::viz::Color::maroon() { return Color(0, 0, 128); } +inline cv::viz::Color cv::viz::Color::navy() { return Color(128, 0, 0); } +inline cv::viz::Color 
cv::viz::Color::maroon() { return Color( 0, 0, 128); } inline cv::viz::Color cv::viz::Color::teal() { return Color(128, 128, 0); } -inline cv::viz::Color cv::viz::Color::rose() { return Color(128, 0, 255); } -inline cv::viz::Color cv::viz::Color::azure() { return Color(255, 128, 0); } -inline cv::viz::Color cv::viz::Color::lime() { return Color(0, 255, 191); } -inline cv::viz::Color cv::viz::Color::gold() { return Color(0, 215, 255); } -inline cv::viz::Color cv::viz::Color::brown() { return Color(42, 42, 165); } -inline cv::viz::Color cv::viz::Color::orange() { return Color(0, 165, 255); } -inline cv::viz::Color cv::viz::Color::chartreuse() { return Color(0, 255, 128); } -inline cv::viz::Color cv::viz::Color::orange_red() { return Color(0, 69, 255); } +inline cv::viz::Color cv::viz::Color::olive() { return Color( 0, 128, 128); } inline cv::viz::Color cv::viz::Color::purple() { return Color(128, 0, 128); } +inline cv::viz::Color cv::viz::Color::azure() { return Color(255, 128, 0); } +inline cv::viz::Color cv::viz::Color::chartreuse() { return Color( 0, 255, 128); } +inline cv::viz::Color cv::viz::Color::rose() { return Color(128, 0, 255); } + +inline cv::viz::Color cv::viz::Color::lime() { return Color( 0, 255, 191); } +inline cv::viz::Color cv::viz::Color::gold() { return Color( 0, 215, 255); } +inline cv::viz::Color cv::viz::Color::orange() { return Color( 0, 165, 255); } +inline cv::viz::Color cv::viz::Color::orange_red() { return Color( 0, 69, 255); } inline cv::viz::Color cv::viz::Color::indigo() { return Color(130, 0, 75); } -inline cv::viz::Color cv::viz::Color::pink() { return Color(203, 192, 255); } -inline cv::viz::Color cv::viz::Color::cherry() { return Color( 99, 29, 222); } -inline cv::viz::Color cv::viz::Color::bluberry() { return Color(247, 134, 79); } -inline cv::viz::Color cv::viz::Color::raspberry() { return Color( 92, 11, 227); } -inline cv::viz::Color cv::viz::Color::silver() { return Color(192, 192, 192); } -inline cv::viz::Color cv::viz::Color::violet() { return Color(226, 43, 138); } +inline cv::viz::Color cv::viz::Color::brown() { return Color( 42, 42, 165); } inline cv::viz::Color cv::viz::Color::apricot() { return Color(177, 206, 251); } -inline cv::viz::Color cv::viz::Color::turquoise() { return Color(208, 224, 64); } -inline cv::viz::Color cv::viz::Color::celestial_blue() { return Color(208, 151, 73); } +inline cv::viz::Color cv::viz::Color::pink() { return Color(203, 192, 255); } +inline cv::viz::Color cv::viz::Color::raspberry() { return Color( 92, 11, 227); } +inline cv::viz::Color cv::viz::Color::cherry() { return Color( 99, 29, 222); } +inline cv::viz::Color cv::viz::Color::violet() { return Color(226, 43, 138); } inline cv::viz::Color cv::viz::Color::amethyst() { return Color(204, 102, 153); } +inline cv::viz::Color cv::viz::Color::bluberry() { return Color(247, 134, 79); } +inline cv::viz::Color cv::viz::Color::celestial_blue() { return Color(208, 151, 73); } +inline cv::viz::Color cv::viz::Color::turquoise() { return Color(208, 224, 64); } inline cv::viz::Color cv::viz::Color::not_set() { return Color(-1, -1, -1); } From b7c8e9e8744dec4007144fde65c58ddd8a95a029 Mon Sep 17 00:00:00 2001 From: berak Date: Sat, 2 Nov 2019 08:17:07 +0100 Subject: [PATCH 12/13] python: fix type error msg --- modules/python/src2/gen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 1b3c329fbf..69d03a66fd 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -73,7 +73,7 @@ 
struct PyOpenCV_Converter< ${cname} > return true; } ${mappable_code} - failmsg("Expected ${cname} for argument '%%s'", name); + failmsg("Expected ${cname} for argument '%s'", name); return false; } }; From 53139e6ebe5cb76adddb9bf08590c878dc6a10e3 Mon Sep 17 00:00:00 2001 From: Oleg Alexandrov Date: Sun, 3 Nov 2019 05:37:25 -0800 Subject: [PATCH 13/13] Merge pull request #15838 from oleg-alexandrov:patch-2 Correct stereoRectify documentation --- modules/calib3d/include/opencv2/calib3d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index c4ca492a4d..1dc45fc1b2 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -1543,8 +1543,8 @@ CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints, @param cameraMatrix2 Second camera matrix. @param distCoeffs2 Second camera distortion parameters. @param imageSize Size of the image used for stereo calibration. -@param R Rotation matrix from the coordinate system of the second camera to the first. -@param T Translation vector from the coordinate system of the second camera to the first. +@param R Rotation matrix from the coordinate system of the first camera to the second. +@param T Translation vector from the coordinate system of the first camera to the second. @param R1 Output 3x3 rectification transform (rotation matrix) for the first camera. @param R2 Output 3x3 rectification transform (rotation matrix) for the second camera. @param P1 Output 3x4 projection matrix in the new (rectified) coordinate systems for the first