From 42b1d049996779de0b8d5de2de1e671c11b687dd Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Fri, 12 Jul 2019 01:34:19 +0300 Subject: [PATCH] StereoSGBM algorithm updated to use wide universal intrinsics --- modules/calib3d/src/stereosgbm.cpp | 1712 ++++++++++++++-------------- 1 file changed, 872 insertions(+), 840 deletions(-) diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 88b28ff598..3b721ccf66 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -61,7 +61,9 @@ typedef uchar PixType; typedef short CostType; typedef short DispType; -enum { NR = 16, NR2 = NR/2 }; +// NR - the number of directions. the loop on x that computes Lr assumes that NR == 8. +// if you change NR, please, modify the loop as well. +enum { NR = 8, NR2 = NR/2 }; struct StereoSGBMParams @@ -110,6 +112,41 @@ struct StereoSGBMParams int mode; }; +#if CV_SIMD +#if CV_SIMD_WIDTH == 16 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); } +#elif CV_SIMD_WIDTH == 32 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +#elif CV_SIMD_WIDTH == 64 +static inline v_int16 vx_setseq_s16() +{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } +#else +struct vseq_s16 +{ + short data[v_int16::nlanes]; + vseq_s16() + { + for (int i = 0; i < v_int16::nlanes; i++) + data[i] = i; + } +}; +static inline v_int16 vx_setseq_s16() +{ + static vseq_s16 vseq; + return vx_load(vseq.data); +} +#endif +// define some additional reduce operations: +static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos) +{ + min_val = v_reduce_min(val); + v_int16 v_mask = (vx_setall_s16(min_val) == val); + min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask)); +} +#endif + static const int DEFAULT_RIGHT_BORDER = -1; /* For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD), @@ -128,7 +165,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, { int x, c, width = img1.cols, cn = img1.channels(); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; + int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated //That is needs of parallel algorithm xrange_min = (xrange_min < 0) ? 0: xrange_min; @@ -191,7 +228,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) ); - buffer -= width-1-maxX2; + buffer -= width-maxX2; cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width ) @@ -201,7 +238,9 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, // precompute // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and - for( x = width-1-maxX2; x < width-1- minX2; x++ ) + // to process values from [minX2, maxX2) we should check memory location (width - 1 - maxX2, width - 1 - minX2] + // so iterate through [width - maxX2, width - minX2) + for( x = width-maxX2; x < width-minX2; x++ ) { int v = prow2[x]; int vl = x > 0 ? (v + prow2[x-1])/2 : v; @@ -220,43 +259,38 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, int u0 = std::min(ul, ur); u0 = std::min(u0, u); int u1 = std::max(ul, ur); u1 = std::max(u1, u); - #if CV_SIMD128 - if (true) + int d = minD; + #if CV_SIMD + v_uint8 _u = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0); + v_uint8 _u1 = vx_setall_u8((uchar)u1); + + for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes ) { - v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0); - v_uint8x16 _u1 = v_setall_u8((uchar)u1); + v_uint8 _v = vx_load(prow2 + width-x-1 + d); + v_uint8 _v0 = vx_load(buffer + width-x-1 + d); + v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2); + v_uint8 c0 = v_max(_u - _v1, _v0 - _u); + v_uint8 c1 = v_max(_v - _u1, _u0 - _v); + v_uint8 diff = v_min(c0, c1); - for( int d = minD; d < maxD; d += 16 ) - { - v_uint8x16 _v = v_load(prow2 + width-x-1 + d); - v_uint8x16 _v0 = v_load(buffer + width-x-1 + d); - v_uint8x16 _v1 = v_load(buffer + width-x-1 + d + width2); - v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u); - v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v); - v_uint8x16 diff = v_min(c0, c1); + v_int16 _c0 = vx_load_aligned(cost + x*D + d); + v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes); - v_int16x8 _c0 = v_load_aligned(cost + x*D + d); - v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8); - - v_uint16x8 diff1,diff2; - v_expand(diff,diff1,diff2); - v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); - v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); - } + v_uint16 diff1,diff2; + v_expand(diff,diff1,diff2); + v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); + v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); } - else #endif + for( ; d < maxD; d++ ) { - for( int d = minD; d < maxD; d++ ) - { - int v = prow2[width-x-1 + d]; - int v0 = buffer[width-x-1 + d]; - int v1 = buffer[width-x-1 + d + width2]; - int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); - int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); + int v = prow2[width-x-1 + d]; + int v0 = buffer[width-x-1 + d]; + int v1 = buffer[width-x-1 + d + width2]; + int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); + int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); - cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); - } + cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale)); } } } @@ -287,23 +321,6 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params, Mat& buffer ) { -#if CV_SIMD128 - // maxDisparity is supposed to multiple of 16, so we can forget doing else - static const uchar LSBTab[] = - { - 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 - }; - static const v_uint16x8 v_LSB = v_uint16x8(0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -#endif - - const int ALIGN = 16; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); const CostType MAX_COST = SHRT_MAX; @@ -318,6 +335,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int D = maxD - minD, width1 = maxX1 - minX1; + int Da = (int)alignSize(D, v_int16::nlanes); + int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2; bool fullDP = params.mode == StereoSGBM::MODE_HH; @@ -334,42 +353,33 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, return; } - CV_Assert( D % 16 == 0 ); - - // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8. - // if you change NR, please, modify the loop as well. - int D2 = D+16, NRD2 = NR2*D2; - - // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: - // for 8-way dynamic programming we need the current row and - // the previous row, i.e. 2 rows in total - const int NLR = 2; - const int LrBorder = NLR - 1; - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) // we keep pixel difference cost (C) and the summary cost over NR directions (S). // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) - size_t costBufSize = width1*D; + size_t costBufSize = width1*Da; size_t CSBufSize = costBufSize*(fullDP ? height : 1); - size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2; + size_t minLrSize = (width1 + 2)*NR2, LrSize = minLrSize*Dlra; int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] + // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: + // for 8-way dynamic programming we need the current row and + // the previous row, i.e. 2 rows in total + size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // alignment, C, S costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff - CSBufSize*2*sizeof(CostType) + // C, S - width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost - width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2 + ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType) + // minLr[] and Lr[] + width*(sizeof(CostType) + sizeof(DispType)) + // disp2cost + disp2 + width * (4*img1.channels() + 2) * sizeof(PixType); // temp buffer for computing per-pixel cost if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) buffer.reserveBuffer(totalBufSize); // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); + CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); CostType* Sbuf = Cbuf + CSBufSize; CostType* hsumBuf = Sbuf + CSBufSize; CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows; - CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; + CostType* disp2cost = pixDiff + costBufSize + ((LrSize + minLrSize)*2 + v_int16::nlanes); DispType* disp2ptr = (DispType*)(disp2cost + width); PixType* tempBuf = (PixType*)(disp2ptr + width); @@ -392,19 +402,19 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, x1 = width1-1; x2 = -1; dx = -1; } - CostType *Lr[NLR]={0}, *minLr[NLR]={0}; + CostType *Lr[2]={0}, *minLr[2]={0}; - for( k = 0; k < NLR; k++ ) + for( k = 0; k < 2; k++ ) { // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, // and will occasionally use negative indices with the arrays // we need to shift Lr[k] pointers by 1, to give the space for d=-1. // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8; - memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) ); - minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder; - memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) ); + // thus we shift the pointers by SIMD vector size + Lr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*k + NR2*Dlra; + memset( Lr[k] - NR2*Dlra, 0, LrSize*sizeof(CostType) ); + minLr[k] = pixDiff + costBufSize + v_int16::nlanes + LrSize*2 + minLrSize*k + NR2; + memset( minLr[k] - NR2, 0, minLrSize*sizeof(CostType) ); } for( int y = y1; y != y2; y += dy ) @@ -426,83 +436,124 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, { calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero ); - memset(hsumAdd, 0, D*sizeof(CostType)); - for( x = 0; x <= SW2*D; x += D ) + memset(hsumAdd, 0, Da*sizeof(CostType)); +#if CV_SIMD + v_int16 h_scale = vx_setall_s16((short)SW2 + 1); + for( d = 0; d < Da; d += v_int16::nlanes ) { - int scale = x == 0 ? SW2 + 1 : 1; - for( d = 0; d < D; d++ ) - hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale); + v_int16 v_hsumAdd = vx_load_aligned(pixDiff + d) * h_scale; + for( x = Da; x <= SW2*Da; x += Da ) + v_hsumAdd += vx_load_aligned(pixDiff + x + d); + v_store_aligned(hsumAdd + d, v_hsumAdd); } +#else + for (d = 0; d < D; d++) + { + hsumAdd[d] = (CostType)(pixDiff[d] * (SW2 + 1)); + for( x = Da; x <= SW2*Da; x += Da ) + hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]); + } +#endif if( y > 0 ) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; - for( x = D; x < width1*D; x += D ) - { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d)); +#else + for (d = 0; d < D; d++) + C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]); +#endif - #if CV_SIMD128 - if (true) + for( x = Da; x < width1*Da; x += Da ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) { - for( d = 0; d < D; d += 8 ) - { - v_int16x8 hv = v_load(hsumAdd + x - D + d); - v_int16x8 Cx = v_load(Cprev + x + d); - v_int16x8 psub = v_load(pixSub + d); - v_int16x8 padd = v_load(pixAdd + d); - hv = (hv - psub + padd); - psub = v_load(hsumSub + x + d); - Cx = Cx - psub + hv; - v_store(hsumAdd + x + d, hv); - v_store(C + x + d, Cx); - } + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv); } - else - #endif +#else + for( d = 0; d < D; d++ ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); - } + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); } +#endif } } else { - for( x = D; x < width1*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale); +#else + int scale = k == 0 ? SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] * scale); +#endif + for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == 0 ) + else { - int scale = k == 0 ? SH2 + 1 : 1; - for( x = 0; x < width1*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); + if( y > 0 ) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; + const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; +#if CV_SIMD + for (x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); +#else + for (x = 0; x < width1*Da; x++) + C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else + { +#if CV_SIMD + for (x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for (x = 0; x < width1*Da; x++) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } + } // also, clear the S buffer - for( k = 0; k < width1*D; k++ ) - S[k] = 0; + memset(S, 0, width1*Da * sizeof(CostType)); } - // clear the left and the right borders - memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) ); - memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) ); - memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) ); - memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) ); - /* [formula 13 in the paper] compute L_r(p, d) = C(p, d) + @@ -515,7 +566,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, 0: r=(-dx, 0) 1: r=(-1, -dy) 2: r=(0, -dy) - 3: r=(1, -dy) + 3: r=(1, -dy) !!!Note that only directions 0 to 3 are processed 4: r=(-2, -dy) 5: r=(-1, -dy*2) 6: r=(1, -dy*2) @@ -524,135 +575,139 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = x1; x != x2; x += dx ) { - int xm = x*NR2, xd = xm*D2; + int xm = x*NR2, xd = xm*Dlra; int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2; int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2; - CostType* Lr_p0 = Lr[0] + xd - dx*NRD2; - CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2; - CostType* Lr_p2 = Lr[1] + xd + D2*2; - CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3; + CostType* Lr_p0 = Lr[0] + xd - dx*NR2*Dlra; + CostType* Lr_p1 = Lr[1] + xd - NR2*Dlra + Dlra; + CostType* Lr_p2 = Lr[1] + xd + Dlra*2; + CostType* Lr_p3 = Lr[1] + xd + NR2*Dlra + Dlra*3; Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] = Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; - #if CV_SIMD128 - if (true) + CostType* minL = minLr[0] + xm; + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta0 = vx_setall_s16((short)delta0); + v_int16 _delta1 = vx_setall_s16((short)delta1); + v_int16 _delta2 = vx_setall_s16((short)delta2); + v_int16 _delta3 = vx_setall_s16((short)delta3); + v_int16 _minL0 = vx_setall_s16((short)MAX_COST); + v_int16 _minL1 = vx_setall_s16((short)MAX_COST); + v_int16 _minL2 = vx_setall_s16((short)MAX_COST); + v_int16 _minL3 = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 Spd = vx_load_aligned(Sp + d); + v_int16 L; - v_int16x8 _delta0 = v_setall_s16((short)delta0); - v_int16x8 _delta1 = v_setall_s16((short)delta1); - v_int16x8 _delta2 = v_setall_s16((short)delta2); - v_int16x8 _delta3 = v_setall_s16((short)delta3); - v_int16x8 _minL0 = v_setall_s16((short)MAX_COST); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd; + v_store_aligned(Lr_p + d, L); + _minL0 = v_min(_minL0, L); + Spd += L; - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L0, L1, L2, L3; + L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd; + v_store_aligned(Lr_p + d + Dlra, L); + _minL1 = v_min(_minL1, L); + Spd += L; - L0 = v_load(Lr_p0 + d); - L1 = v_load(Lr_p1 + d); - L2 = v_load(Lr_p2 + d); - L3 = v_load(Lr_p3 + d); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd; + v_store_aligned(Lr_p + d + Dlra*2, L); + _minL2 = v_min(_minL2, L); + Spd += L; - L0 = v_min(L0, (v_load(Lr_p0 + d - 1) + _P1)); - L0 = v_min(L0, (v_load(Lr_p0 + d + 1) + _P1)); + L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd; + v_store_aligned(Lr_p + d + Dlra*3, L); + _minL3 = v_min(_minL3, L); + Spd += L; - L1 = v_min(L1, (v_load(Lr_p1 + d - 1) + _P1)); - L1 = v_min(L1, (v_load(Lr_p1 + d + 1) + _P1)); - - L2 = v_min(L2, (v_load(Lr_p2 + d - 1) + _P1)); - L2 = v_min(L2, (v_load(Lr_p2 + d + 1) + _P1)); - - L3 = v_min(L3, (v_load(Lr_p3 + d - 1) + _P1)); - L3 = v_min(L3, (v_load(Lr_p3 + d + 1) + _P1)); - - L0 = v_min(L0, _delta0); - L0 = ((L0 - _delta0) + Cpd); - - L1 = v_min(L1, _delta1); - L1 = ((L1 - _delta1) + Cpd); - - L2 = v_min(L2, _delta2); - L2 = ((L2 - _delta2) + Cpd); - - L3 = v_min(L3, _delta3); - L3 = ((L3 - _delta3) + Cpd); - - v_store(Lr_p + d, L0); - v_store(Lr_p + d + D2, L1); - v_store(Lr_p + d + D2*2, L2); - v_store(Lr_p + d + D2*3, L3); - - // Get minimum from in L0-L3 - v_int16x8 t02L, t02H, t13L, t13H, t0123L, t0123H; - v_zip(L0, L2, t02L, t02H); // L0[0] L2[0] L0[1] L2[1]... - v_zip(L1, L3, t13L, t13H); // L1[0] L3[0] L1[1] L3[1]... - v_int16x8 t02 = v_min(t02L, t02H); // L0[i] L2[i] L0[i] L2[i]... - v_int16x8 t13 = v_min(t13L, t13H); // L1[i] L3[i] L1[i] L3[i]... - v_zip(t02, t13, t0123L, t0123H); // L0[i] L1[i] L2[i] L3[i]... - v_int16x8 t0 = v_min(t0123L, t0123H); - _minL0 = v_min(_minL0, t0); - - v_int16x8 Sval = v_load(Sp + d); - - L0 = L0 + L1; - L2 = L2 + L3; - Sval = Sval + L0; - Sval = Sval + L2; - - v_store(Sp + d, Sval); - } - - v_int32x4 minL, minH; - v_expand(_minL0, minL, minH); - v_pack_store(&minLr[0][xm], v_min(minL, minH)); + v_store_aligned(Sp + d, Spd); } - else - #endif + +#if CV_SIMD_WIDTH > 32 + minL[0] = v_reduce_min(_minL0); + minL[1] = v_reduce_min(_minL1); + minL[2] = v_reduce_min(_minL2); + minL[3] = v_reduce_min(_minL3); +#else + // Get minimum for L0-L3 + v_int16 t0, t1, t2, t3; + v_zip(_minL0, _minL2, t0, t2); + v_zip(_minL1, _minL3, t1, t3); + v_zip(v_min(t0, t2), v_min(t1, t3), t0, t1); + t0 = v_min(t0, t1); + t0 = v_min(t0, v_rotate_right<4>(t0)); +#if CV_SIMD_WIDTH == 32 + CostType buf[v_int16::nlanes]; + v_store_low(buf, v_min(t0, v_rotate_right<8>(t0))); + minL[0] = buf[0]; + minL[1] = buf[1]; + minL[2] = buf[2]; + minL[3] = buf[3]; +#else + v_store_low(minL, t0); +#endif +#endif +#else + minL[0] = MAX_COST; + minL[1] = MAX_COST; + minL[2] = MAX_COST; + minL[3] = MAX_COST; +#endif + for( ; d < D; d++ ) { - int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST; + int Cpd = Cp[d], L; + int Spd = Sp[d]; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L0, L1, L2, L3; + L = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0; + Lr_p[d] = (CostType)L; + minL[0] = std::min(minL[0], (CostType)L); + Spd += L; - L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; - L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d-1] + P1, std::min(Lr_p1[d+1] + P1, delta1))) - delta1; - L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d-1] + P1, std::min(Lr_p2[d+1] + P1, delta2))) - delta2; - L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d-1] + P1, std::min(Lr_p3[d+1] + P1, delta3))) - delta3; + L = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1; + Lr_p[d + Dlra] = (CostType)L; + minL[1] = std::min(minL[1], (CostType)L); + Spd += L; - Lr_p[d] = (CostType)L0; - minL0 = std::min(minL0, L0); + L = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2; + Lr_p[d + Dlra*2] = (CostType)L; + minL[2] = std::min(minL[2], (CostType)L); + Spd += L; - Lr_p[d + D2] = (CostType)L1; - minL1 = std::min(minL1, L1); + L = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3; + Lr_p[d + Dlra*3] = (CostType)L; + minL[3] = std::min(minL[3], (CostType)L); + Spd += L; - Lr_p[d + D2*2] = (CostType)L2; - minL2 = std::min(minL2, L2); - - Lr_p[d + D2*3] = (CostType)L3; - minL3 = std::min(minL3, L3); - - Sp[d] = saturate_cast(Sp[d] + L0 + L1 + L2 + L3); - } - minLr[0][xm] = (CostType)minL0; - minLr[0][xm+1] = (CostType)minL1; - minLr[0][xm+2] = (CostType)minL2; - minLr[0][xm+3] = (CostType)minL3; + Sp[d] = saturate_cast(Spd); } } if( pass == npasses ) { - for( x = 0; x < width; x++ ) + x = 0; +#if CV_SIMD + v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED); + v_int16 v_max_cost = vx_setall_s16(MAX_COST); + for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes ) + { + v_store(disp1ptr + x, v_inv_dist); + v_store(disp2ptr + x, v_inv_dist); + v_store(disp2cost + x, v_max_cost); + } +#endif + for( ; x < width; x++ ) { disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp2cost[x] = MAX_COST; @@ -660,127 +715,81 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, for( x = width1 - 1; x >= 0; x-- ) { - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; + CostType* Sp = S + x*Da; + CostType minS = MAX_COST; + short bestDisp = -1; if( npasses == 1 ) { - int xm = x*NR2, xd = xm*D2; + int xm = x*NR2, xd = xm*Dlra; - int minL0 = MAX_COST; - int delta0 = minLr[0][xm + NR2] + P2; - CostType* Lr_p0 = Lr[0] + xd + NRD2; + CostType* Lr_p0 = Lr[0] + xd + NR2*Dlra; Lr_p0[-1] = Lr_p0[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; + const CostType* Cp = C + x*Da; - #if CV_SIMD128 - if (true) + d = 0; + int delta0 = minLr[0][xm + NR2] + P2; + int minL0 = MAX_COST; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + v_int16 _delta0 = vx_setall_s16((short)delta0); + + v_int16 _minL0 = vx_setall_s16((short)MAX_COST); + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); - v_int16x8 _delta0 = v_setall_s16((short)delta0); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd; - v_int16x8 _minL0 = v_setall_s16((short)minL0); - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); + v_store_aligned(Lr_p + d, L0); + _minL0 = v_min(_minL0, L0); + L0 += vx_load_aligned(Sp + d); + v_store_aligned(Sp + d, L0); - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L0 = v_load(Lr_p0 + d); - - L0 = v_min(L0, v_load(Lr_p0 + d - 1) + _P1); - L0 = v_min(L0, v_load(Lr_p0 + d + 1) + _P1); - L0 = v_min(L0, _delta0); - L0 = L0 - _delta0 + Cpd; - - v_store(Lr_p + d, L0); - _minL0 = v_min(_minL0, L0); - L0 = L0 + v_load(Sp + d); - v_store(Sp + d, L0); - - v_int16x8 mask = _minS > L0; - _minS = v_min(_minS, L0); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 += _8; - } - short bestDispBuf[8]; - v_store(bestDispBuf, _bestDisp); - - v_int32x4 min32L, min32H; - v_expand(_minL0, min32L, min32H); - minLr[0][xm] = (CostType)std::min(v_reduce_min(min32L), v_reduce_min(min32H)); - - v_expand(_minS, min32L, min32H); - minS = std::min(v_reduce_min(min32L), v_reduce_min(min32H)); - - v_int16x8 ss = v_setall_s16((short)minS); - v_uint16x8 minMask = v_reinterpret_as_u16(ss == _minS); - v_uint16x8 minBit = minMask & v_LSB; - - v_uint32x4 minBitL, minBitH; - v_expand(minBit, minBitL, minBitH); - - int idx = v_reduce_sum(minBitL) + v_reduce_sum(minBitH); - bestDisp = bestDispBuf[LSBTab[idx]]; + _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp); + _minS = v_min(_minS, L0); } - else - #endif + minL0 = (CostType)v_reduce_min(_minL0); + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; + + Lr_p[d] = (CostType)L0; + minL0 = std::min(minL0, L0); + + CostType Sval = Sp[d] = saturate_cast(Sp[d] + L0); + if( Sval < minS ) { - int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0; - - Lr_p[d] = (CostType)L0; - minL0 = std::min(minL0, L0); - - int Sval = Sp[d] = saturate_cast(Sp[d] + L0); - if( Sval < minS ) - { - minS = Sval; - bestDisp = d; - } + minS = Sval; + bestDisp = (short)d; } - minLr[0][xm] = (CostType)minL0; } + minLr[0][xm] = (CostType)minL0; } else { - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes ) { - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); - - for( d = 0; d < D; d+= 8 ) - { - v_int16x8 L0 = v_load(Sp + d); - v_int16x8 mask = L0 < _minS; - _minS = v_min( L0, _minS ); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 = _d8 + _8; - } - v_int32x4 _d0, _d1; - v_expand(_minS, _d0, _d1); - minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); - v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; - - _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - v_expand(_bestDisp, _d0, _d1); - bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); + v_int16 L0 = vx_load_aligned(Sp + d); + _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp); + _minS = v_min( L0, _minS ); } - else - #endif + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int Sval = Sp[d]; + if( Sval < minS ) { - int Sval = Sp[d]; - if( Sval < minS ) - { - minS = Sval; - bestDisp = d; - } + minS = (CostType)Sval; + bestDisp = (short)d; } } } @@ -853,12 +862,13 @@ struct CalcVerticalSums: public ParallelLoopBody width = img1.cols; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); D = maxD - minD; + Da = (int)alignSize(D, v_int16::nlanes); + Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - D2 = D + 16; - costBufSize = width1*D; + costBufSize = width1*Da; CSBufSize = costBufSize*height; minLrSize = width1; - LrSize = minLrSize*D2; + LrSize = minLrSize*Dlra; hsumBufNRows = SH2*2 + 2; Cbuf = alignedBuf; Sbuf = Cbuf + CSBufSize; @@ -868,20 +878,19 @@ struct CalcVerticalSums: public ParallelLoopBody void operator()(const Range& range) const CV_OVERRIDE { static const CostType MAX_COST = SHRT_MAX; - static const int ALIGN = 16; static const int TAB_OFS = 256*4; static const int npasses = 2; int x1 = range.start, x2 = range.end, k; - size_t pixDiffSize = ((x2 - x1) + 2*SW2)*D; - size_t auxBufsSize = pixDiffSize*sizeof(CostType) + //pixdiff size - width*16*img1.channels()*sizeof(PixType) + 32; //tempBuf + size_t pixDiffSize = ((x2 - x1) + 2*SW2)*Da; + size_t auxBufsSize = CV_SIMD_WIDTH + pixDiffSize*sizeof(CostType) + //alignment and pixdiff size + width*(4*img1.channels()+2)*sizeof(PixType); //tempBuf Mat auxBuff; auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), ALIGN); + CostType* pixDiff = (CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH); PixType* tempBuf = (PixType*)(pixDiff + pixDiffSize); // Simplification of index calculation - pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*D; + pixDiff -= (x1>SW2 ? (x1 - SW2): 0)*Da; for( int pass = 1; pass <= npasses; pass++ ) { @@ -896,18 +905,18 @@ struct CalcVerticalSums: public ParallelLoopBody y1 = height-1; y2 = -1; dy = -1; } - CostType *Lr[NLR]={0}, *minLr[NLR]={0}; + CostType *Lr[2]={0}, *minLr[2]={0}; - for( k = 0; k < NLR; k++ ) + for( k = 0; k < 2; k++ ) { // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, // and will occasionally use negative indices with the arrays // we need to shift Lr[k] pointers by 1, to give the space for d=-1. // however, then the alignment will be imperfect, i.e. bad for SSE, - // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) - Lr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*k + 8; - memset( Lr[k] + x1*D2 - 8, 0, (x2-x1)*D2*sizeof(CostType) ); - minLr[k] = hsumBuf + costBufSize*hsumBufNRows + LrSize*NLR + minLrSize*k; + // thus we shift the pointers by SIMD vector size + Lr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*k; + memset( Lr[k] + x1*Dlra, 0, (x2-x1)*Dlra*sizeof(CostType) ); + minLr[k] = hsumBuf + costBufSize*hsumBufNRows + v_int16::nlanes + LrSize*2 + minLrSize*k; memset( minLr[k] + x1, 0, (x2-x1)*sizeof(CostType) ); } @@ -929,78 +938,115 @@ struct CalcVerticalSums: public ParallelLoopBody { calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero, x1 - SW2, x2 + SW2); - memset(hsumAdd + x1*D, 0, D*sizeof(CostType)); - for( x = (x1 - SW2)*D; x <= (x1 + SW2)*D; x += D ) + memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType)); + for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da ) { - int xbord = x <= 0 ? 0 : (x > (width1 - 1)*D? (width1 - 1)*D : x); + int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x); +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) + v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d)); +#else for( d = 0; d < D; d++ ) - hsumAdd[x1*D + d] = (CostType)(hsumAdd[x1*D + d] + pixDiff[xbord + d]); + hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]); +#endif } if( y > 0 ) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* Cprev = C - costBufSize; - +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) + v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d)); +#else for( d = 0; d < D; d++ ) - C[x1*D + d] = (CostType)(Cprev[x1*D + d] + hsumAdd[x1*D + d] - hsumSub[x1*D + d]); - - for( x = (x1+1)*D; x < x2*D; x += D ) + C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]); +#endif + for( x = (x1+1)*Da; x < x2*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); - #if CV_SIMD128 - if (true) +#if CV_SIMD + for( d = 0; d < Da; d += v_int16::nlanes ) { - for( d = 0; d < D; d += 8 ) - { - v_int16x8 hv = v_load(hsumAdd + x - D + d); - v_int16x8 Cx = v_load(Cprev + x + d); - v_int16x8 psub = v_load(pixSub + d); - v_int16x8 padd = v_load(pixAdd + d); - hv = (hv - psub + padd); - psub = v_load(hsumSub + x + d); - Cx = Cx - psub + hv; - v_store(hsumAdd + x + d, hv); - v_store(C + x + d, Cx); - } + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv); } - else - #endif +#else + for( d = 0; d < D; d++ ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); - } + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); } +#endif } } else { - for( x = (x1+1)*D; x < x2*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale); +#else + int scale = k == 0 ? SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[x1*Da + d] = (CostType)(C[x1*Da + d] + hsumAdd[x1*Da + d] * scale); +#endif + for( x = (x1+1)*Da; x < x2*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == 0 ) + else { - int scale = k == 0 ? SH2 + 1 : 1; - for( x = x1*D; x < x2*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); +/* if (y > 0) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; + const CostType* Cprev = C - costBufSize; + +#if CV_SIMD + for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) + v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = x1*Da; x < x2*Da; x++ ) + C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else*/ + if(y == 0) + { +#if CV_SIMD + for( x = x1*Da; x < x2*Da; x += v_int16::nlanes ) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = x1*Da; x < x2*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } } // also, clear the S buffer - for( k = x1*D; k < x2*D; k++ ) - S[k] = 0; + memset(S + x1*Da, 0, (x2-x1)*Da*sizeof(CostType)); } // [formula 13 in the paper] @@ -1015,7 +1061,7 @@ struct CalcVerticalSums: public ParallelLoopBody for( x = x1; x != x2; x++ ) { - int xd = x*D2; + int xd = x*Dlra; int delta = minLr[1][x] + P2; @@ -1024,64 +1070,39 @@ struct CalcVerticalSums: public ParallelLoopBody Lr_ppr[-1] = Lr_ppr[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; - #if CV_SIMD128 - if (true) + CostType& minL = minLr[0][x]; + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta = vx_setall_s16((short)delta); + v_int16 _minL = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); - - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_store(Sp + d, Sval); - } - - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr[0][x] = (CostType)v_reduce_min(min12); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); } - else - #endif + minL = v_reduce_min(_minL); +#else + minL = MAX_COST; +#endif + for( ; d < D; d++ ) { - int minL = MAX_COST; + int Cpd = Cp[d], L; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + Lr_p[d] = (CostType)L; + minL = std::min(minL, (CostType)L); - Lr_p[d] = (CostType)L; - minL = std::min(minL, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } - minLr[0][x] = (CostType)minL; + Sp[d] = saturate_cast(Sp[d] + L); } } @@ -1091,7 +1112,6 @@ struct CalcVerticalSums: public ParallelLoopBody } } } - static const int NLR = 2; const Mat& img1; const Mat& img2; CostType* Cbuf; @@ -1100,8 +1120,7 @@ struct CalcVerticalSums: public ParallelLoopBody PixType* clipTab; int minD; int maxD; - int D; - int D2; + int D, Da, Dlra; int SH2; int SW2; int width; @@ -1135,11 +1154,12 @@ struct CalcHorizontalSums: public ParallelLoopBody INVALID_DISP = minD - 1; INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; D = maxD - minD; + Da = (int)alignSize(D, v_int16::nlanes); + Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; - costBufSize = width1*D; + costBufSize = width1*Da; CSBufSize = costBufSize*height; - D2 = D + 16; - LrSize = 2 * D2; + LrSize = 2 * Dlra; Cbuf = alignedBuf; Sbuf = Cbuf + CSBufSize; } @@ -1147,11 +1167,11 @@ struct CalcHorizontalSums: public ParallelLoopBody void operator()(const Range& range) const CV_OVERRIDE { int y1 = range.start, y2 = range.end; - size_t auxBufsSize = LrSize * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)) + 32; + size_t auxBufsSize = CV_SIMD_WIDTH + (v_int16::nlanes + LrSize) * sizeof(CostType) + width*(sizeof(CostType) + sizeof(DispType)); Mat auxBuff; auxBuff.create(1, (int)auxBufsSize, CV_8U); - CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), ALIGN)) + 8; + CostType *Lr = ((CostType*)alignPtr(auxBuff.ptr(), CV_SIMD_WIDTH)) + v_int16::nlanes; CostType* disp2cost = Lr + LrSize; DispType* disp2ptr = (DispType*)(disp2cost + width); @@ -1164,15 +1184,26 @@ struct CalcHorizontalSums: public ParallelLoopBody CostType* C = Cbuf + y*costBufSize; CostType* S = Sbuf + y*costBufSize; - for( x = 0; x < width; x++ ) + x = 0; +#if CV_SIMD + v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED); + v_int16 v_max_cost = vx_setall_s16(MAX_COST); + for (; x <= width - v_int16::nlanes; x += v_int16::nlanes) + { + v_store(disp1ptr + x, v_inv_dist); + v_store(disp2ptr + x, v_inv_dist); + v_store(disp2cost + x, v_max_cost); + } +#endif + for( ; x < width; x++ ) { disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp2cost[x] = MAX_COST; } // clear buffers - memset( Lr - 8, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST; + memset( Lr, 0, LrSize*sizeof(CostType) ); + Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; minLr = 0; // [formula 13 in the paper] @@ -1189,70 +1220,43 @@ struct CalcHorizontalSums: public ParallelLoopBody { int delta = minLr + P2; - CostType* Lr_ppr = Lr + ((x&1)? 0 : D2); + CostType* Lr_ppr = Lr + ((x&1)? 0 : Dlra); - CostType* Lr_p = Lr + ((x&1)? D2 :0); - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; + CostType* Lr_p = Lr + ((x&1)? Dlra :0); + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + + v_int16 _delta = vx_setall_s16((short)delta); + v_int16 _minL = vx_setall_s16((short)MAX_COST); + + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes) { - v_int16x8 _P1 = v_setall_s16((short)P1); - - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - for( d = 0; d < D; d += 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_store(Sp + d, Sval); - } - - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr = (CostType)v_reduce_min(min12); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L); } - else - #endif + minLr = v_reduce_min(_minL); +#else + minLr = MAX_COST; +#endif + for( ; d < D; d++ ) { - minLr = MAX_COST; - for( d = 0; d < D; d++ ) - { - int Cpd = Cp[d], L; - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minLr = (CostType)std::min((int)minLr, L); - - Sp[d] = saturate_cast(Sp[d] + L); - } + int Cpd = Cp[d], L; + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + Lr_p[d] = (CostType)L; + minLr = std::min(minLr, (CostType)L); + Sp[d] = saturate_cast(Sp[d] + L); } } - memset( Lr - 8, 0, LrSize*sizeof(CostType) ); - Lr[-1] = Lr[D] = Lr[D2 - 1] = Lr[D2 + D] = MAX_COST; + memset( Lr, 0, LrSize*sizeof(CostType) ); + Lr[-1] = Lr[D] = Lr[Dlra - 1] = Lr[Dlra + D] = MAX_COST; minLr = 0; @@ -1260,88 +1264,55 @@ struct CalcHorizontalSums: public ParallelLoopBody { int delta = minLr + P2; - CostType* Lr_ppr = Lr + ((x&1)? 0 :D2); + CostType* Lr_ppr = Lr + ((x&1)? 0 :Dlra); - CostType* Lr_p = Lr + ((x&1)? D2 :0); - const CostType* Cp = C + x*D; - CostType* Sp = S + x*D; - int minS = MAX_COST, bestDisp = -1; + CostType* Lr_p = Lr + ((x&1)? Dlra :0); + const CostType* Cp = C + x*Da; + CostType* Sp = S + x*Da; + CostType minS = MAX_COST; + short bestDisp = -1; minLr = MAX_COST; - #if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + v_int16 _P1 = vx_setall_s16((short)P1); + v_int16 _delta = vx_setall_s16((short)delta); + + v_int16 _minL = vx_setall_s16((short)MAX_COST); + v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1); + for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes ) { - v_int16x8 _P1 = v_setall_s16((short)P1); + v_int16 Cpd = vx_load_aligned(Cp + d); + v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd; + v_store_aligned(Lr_p + d, L); + _minL = v_min(_minL, L); + L += vx_load_aligned(Sp + d); + v_store_aligned(Sp + d, L); - v_int16x8 _delta = v_setall_s16((short)delta); - v_int16x8 _minL = v_setall_s16((short)MAX_COST); - - v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1); - v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8); - - for( d = 0; d < D; d+= 8 ) - { - v_int16x8 Cpd = v_load(Cp + d); - v_int16x8 L; - - L = v_load(Lr_ppr + d); - - L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1)); - L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1)); - - L = v_min(L, _delta); - L = ((L - _delta) + Cpd); - - v_store(Lr_p + d, L); - - // Get minimum from in L-L3 - _minL = v_min(_minL, L); - - v_int16x8 Sval = v_load(Sp + d); - - Sval = Sval + L; - - v_int16x8 mask = Sval < _minS; - _minS = v_min( Sval, _minS ); - _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask); - _d8 = _d8 + _8; - - v_store(Sp + d, Sval); - } - v_int32x4 min1, min2, min12; - v_expand(_minL, min1, min2); - min12 = v_min(min1,min2); - minLr = (CostType)v_reduce_min(min12); - - v_int32x4 _d0, _d1; - v_expand(_minS, _d0, _d1); - minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); - v_int16x8 v_mask = v_setall_s16((short)minS) == _minS; - - _bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - v_expand(_bestDisp, _d0, _d1); - bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1)); + _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp); + _minS = v_min( L, _minS ); } - else - #endif + minLr = v_reduce_min(_minL); + + min_pos(_minS, _bestDisp, minS, bestDisp); +#endif + for( ; d < D; d++ ) { - for( d = 0; d < D; d++ ) + int Cpd = Cp[d], L; + + L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; + + Lr_p[d] = (CostType)L; + minLr = std::min(minLr, (CostType)L); + + Sp[d] = saturate_cast(Sp[d] + L); + if( Sp[d] < minS ) { - int Cpd = Cp[d], L; - - L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta; - - Lr_p[d] = (CostType)L; - minLr = (CostType)std::min((int)minLr, L); - - Sp[d] = saturate_cast(Sp[d] + L); - if( Sp[d] < minS ) - { - minS = Sp[d]; - bestDisp = d; - } + minS = Sp[d]; + bestDisp = (short)d; } } + //Some postprocessing procedures and saving for( d = 0; d < D; d++ ) { @@ -1392,7 +1363,6 @@ struct CalcHorizontalSums: public ParallelLoopBody static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; static const int DISP_SCALE = (1 << DISP_SHIFT); static const CostType MAX_COST = SHRT_MAX; - static const int ALIGN = 16; const Mat& img1; const Mat& img2; Mat& disp1; @@ -1400,8 +1370,7 @@ struct CalcHorizontalSums: public ParallelLoopBody CostType* Sbuf; int minD; int maxD; - int D; - int D2; + int D, Da, Dlra; int width; int width1; int height; @@ -1435,7 +1404,6 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params, Mat& buffer ) { - const int ALIGN = 16; const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); int minD = params.minDisparity, maxD = minD + params.numDisparities; @@ -1445,7 +1413,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1); int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = maxD - minD, width1 = maxX1 - minX1; + int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; + int Dlra = D + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int SH2 = SADWindowSize.height/2; int INVALID_DISP = minD - 1; int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; @@ -1461,25 +1430,20 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, return; } - CV_Assert( D % 16 == 0 ); - - int D2 = D+16; + // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) + // we keep pixel difference cost (C) and the summary cost over 4 directions (S). + // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: // for dynamic programming we need the current row and // the previous row, i.e. 2 rows in total - const int NLR = 2; - - // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) - // we keep pixel difference cost (C) and the summary cost over 4 directions (S). - // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) size_t costBufSize = width1*D; size_t CSBufSize = costBufSize*height; - size_t minLrSize = width1 , LrSize = minLrSize*D2; + size_t minLrSize = width1 , LrSize = minLrSize*Dlra; int hsumBufNRows = SH2*2 + 2; - size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[] - costBufSize*hsumBufNRows*sizeof(CostType) + // hsumBuf - CSBufSize*2*sizeof(CostType) + 1024; // C, S + size_t totalBufSize = CV_SIMD_WIDTH + CSBufSize * 2 * sizeof(CostType) + // Alignment, C, S + costBufSize*hsumBufNRows * sizeof(CostType) + // hsumBuf + ((LrSize + minLrSize)*2 + v_int16::nlanes) * sizeof(CostType); // minLr[] and Lr[] if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) @@ -1488,7 +1452,7 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, } // summary cost over different (nDirs) directions - CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); + CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); // add P2 to every C(x,y). it saves a few operations in the inner loops for(k = 0; k < (int)CSBufSize; k++ ) @@ -1501,7 +1465,7 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, ////////////////////////////////////////////////////////////////////////////////////////////////////// -void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, +void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, PixType*& tmpBuf, CostType*& horPassCostVolume, CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, @@ -1517,7 +1481,7 @@ struct SGBM3WayMainLoop : public ParallelLoopBody int stripe_overlap; int width,height; - int minD, maxD, D; + int minD, maxD, D, Da; int minX1, maxX1, width1; int SW2, SH2; @@ -1528,10 +1492,13 @@ struct SGBM3WayMainLoop : public ParallelLoopBody int TAB_OFS, ftzero; PixType* clipTab; - +#if CV_SIMD + short idx_row[v_int16::nlanes]; +#endif SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap); void getRawMatchingCost(CostType* C, CostType* hsumBuf, CostType* pixDiff, PixType* tmpBuf, int y, int src_start_idx) const; void operator () (const Range& range) const CV_OVERRIDE; + template void impl(const Range& range) const; }; SGBM3WayMainLoop::SGBM3WayMainLoop(Mat *_buffers, const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, PixType* _clipTab, int _nstripes, int _stripe_overlap): @@ -1544,7 +1511,7 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli width = img1->cols; height = img1->rows; minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD; minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1; - CV_Assert( D % 16 == 0 ); + Da = (int)alignSize(D, v_int16::nlanes); SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1; @@ -1552,22 +1519,26 @@ buffers(_buffers), img1(&_img1), img2(&_img2), dst_disp(_dst_disp), clipTab(_cli uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; - costBufSize = width1*D; + costBufSize = width1*Da; hsumBufNRows = SH2*2 + 2; TAB_OFS = 256*4; ftzero = std::max(params.preFilterCap, 15) | 1; +#if CV_SIMD + for(short i = 0; i < v_int16::nlanes; ++i) + idx_row[i] = i; +#endif } -void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, int SH2, int P2, +void getBufferPointers(Mat& buffer, int width, int width1, int Da, int num_ch, int SH2, int P2, CostType*& curCostVolumeLine, CostType*& hsumBuf, CostType*& pixDiff, PixType*& tmpBuf, CostType*& horPassCostVolume, CostType*& vertPassCostVolume, CostType*& vertPassMin, CostType*& rightPassBuf, CostType*& disp2CostBuf, short*& disp2Buf) { // allocating all the required memory: - int costVolumeLineSize = width1*D; + int costVolumeLineSize = width1*Da; int width1_ext = width1+2; - int costVolumeLineSize_ext = width1_ext*D; + int costVolumeLineSize_ext = width1_ext*Da; int hsumBufNRows = SH2*2 + 2; // main buffer to store matching costs for the current line: @@ -1576,49 +1547,55 @@ void getBufferPointers(Mat& buffer, int width, int width1, int D, int num_ch, in // auxiliary buffers for the raw matching cost computation: int hsumBufSize = costVolumeLineSize*hsumBufNRows*sizeof(CostType); int pixDiffSize = costVolumeLineSize*sizeof(CostType); - int tmpBufSize = width*16*num_ch*sizeof(PixType); + int tmpBufSize = width * (4 * num_ch + 2) * sizeof(PixType); // auxiliary buffers for the matching cost aggregation: int horPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the 2-pass horizontal cost aggregation int vertPassCostVolumeSize = costVolumeLineSize_ext*sizeof(CostType); // buffer for the vertical cost aggregation + int rightPassBufSize = Da * sizeof(CostType); // additional small buffer for the right-to-left pass int vertPassMinSize = width1_ext*sizeof(CostType); // buffer for storing minimum costs from the previous line - int rightPassBufSize = D*sizeof(CostType); // additional small buffer for the right-to-left pass // buffers for the pseudo-LRC check: int disp2CostBufSize = width*sizeof(CostType); int disp2BufSize = width*sizeof(short); // sum up the sizes of all the buffers: - size_t totalBufSize = curCostVolumeLineSize + + size_t totalBufSize = CV_SIMD_WIDTH + curCostVolumeLineSize + hsumBufSize + pixDiffSize + - tmpBufSize + horPassCostVolumeSize + vertPassCostVolumeSize + - vertPassMinSize + rightPassBufSize + + vertPassMinSize + disp2CostBufSize + disp2BufSize + - 16; //to compensate for the alignPtr shifts + tmpBufSize; if( buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize ) buffer.reserveBuffer(totalBufSize); // set up all the pointers: - curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), 16); + curCostVolumeLine = (CostType*)alignPtr(buffer.ptr(), CV_SIMD_WIDTH); hsumBuf = curCostVolumeLine + costVolumeLineSize; pixDiff = hsumBuf + costVolumeLineSize*hsumBufNRows; - tmpBuf = (PixType*)(pixDiff + costVolumeLineSize); - horPassCostVolume = (CostType*)(tmpBuf + width*16*num_ch); + horPassCostVolume = pixDiff + costVolumeLineSize; vertPassCostVolume = horPassCostVolume + costVolumeLineSize_ext; rightPassBuf = vertPassCostVolume + costVolumeLineSize_ext; - vertPassMin = rightPassBuf + D; + vertPassMin = rightPassBuf + Da; + disp2CostBuf = vertPassMin + width1_ext; disp2Buf = disp2CostBuf + width; + tmpBuf = (PixType*)(disp2Buf + width); // initialize memory: memset(buffer.ptr(),0,totalBufSize); - for(int i=0;i src_start_idx ) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; - for( x = D; x < width1*D; x += D ) - { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - -#if CV_SIMD128 - if (true) - { - v_int16x8 hv_reg; - for( d = 0; d < D; d+=8 ) - { - hv_reg = v_load_aligned(hsumAdd+x-D+d) + (v_load_aligned(pixAdd+d) - v_load_aligned(pixSub+d)); - v_store_aligned(hsumAdd+x+d,hv_reg); - v_store_aligned(C+x+d,v_load_aligned(C+x+d)+(hv_reg-v_load_aligned(hsumSub+x+d))); - } - } - else +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d)); +#else + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]); #endif + + for( x = Da; x < width1*Da; x += Da ) + { + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + v_int16 hv_reg; + for( d = 0; d < Da; d+=v_int16::nlanes ) { - for( d = 0; d < D; d++ ) - { - int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); - C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]); - } + hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d); + v_store_aligned(hsumAdd+x+d,hv_reg); + v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d)); } +#else + for( d = 0; d < D; d++ ) + { + int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]); + } +#endif } } else { - for( x = D; x < width1*D; x += D ) +#if CV_SIMD + v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1); + for (d = 0; d < Da; d += v_int16::nlanes) + v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale); +#else + int scale = k == src_start_idx ? SH2 + 1 : 1; + for (d = 0; d < D; d++) + C[d] = (CostType)(C[d] + hsumAdd[d] * scale); +#endif + for( x = Da; x < width1*Da; x += Da ) { - const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D); - const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0); - - for( d = 0; d < D; d++ ) - hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); + const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da); + const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0); +#if CV_SIMD + for (d = 0; d < Da; d += v_int16::nlanes) + { + v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d); + v_store_aligned(hsumAdd + x + d, hv); + v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale); + } +#else + for (d = 0; d < D; d++) + { + CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]); + hsumAdd[x + d] = hv; + C[x + d] = (CostType)(C[x + d] + hv * scale); + } +#endif } } } - - if( y == src_start_idx ) + else { - int scale = k == src_start_idx ? SH2 + 1 : 1; - for( x = 0; x < width1*D; x++ ) - C[x] = (CostType)(C[x] + hsumAdd[x]*scale); + if( y > src_start_idx ) + { + const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, src_start_idx) % hsumBufNRows)*costBufSize; +#if CV_SIMD + for( x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x)); +#else + for( x = 0; x < width1*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]); +#endif + } + else + { +#if CV_SIMD + for( x = 0; x < width1*Da; x += v_int16::nlanes) + v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x)); +#else + for( x = 0; x < width1*Da; x++ ) + C[x] = (CostType)(C[x] + hsumAdd[x]); +#endif + } } } } -#if CV_SIMD128 -// define some additional reduce operations: -inline short min_pos(const v_int16x8& val, const v_int16x8& pos, const short min_val) -{ - v_int16x8 v_min = v_setall_s16(min_val); - v_int16x8 v_mask = v_min == val; - v_int16x8 v_pos = (pos & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask); - - return v_reduce_min(v_pos); -} -#endif - // performing SGM cost accumulation from left to right (result is stored in leftBuf) and // in-place cost accumulation from top to bottom (result is stored in topBuf) +template inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, CostType* topBuf, CostType* costs, CostType& leftMinCost, CostType& topMinCost, int D, int P1, int P2) { -#if CV_SIMD128 - if (true) + int i = 0; +#if CV_SIMD + int Da = (int)alignSize(D, v_int16::nlanes); + v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); + + v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast(leftMinCost+P2)); + v_int16 leftMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_leftBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_leftBuf = vx_load_aligned(leftBuf_prev); + + v_int16 topMinCostP2_reg = vx_setall_s16(cv::saturate_cast(topMinCost+P2)); + v_int16 topMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_topBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_topBuf = vx_load_aligned(topBuf); + + v_int16 src2; + v_int16 src_shifted_left,src_shifted_right; + v_int16 res; + + for(;i(P1)); - - v_int16x8 leftMinCostP2_reg = v_setall_s16(cv::saturate_cast(leftMinCost+P2)); - v_int16x8 leftMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_leftBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_leftBuf = v_load_aligned(leftBuf_prev); - - v_int16x8 topMinCostP2_reg = v_setall_s16(cv::saturate_cast(topMinCost+P2)); - v_int16x8 topMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_topBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_topBuf = v_load_aligned(topBuf); - - v_int16x8 src2; - v_int16x8 src_shifted_left,src_shifted_right; - v_int16x8 res; - - for(int i=0;i (src0_leftBuf,src1_leftBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg; - - // process and save current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); - leftMinCost_new_reg = v_min(leftMinCost_new_reg,res); - v_store_aligned(leftBuf+i, res); - - //update src buffers: - src0_leftBuf = src1_leftBuf; - src1_leftBuf = src2; - - //process topBuf: - //lookahead load: - src2 = v_load_aligned(topBuf+i+8); - - //get shifted versions of the current block and add P1: - src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg; - - // process and save current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); - topMinCost_new_reg = v_min(topMinCost_new_reg,res); - v_store_aligned(topBuf+i, res); - - //update src buffers: - src0_topBuf = src1_topBuf; - src1_topBuf = src2; - } - - // a bit different processing for the last cycle of the loop: //process leftBuf: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_leftBuf,src1_leftBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_leftBuf,src2 ) + P1_reg; + //lookahead load: + src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes); - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); - leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res)); - v_store_aligned(leftBuf+D-8, res); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf); + src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 ); + + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost_new_reg = v_min(leftMinCost_new_reg,res); + v_store_aligned(leftBuf+i, res); + + //update src buffers: + src0_leftBuf = src1_leftBuf; + src1_leftBuf = src2; //process topBuf: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_topBuf,src1_topBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_topBuf,src2 ) + P1_reg; + //lookahead load: + src2 = vx_load_aligned(topBuf+i+v_int16::nlanes); - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf); + src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 ); + + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); + topMinCost_new_reg = v_min(topMinCost_new_reg,res); + v_store_aligned(topBuf+i, res); + + //update src buffers: + src0_topBuf = src1_topBuf; + src1_topBuf = src2; + } + + // a bit different processing for the last cycle of the loop: + if(x_nlanes) + { + src2 = vx_setall_s16(SHRT_MAX); + //process leftBuf: + src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf); + src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 ); + + res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg); + leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res)); + v_store_aligned(leftBuf+Da-v_int16::nlanes, res); + + //process topBuf: + src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf); + src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 ); + + res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg); topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res)); - v_store_aligned(topBuf+D-8, res); + v_store_aligned(topBuf+Da-v_int16::nlanes, res); } else -#endif + { + CostType leftMinCost_new = v_reduce_min(leftMinCost_new_reg); + CostType topMinCost_new = v_reduce_min(topMinCost_new_reg); + CostType leftBuf_prev_i_minus_1 = i > 0 ? leftBuf_prev[i-1] : SHRT_MAX; + CostType topBuf_i_minus_1 = i > 0 ? topBuf[i-1] : SHRT_MAX; +#else { CostType leftMinCost_new = SHRT_MAX; CostType topMinCost_new = SHRT_MAX; - int leftMinCost_P2 = leftMinCost + P2; - int topMinCost_P2 = topMinCost + P2; CostType leftBuf_prev_i_minus_1 = SHRT_MAX; CostType topBuf_i_minus_1 = SHRT_MAX; +#endif + int leftMinCost_P2 = leftMinCost + P2; + int topMinCost_P2 = topMinCost + P2; CostType tmp; - - for(int i=0;i(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2); leftBuf_prev_i_minus_1 = leftBuf_prev[i]; @@ -1825,83 +1846,86 @@ inline void accumulateCostsLeftTop(CostType* leftBuf, CostType* leftBuf_prev, Co // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the // optimal disparity value with minimum accumulated cost +template inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf, CostType* costs, - CostType& rightMinCost, int D, int P1, int P2, int& optimal_disp, CostType& min_cost) + CostType& rightMinCost, int D, int P1, int P2, short& optimal_disp, CostType& min_cost) { -#if CV_SIMD128 - if (true) + int i = 0; +#if CV_SIMD + int Da = (int)alignSize(D, v_int16::nlanes); + v_int16 P1_reg = vx_setall_s16(cv::saturate_cast(P1)); + + v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast(rightMinCost+P2)); + v_int16 rightMinCost_new_reg = vx_setall_s16(SHRT_MAX); + v_int16 src0_rightBuf = vx_setall_s16(SHRT_MAX); + v_int16 src1_rightBuf = vx_load(rightBuf); + + v_int16 src2; + v_int16 src_shifted_left,src_shifted_right; + v_int16 res; + + v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX); + v_int16 min_sum_pos_reg = vx_setall_s16(0); + + for(;i(P1)); + //lookahead load: + src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes); - v_int16x8 rightMinCostP2_reg = v_setall_s16(cv::saturate_cast(rightMinCost+P2)); - v_int16x8 rightMinCost_new_reg = v_setall_s16(SHRT_MAX); - v_int16x8 src0_rightBuf = v_setall_s16(SHRT_MAX); - v_int16x8 src1_rightBuf = v_load(rightBuf); + //get shifted versions of the current block and add P1: + src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf); + src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 ); - v_int16x8 src2; - v_int16x8 src_shifted_left,src_shifted_right; - v_int16x8 res; + // process and save current block: + res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + rightMinCost_new_reg = v_min(rightMinCost_new_reg,res); + v_store_aligned(rightBuf+i, res); - v_int16x8 min_sum_cost_reg = v_setall_s16(SHRT_MAX); - v_int16x8 min_sum_pos_reg = v_setall_s16(0); - v_int16x8 loop_idx(0,1,2,3,4,5,6,7); - v_int16x8 eight_reg = v_setall_s16(8); + // compute and save total cost: + res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i); + v_store_aligned(leftBuf+i, res); - for(int i=0;i (src0_rightBuf,src1_rightBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg; + //update src: + src0_rightBuf = src1_rightBuf; + src1_rightBuf = src2; + } - // process and save current block: - res = v_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); - rightMinCost_new_reg = v_min(rightMinCost_new_reg,res); - v_store_aligned(rightBuf+i, res); + // a bit different processing for the last cycle of the loop: + if(x_nlanes) + { + src2 = vx_setall_s16(SHRT_MAX); + src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf); + src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 ); - // compute and save total cost: - res = res + v_load_aligned(leftBuf+i) + v_load_aligned(topBuf+i); - v_store_aligned(leftBuf+i, res); - - // track disparity value with the minimum cost: - min_sum_cost_reg = v_min(min_sum_cost_reg,res); - min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); - loop_idx = loop_idx+eight_reg; - - //update src: - src0_rightBuf = src1_rightBuf; - src1_rightBuf = src2; - } - - // a bit different processing for the last cycle of the loop: - src2 = v_setall_s16(SHRT_MAX); - src_shifted_left = v_extract<7> (src0_rightBuf,src1_rightBuf) + P1_reg; - src_shifted_right = v_extract<1> (src1_rightBuf,src2 ) + P1_reg; - - res = v_load_aligned(costs+D-8) + (v_min(v_min(src_shifted_left,src_shifted_right),v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); + res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg); rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res)); - v_store_aligned(rightBuf+D-8, res); + v_store_aligned(rightBuf+D-v_int16::nlanes, res); - res = res + v_load_aligned(leftBuf+D-8) + v_load_aligned(topBuf+D-8); - v_store_aligned(leftBuf+D-8, res); + res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes); + v_store_aligned(leftBuf+D-v_int16::nlanes, res); min_sum_cost_reg = v_min(min_sum_cost_reg,res); - min_cost = v_reduce_min(min_sum_cost_reg); - min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (loop_idx - min_sum_pos_reg)); - optimal_disp = min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost); + min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)D-v_int16::nlanes) - min_sum_pos_reg)); + min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp); } else -#endif + { + CostType rightMinCost_new = v_reduce_min(rightMinCost_new_reg); + CostType rightBuf_i_minus_1 = i > 0 ? rightBuf[i] : SHRT_MAX; + min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp); +#else { CostType rightMinCost_new = SHRT_MAX; - int rightMinCost_P2 = rightMinCost + P2; CostType rightBuf_i_minus_1 = SHRT_MAX; - CostType tmp; min_cost = SHRT_MAX; - - for(int i=0;i(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2); @@ -1910,7 +1934,7 @@ inline void accumulateCostsRight(CostType* rightBuf, CostType* topBuf, CostType* leftBuf[i] = cv::saturate_cast((int)leftBuf[i]+rightBuf[i]+topBuf[i]); if(leftBuf[i]((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]); if(leftBuf[D-1](range); + else impl(range); +} +template +void SGBM3WayMainLoop::impl(const Range& range) const { // force separate processing of stripes: if(range.end>range.start+1) @@ -1958,7 +1988,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const PixType* tmpBuf; CostType *horPassCostVolume, *vertPassCostVolume, *vertPassMin, *rightPassBuf, *disp2CostBuf; short* disp2Buf; - getBufferPointers(cur_buffer,width,width1,D,img1->channels(),SH2,P2, + getBufferPointers(cur_buffer,width,width1,Da,img1->channels(),SH2,P2, curCostVolumeLine,hsumBuf,pixDiff,tmpBuf,horPassCostVolume, vertPassCostVolume,vertPassMin,rightPassBuf,disp2CostBuf,disp2Buf); @@ -1975,73 +2005,75 @@ void SGBM3WayMainLoop::operator () (const Range& range) const disp2Buf[x] = (short)INVALID_DISP_SCALED; disp2CostBuf[x] = SHRT_MAX; } - CostType* C = curCostVolumeLine - D; + CostType* C = curCostVolumeLine - Da; CostType prev_min, min_cost; - int d, best_d; + int d; + short best_d; d = best_d = 0; // forward pass prev_min=0; - for (int x=D;x<(1+width1)*D;x+=D) - accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-D,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/D],D,P1,P2); + for (int x=Da;x<(1+width1)*Da;x+=Da) + accumulateCostsLeftTop(horPassCostVolume+x,horPassCostVolume+x-Da,vertPassCostVolume+x,C+x,prev_min,vertPassMin[x/Da],D,P1,P2); //backward pass - memset(rightPassBuf,0,D*sizeof(CostType)); + memset(rightPassBuf,0,Da*sizeof(CostType)); prev_min=0; - for (int x=width1*D;x>=D;x-=D) + for (int x=width1*Da;x>=Da;x-=Da) { - accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); + accumulateCostsRight(rightPassBuf,vertPassCostVolume+x,horPassCostVolume+x,C+x,prev_min,D,P1,P2,best_d,min_cost); if(uniquenessRatio>0) { -#if CV_SIMD128 - if (true) + d = 0; +#if CV_SIMD + horPassCostVolume+=x; + int thresh = (100*min_cost)/(100-uniquenessRatio); + v_int16 thresh_reg = vx_setall_s16((short)(thresh+1)); + v_int16 d1 = vx_setall_s16((short)(best_d-1)); + v_int16 d2 = vx_setall_s16((short)(best_d+1)); + v_int16 eight_reg = vx_setall_s16(v_int16::nlanes); + v_int16 cur_d = vx_load(idx_row); + v_int16 mask; + + for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes ) { - horPassCostVolume+=x; - int thresh = (100*min_cost)/(100-uniquenessRatio); - v_int16x8 thresh_reg = v_setall_s16((short)(thresh+1)); - v_int16x8 d1 = v_setall_s16((short)(best_d-1)); - v_int16x8 d2 = v_setall_s16((short)(best_d+1)); - v_int16x8 eight_reg = v_setall_s16(8); - v_int16x8 cur_d(0,1,2,3,4,5,6,7); - v_int16x8 mask,cost1,cost2; - - for( d = 0; d < D; d+=16 ) - { - cost1 = v_load_aligned(horPassCostVolume+d); - cost2 = v_load_aligned(horPassCostVolume+d+8); - - mask = cost1 < thresh_reg; - mask = mask & ( (cur_dd2) ); - if( v_check_any(mask) ) - break; - - cur_d = cur_d+eight_reg; - - mask = cost2 < thresh_reg; - mask = mask & ( (cur_dd2) ); - if( v_check_any(mask) ) - break; - - cur_d = cur_d+eight_reg; - } - horPassCostVolume-=x; + mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_dd2) ); + cur_d = cur_d+eight_reg; + if( v_check_any(mask) ) + break; + mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_dd2) ); + cur_d = cur_d+eight_reg; + if( v_check_any(mask) ) + break; } - else -#endif + if( d <= D - 2*v_int16::nlanes ) { - for( d = 0; d < D; d++ ) + horPassCostVolume-=x; + continue; + } + if( d <= D - v_int16::nlanes ) + { + if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) ) { - if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 ) - break; + horPassCostVolume-=x; + continue; } + d+=v_int16::nlanes; + } + horPassCostVolume-=x; +#endif + for( ; d < D; d++ ) + { + if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 ) + break; } if( d < D ) continue; } d = best_d; - int _x2 = x/D - 1 + minX1 - d - minD; + int _x2 = x/Da - 1 + minX1 - d - minD; if( _x2>=0 && _x2 min_cost ) { disp2CostBuf[_x2] = min_cost; @@ -2059,7 +2091,7 @@ void SGBM3WayMainLoop::operator () (const Range& range) const else d *= DISP_SCALE; - disp_row[(x/D)-1 + minX1] = (DispType)(d + minD*DISP_SCALE); + disp_row[(x/Da)-1 + minX1] = (DispType)(d + minD*DISP_SCALE); } for(int x = minX1; x < maxX1; x++ )