diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 7b56439117..6ad0db2671 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2191,236 +2191,527 @@ icvBayer2BGR_VNG_8u_C1C3R( const uchar* bayer, int bstep, { int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2; bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG; - bool greenCell0_buf = !greenCell0; // for too small images use the simple interpolation algorithm - if( MIN(size.width, size.height) < 5 ) + if( MIN(size.width, size.height) < 8 ) return icvBayer2BGR_8u_C1C3R( bayer, bstep, dst, dststep, size, code ); const int brows = 3, bcn = 7; - int i, bufstep = size.width*bcn; + int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7; + int i, bufstep = N7*bcn; cv::AutoBuffer _buf(bufstep*brows); ushort* buf = (ushort*)_buf; bayer += bstep*2; - for( i = 0; i < size.width*3; i++ ) - { - dst[i] = dst[i + dststep] = - dst[i + dststep*(size.height-2)] = - dst[i + dststep*(size.height-1)] = 0; - } +#if CV_SSE2 + bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2); + #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)) +#endif - for( int y = 2; y < size.height - 2; y++ ) + for( int y = 2; y < size.height - 4; y++ ) { - uchar* dstrow = dst + dststep*y; + uchar* dstrow = dst + dststep*y + 6; const uchar* srow; for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ ) { - ushort* brow = buf + ((y + dy - 1)%brows)*bufstep; - srow = bayer + dy*bstep; + ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1; + srow = bayer + (y+dy)*bstep + 1; for( i = 0; i < bcn; i++ ) - brow[i] = brow[i + (size.width-1)*bcn] = 0; + brow[N*i-1] = brow[(N-2) + N*i] = 0; - bool greenCell = greenCell0_buf; - for( i = 1; i < size.width-1; i++ ) + i = 1; + + #if CV_SSE2 + if( haveSSE ) { - brow += bcn; - brow[0] = (ushort)( abs(srow[i-1-bstep] - srow[i-1+bstep]) + - abs(srow[i-bstep] - srow[i+bstep])*2 + - abs(srow[i+1-bstep] - srow[i+1+bstep])); - brow[1] = (ushort)( abs(srow[i-1-bstep] - srow[i+1-bstep]) + - abs(srow[i-1] - srow[i+1])*2 + - abs(srow[i-1+bstep] - srow[i+1+bstep])); - brow[2] = (ushort)(abs(srow[i+1-bstep] - srow[i-1+bstep])*2); - brow[3] = (ushort)(abs(srow[i-1-bstep] - srow[i+1+bstep])*2); - if(!greenCell) + __m128i z = _mm_setzero_si128(); + for( ; i <= N-9; i += 8, srow += 8, brow += 8 ) { - brow[4] = (ushort)(brow[2] + abs(srow[i-bstep] - srow[i-1]) + abs(srow[i+bstep] - srow[i+1])); - brow[5] = (ushort)(brow[3] + abs(srow[i-bstep] - srow[i+1]) + abs(srow[i+bstep] - srow[i-1])); - brow[6] = (ushort)((srow[i-bstep] + srow[i-1] + srow[i+1] + srow[i+bstep] + 2)>>2); + __m128i s1, s2, s3, s4, s6, s7, s8, s9; + + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z); + s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z); + s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z); + + s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z); + s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z); + + s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z); + s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z); + s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z); + + __m128i b0, b1, b2, b3, b4, b5, b6; + + b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1), + _mm_adds_epu16(_mm_absdiff_epu16(s1, s7), + _mm_absdiff_epu16(s3, s9))); + b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1), + _mm_adds_epu16(_mm_absdiff_epu16(s1, s3), + _mm_absdiff_epu16(s7, s9))); + b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1); + b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1); + + _mm_storeu_si128((__m128i*)brow, b0); + _mm_storeu_si128((__m128i*)(brow + N), b1); + _mm_storeu_si128((__m128i*)(brow + N2), b2); + _mm_storeu_si128((__m128i*)(brow + N3), b3); + + b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4), + _mm_absdiff_epu16(s6, s8))); + b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6), + _mm_absdiff_epu16(s4, s8))); + b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8)); + b6 = _mm_srli_epi16(b6, 1); + + _mm_storeu_si128((__m128i*)(brow + N4), b4); + _mm_storeu_si128((__m128i*)(brow + N5), b5); + _mm_storeu_si128((__m128i*)(brow + N6), b6); } - greenCell = !greenCell; } - greenCell0_buf = !greenCell0_buf; + #endif + + for( ; i < N-1; i++, srow++, brow++ ) + { + brow[0] = (ushort)(abs(srow[-1-bstep] - srow[-1+bstep]) + + abs(srow[-bstep] - srow[+bstep])*2 + + abs(srow[1-bstep] - srow[1+bstep])); + brow[N] = (ushort)(abs(srow[-1-bstep] - srow[1-bstep]) + + abs(srow[-1] - srow[1])*2 + + abs(srow[-1+bstep] - srow[1+bstep])); + brow[N2] = (ushort)(abs(srow[+1-bstep] - srow[-1+bstep])*2); + brow[N3] = (ushort)(abs(srow[-1-bstep] - srow[1+bstep])*2); + brow[N4] = (ushort)(brow[N2] + abs(srow[-bstep] - srow[-1]) + + abs(srow[+bstep] - srow[1])); + brow[N5] = (ushort)(brow[N3] + abs(srow[-bstep] - srow[1]) + + abs(srow[+bstep] - srow[-1])); + brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1); + } } - const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + bcn; - const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + bcn; - const ushort* brow2 = buf + (y % brows)*bufstep + bcn; - static const float scale[] = { 0.f, 1.f, 0.5f, 0.3333333333f, 0.25f, 0.2f, 0.1666666667f, 0.1428571f, 0.125f }; - srow = bayer + y*bstep; + const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2; + const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2; + const ushort* brow2 = buf + (y % brows)*bufstep + 2; + static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f }; + srow = bayer + y*bstep + 2; bool greenCell = greenCell0; - for( i = 0; i < 6; i++ ) - dstrow[i] = dstrow[i + (size.width-2)*3] = 0; - dstrow += 6; + i = 2; + #if CV_SSE2 + int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2; + #else + int limit = N - 2; + #endif - for( i = 2; i < size.width-2; i++, brow0 += bcn, brow1 += bcn, brow2 += bcn, dstrow += 3 ) + do { - int gradN = (brow0[0] + brow1[0])>>1; - int gradS = (brow1[0] + brow2[0])>>1; - int gradW = (brow1[-bcn+1] + brow1[1])>>1; - int gradE = (brow1[1] + brow1[bcn+1])>>1; - int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE); - int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE); - int R, G, B; + for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 ) + { + int gradN = brow0[0] + brow1[0]; + int gradS = brow1[0] + brow2[0]; + int gradW = brow1[N-1] + brow1[N]; + int gradE = brow1[N] + brow1[N+1]; + int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE); + int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE); + int R, G, B; + + if( !greenCell ) + { + int gradNE = brow0[N4+1] + brow1[N4]; + int gradSW = brow1[N4] + brow2[N4-1]; + int gradNW = brow0[N5-1] + brow1[N5]; + int gradSE = brow1[N5] + brow2[N5+1]; + + minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE); + maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE); + int T = minGrad + maxGrad/2; + + int Rs = 0, Gs = 0, Bs = 0, ng = 0; + if( gradN < T ) + { + Rs += srow[-bstep*2] + srow[0]; + Gs += srow[-bstep]*2; + Bs += srow[-bstep-1] + srow[-bstep+1]; + ng++; + } + if( gradS < T ) + { + Rs += srow[bstep*2] + srow[0]; + Gs += srow[bstep]*2; + Bs += srow[bstep-1] + srow[bstep+1]; + ng++; + } + if( gradW < T ) + { + Rs += srow[-2] + srow[0]; + Gs += srow[-1]*2; + Bs += srow[-bstep-1] + srow[bstep-1]; + ng++; + } + if( gradE < T ) + { + Rs += srow[2] + srow[0]; + Gs += srow[1]*2; + Bs += srow[-bstep+1] + srow[bstep+1]; + ng++; + } + if( gradNE < T ) + { + Rs += srow[-bstep*2+2] + srow[0]; + Gs += brow0[N6+1]; + Bs += srow[-bstep+1]*2; + ng++; + } + if( gradSW < T ) + { + Rs += srow[bstep*2-2] + srow[0]; + Gs += brow2[N6-1]; + Bs += srow[bstep-1]*2; + ng++; + } + if( gradNW < T ) + { + Rs += srow[-bstep*2-2] + srow[0]; + Gs += brow0[N6-1]; + Bs += srow[-bstep+1]*2; + ng++; + } + if( gradSE < T ) + { + Rs += srow[bstep*2+2] + srow[0]; + Gs += brow2[N6+1]; + Bs += srow[-bstep+1]*2; + ng++; + } + R = srow[0]; + G = R + cvRound((Gs - Rs)*scale[ng]); + B = R + cvRound((Bs - Rs)*scale[ng]); + } + else + { + int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1]; + int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1]; + int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1]; + int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1]; + + minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE); + maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE); + int T = minGrad + maxGrad/2; + + int Rs = 0, Gs = 0, Bs = 0, ng = 0; + if( gradN < T ) + { + Rs += srow[-bstep*2-1] + srow[-bstep*2+1]; + Gs += srow[-bstep*2] + srow[0]; + Bs += srow[-bstep]*2; + ng++; + } + if( gradS < T ) + { + Rs += srow[bstep*2-1] + srow[bstep*2+1]; + Gs += srow[bstep*2] + srow[0]; + Bs += srow[bstep]*2; + ng++; + } + if( gradW < T ) + { + Rs += srow[-1]*2; + Gs += srow[-2] + srow[0]; + Bs += srow[-bstep-2]+srow[bstep-2]; + ng++; + } + if( gradE < T ) + { + Rs += srow[1]*2; + Gs += srow[2] + srow[0]; + Bs += srow[-bstep+2]+srow[bstep+2]; + ng++; + } + if( gradNE < T ) + { + Rs += srow[-bstep*2+1] + srow[1]; + Gs += srow[-bstep+1]*2; + Bs += srow[-bstep] + srow[-bstep+2]; + ng++; + } + if( gradSW < T ) + { + Rs += srow[bstep*2-1] + srow[-1]; + Gs += srow[bstep-1]*2; + Bs += srow[bstep] + srow[bstep-2]; + ng++; + } + if( gradNW < T ) + { + Rs += srow[-bstep*2-1] + srow[-1]; + Gs += srow[-bstep-1]*2; + Bs += srow[-bstep-2]+srow[-bstep]; + ng++; + } + if( gradSE < T ) + { + Rs += srow[bstep*2+1] + srow[1]; + Gs += srow[bstep+1]*2; + Bs += srow[bstep+2]+srow[bstep]; + ng++; + } + G = srow[0]; + R = G + cvRound((Rs - Gs)*scale[ng]); + B = G + cvRound((Bs - Gs)*scale[ng]); + } + dstrow[blueIdx] = CV_CAST_8U(B); + dstrow[1] = CV_CAST_8U(G); + dstrow[blueIdx^2] = CV_CAST_8U(R); + greenCell = !greenCell; + } - if( !greenCell ) + #if CV_SSE2 + if( !haveSSE ) + break; + + __m128i emask = _mm_set1_epi32(0x0000ffff), + omask = _mm_set1_epi32(0xffff0000), + z = _mm_setzero_si128(); + __m128 _0_5 = _mm_set1_ps(0.5f); + + #define _mm_merge_epi16(a, b) \ + _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) + #define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16)) + #define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16)) + + // process 8 pixels at once + for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 ) { - int gradNE = (brow0[bcn+4] + brow1[4])>>1; - int gradSW = (brow1[4] + brow2[-bcn+4])>>1; - int gradNW = (brow0[-bcn+5] + brow1[5])>>1; - int gradSE = (brow1[5] + brow2[bcn+5])>>1; + __m128i gradN, gradS, gradW, gradE, gradNE, gradSW, gradNW, gradSE; + gradN = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow0), + _mm_loadu_si128((__m128i*)brow1)); + gradS = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow1), + _mm_loadu_si128((__m128i*)brow2)); + gradW = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N-1)), + _mm_loadu_si128((__m128i*)(brow1+N))); + gradE = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N+1)), + _mm_loadu_si128((__m128i*)(brow1+N))); - minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE); - maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE); - int T = minGrad + maxGrad/2; + __m128i minGrad, maxGrad, T; + minGrad = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(gradN, gradS), gradW), gradE); + maxGrad = _mm_max_epi16(_mm_max_epi16(_mm_max_epi16(gradN, gradS), gradW), gradE); - int Rs = 0, Gs = 0, Bs = 0, ng = 0; - if( gradN < T ) + __m128i grad0, grad1; + + grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), + _mm_loadu_si128((__m128i*)(brow1+N4))); + grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N2)), + _mm_loadu_si128((__m128i*)(brow0+N2+1))), + _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)), + _mm_loadu_si128((__m128i*)(brow1+N2+1)))); + gradNE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1); + + grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), + _mm_loadu_si128((__m128i*)(brow1+N4))); + grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N2)), + _mm_loadu_si128((__m128i*)(brow2+N2-1))), + _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)), + _mm_loadu_si128((__m128i*)(brow1+N2-1)))); + gradSW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1); + + minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW); + maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW); + + grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), + _mm_loadu_si128((__m128i*)(brow1+N5))); + grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N3)), + _mm_loadu_si128((__m128i*)(brow0+N3-1))), + _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)), + _mm_loadu_si128((__m128i*)(brow1+N3-1)))); + gradNW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1); + + grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), + _mm_loadu_si128((__m128i*)(brow1+N5))); + grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N3)), + _mm_loadu_si128((__m128i*)(brow2+N3+1))), + _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)), + _mm_loadu_si128((__m128i*)(brow1+N3+1)))); + gradSE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1); + + minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE); + maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE); + + T = _mm_add_epi16(_mm_srli_epi16(maxGrad, 1), minGrad); + __m128i RGs = z, GRs = z, Bs = z, ng = z, mask; + + __m128i t0, t1, x0, x1, x2, x3, x4, x5, x6, x7, x8, + x9, x10, x11, x12, x13, x14, x15, x16; + + x0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)srow), z); + + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-1)), z); + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-1)), z); + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)), z); + x4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+1)), z); + x5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+1)), z); + x6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+2)), z); + x7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)), z); + x8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+2)), z); + x9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+1)), z); + x10 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+1)), z); + x11 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)), z); + x12 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-1)), z); + x13 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-1)), z); + x14 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-2)), z); + x15 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)), z); + x16 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-2)), z); + + // gradN + mask = _mm_cmpgt_epi16(T, gradN); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x3, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x2,x4)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x5), t0), mask)); + + // gradNE + mask = _mm_cmpgt_epi16(T, gradNE); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x5, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), + _mm_adds_epu16(x4,x7)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x6)), mask)); + + // gradE + mask = _mm_cmpgt_epi16(T, gradE); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x7, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x5,x9), + _mm_adds_epu16(x6,x8)), mask)); + + // gradSE + mask = _mm_cmpgt_epi16(T, gradSE); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x9, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), + _mm_adds_epu16(x7,x10)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x8,x11)), mask)); + + // gradS + mask = _mm_cmpgt_epi16(T, gradS); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x11, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x10,x12)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x9,x13), t0), mask)); + + // gradSW + mask = _mm_cmpgt_epi16(T, gradSW); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x13, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), + _mm_adds_epu16(x12,x15)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x11,x14)), mask)); + + // gradW + mask = _mm_cmpgt_epi16(T, gradW); + ng = _mm_sub_epi16(ng, mask); + + t0 = _mm_slli_epi16(x15, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x13), + _mm_adds_epu16(x14,x16)), mask)); + + // gradNW + mask = _mm_cmpgt_epi16(T, gradNW); + ng = _mm_sub_epi16(ng, mask); + + __m128 ngf0, ngf1; + ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng)); + ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng)); + + t0 = _mm_slli_epi16(x1, 1); + t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-2)), z), x0); + + RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask)); + GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), + _mm_adds_epu16(x2,x15)), mask)); + Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x16)), mask)); + + // now interpolate r, g & b + t0 = _mm_sub_epi16(GRs, RGs); + t1 = _mm_sub_epi16(Bs, RGs); + + t0 = _mm_add_epi16(x0, _mm_packs_epi32( + _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)), + _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1)))); + + t1 = _mm_add_epi16(x0, _mm_packs_epi32( + _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)), + _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1)))); + + x1 = _mm_merge_epi16(x0, t0); + x2 = _mm_merge_epi16(t0, x0); + + uchar R[8], G[8], B[8]; + + _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z)); + _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z)); + _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z)); + + for( int j = 0; j < 8; j++, dstrow += 3 ) { - Rs += (srow[i-bstep*2] + srow[i])>>1; - Gs += srow[i-bstep]; - Bs += (srow[i-bstep-1] + srow[i-bstep+1])>>1; - ng++; + dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j]; } - if( gradS < T ) - { - Rs += (srow[i+bstep*2] + srow[i])>>1; - Gs += srow[i+bstep]; - Bs += (srow[i+bstep-1] + srow[i+bstep+1])>>1; - ng++; - } - if( gradW < T ) - { - Rs += (srow[i-2] + srow[i])>>1; - Gs += srow[i-1]; - Bs += (srow[i-bstep-1] + srow[i+bstep-1])>>1; - ng++; - } - if( gradE < T ) - { - Rs += (srow[i+2] + srow[i])>>1; - Gs += srow[i+1]; - Bs += (srow[i-bstep+1] + srow[i+bstep+1])>>1; - ng++; - } - if( gradNE < T ) - { - Rs += (srow[i-bstep*2+2] + srow[i])>>1; - Gs += brow0[bcn+6]; - Bs += srow[i-bstep+1]; - ng++; - } - if( gradSW < T ) - { - Rs += (srow[i+bstep*2-2] + srow[i])>>1; - Gs += brow2[-bcn+6]; - Bs += srow[i+bstep-1]; - ng++; - } - if( gradNW < T ) - { - Rs += (srow[i-bstep*2+2] + srow[i])>>1; - Gs += brow0[bcn+6]; - Bs += srow[i-bstep+1]; - ng++; - } - if( gradSE < T ) - { - Rs += (srow[i-bstep*2+2] + srow[i])>>1; - Gs += brow0[bcn+6]; - Bs += srow[i-bstep+1]; - ng++; - } - R = srow[i]; - G = R + cvRound((Gs - Rs)*scale[ng]); - B = R + cvRound((Bs - Rs)*scale[ng]); } - else - { - int gradNE = (brow0[2] + brow0[bcn+2] + brow1[2] + brow1[bcn+2])>>1; - int gradSW = (brow1[2] + brow1[-bcn+2] + brow2[2] + brow2[-bcn+2])>>1; - int gradNW = (brow0[3] + brow0[-bcn+3] + brow1[3] + brow1[-bcn+3])>>1; - int gradSE = (brow1[3] + brow1[bcn+3] + brow2[3] + brow2[bcn+3])>>1; - - minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE); - maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE); - int T = minGrad + maxGrad/2; - - int Rs = 0, Gs = 0, Bs = 0, ng = 0; - if( gradN < T ) - { - Rs += (srow[i-bstep*2-1] + srow[i-bstep*2+1])>>1; - Gs += (srow[i-bstep*2] + srow[i])>>1; - Bs += srow[i-bstep]; - ng++; - } - if( gradS < T ) - { - Rs += (srow[i+bstep*2-1] + srow[i+bstep*2+1])>>1; - Gs += (srow[i+bstep*2] + srow[i])>>1; - Bs += srow[i+bstep]; - ng++; - } - if( gradW < T ) - { - Rs += srow[i-1]; - Gs += (srow[i-2] + srow[i])>>1; - Bs += (srow[i-bstep-2]+srow[i+bstep-2])>>1; - ng++; - } - if( gradE < T ) - { - Rs += srow[i+1]; - Gs += (srow[i+2] + srow[i])>>1; - Bs += (srow[i-bstep+2]+srow[i+bstep+2])>>1; - ng++; - } - if( gradNE < T ) - { - Rs += (srow[i-bstep*2+1] + srow[i+1])>>1; - Gs += srow[i-bstep+1]; - Bs += (srow[i-bstep] + srow[i-bstep+2])>>1; - ng++; - } - if( gradSW < T ) - { - Rs += (srow[i+bstep*2-1] + srow[i-1])>>1; - Gs += srow[i+bstep-1]; - Bs += (srow[i+bstep] + srow[i+bstep-2])>>1; - ng++; - } - if( gradNW < T ) - { - Rs += (srow[i-bstep*2-1] + srow[i-1])>>1; - Gs += srow[i-bstep-1]; - Bs += (srow[i-bstep-2]+srow[i-bstep])>>1; - ng++; - } - if( gradSE < T ) - { - Rs += (srow[i+bstep*2+1] + srow[i+1])>>1; - Gs += srow[i+bstep+1]; - Bs += (srow[i+bstep+2]+srow[i+bstep])>>1; - ng++; - } - G = srow[i]; - R = G + cvRound((Rs - Gs)*scale[ng]); - B = G + cvRound((Bs - Gs)*scale[ng]); - } - dstrow[blueIdx] = CV_CAST_8U(B); - dstrow[1] = CV_CAST_8U(G); - dstrow[blueIdx^2] = CV_CAST_8U(R); - greenCell = !greenCell; + #endif + + limit = N - 2; } + while( i < N - 2 ); + + for( i = 0; i < 6; i++ ) + { + dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i]; + dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i]; + } + greenCell0 = !greenCell0; blueIdx ^= 2; } + for( i = 0; i < size.width*3; i++ ) + { + dst[i] = dst[i + dststep] = dst[i + dststep*2]; + dst[i + dststep*(size.height-4)] = + dst[i + dststep*(size.height-3)] = + dst[i + dststep*(size.height-2)] = + dst[i + dststep*(size.height-1)] = dst[i + dststep*(size.height-5)]; + } + return CV_OK; }