From 992795d47d925b5bb846279960b6220ddbecfa01 Mon Sep 17 00:00:00 2001
From: k-shinotsuka
Date: Sat, 22 Oct 2016 19:56:55 +0900
Subject: [PATCH] add SSE code for RGB2Luv_f.

---
Review notes (text between "---" and the diffstat is not part of the commit):

 * splineInterpolate(__m128&, ...) is a 4-lane variant of the cubic-spline
   table lookup: it overwrites v_x with the interpolated values. Each lane
   reads four consecutive coefficients starting at tab + 4*ix.
 * The lane index is computed with _mm_cvttps_epi32 (truncation) instead of
   _mm_cvtps_epi32 (rounds per MXCSR, nearest-even by default). With rounding,
   an input whose fractional part is >= 0.5 picked the next segment and the
   polynomial was evaluated at a negative offset.
 * un / vn are stored pre-multiplied by 13 in the constructor so the scalar
   and SSE paths share the same constants (replaces the per-call _un / _vn).
 * RGB2Luv_f::process() takes de-interleaved R, G, B (two registers each,
   8 pixels) and returns L in v_r*, u in v_g*, v in v_b*.
 * The blob id on the "index" line below was not recomputed for this revision.

 modules/imgproc/src/color.cpp | 172 ++++++++++++++++++++++++++++++++--
 1 file changed, 165 insertions(+), 7 deletions(-)

diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 9da7dba6c4..77af633af3 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -141,6 +141,39 @@ template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab
     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
 }
 
+#if CV_SSE2
+template<typename _Tp> static inline void splineInterpolate(__m128& v_x, const _Tp* tab, int n)
+{
+    __m128i v_ix = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(v_x, _mm_setzero_ps()), _mm_set1_ps(float(n - 1)))); // clamp to [0, n-1], then truncate (== floor here)
+    v_x = _mm_sub_ps(v_x, _mm_cvtepi32_ps(v_ix)); // offset of the input from its segment start
+    v_ix = _mm_slli_epi32(v_ix, 2); // 4 coefficients per segment
+
+    int CV_DECL_ALIGNED(16) ix[4];
+    _mm_store_si128((__m128i *)ix, v_ix);
+
+    __m128 v_tab0 = _mm_loadu_ps(tab + ix[0]); // coefficients 0..3 for lane 0
+    __m128 v_tab1 = _mm_loadu_ps(tab + ix[1]); // ... lane 1
+    __m128 v_tab2 = _mm_loadu_ps(tab + ix[2]); // ... lane 2
+    __m128 v_tab3 = _mm_loadu_ps(tab + ix[3]); // ... lane 3
+
+    __m128 v_tmp0 = _mm_unpacklo_ps(v_tab0, v_tab1); // 4x4 transpose, step 1
+    __m128 v_tmp1 = _mm_unpacklo_ps(v_tab2, v_tab3);
+    __m128 v_tmp2 = _mm_unpackhi_ps(v_tab0, v_tab1);
+    __m128 v_tmp3 = _mm_unpackhi_ps(v_tab2, v_tab3);
+
+    v_tab0 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0x44); // step 2: v_tabK now holds coefficient K of all four lanes
+    v_tab2 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0x44);
+    v_tab1 = _mm_shuffle_ps(v_tmp0, v_tmp1, 0xee);
+    v_tab3 = _mm_shuffle_ps(v_tmp2, v_tmp3, 0xee);
+
+    __m128 v_l = _mm_mul_ps(v_x, v_tab3); // Horner: ((c3*x + c2)*x + c1)*x + c0
+    v_l = _mm_add_ps(v_l, v_tab2);
+    v_l = _mm_mul_ps(v_l, v_x);
+    v_l = _mm_add_ps(v_l, v_tab1);
+    v_l = _mm_mul_ps(v_l, v_x);
+    v_x = _mm_add_ps(v_l, v_tab0);
+}
+#endif
 
 template<typename _Tp> struct ColorChannel
 {
@@ -5766,24 +5799,146 @@ struct RGB2Luv_f
         }
 
         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
-        un = 4*whitept[0]*d;
-        vn = 9*whitept[1]*d;
+        un = 4*whitept[0]*d*13; // stored pre-multiplied by 13
+        vn = 9*whitept[1]*d*13; // stored pre-multiplied by 13
+
+        #if CV_SSE2
+        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #endif
 
         CV_Assert(whitept[1] == 1.f);
     }
 
+    #if CV_SSE2
+    void process(__m128& v_r0, __m128& v_r1, __m128& v_g0,
+                 __m128& v_g1, __m128& v_b0, __m128& v_b1) const
+    {
+        __m128 v_x0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[0])); // X, Y, Z = coeffs (3x3) * (R, G, B)
+        __m128 v_x1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[0]));
+        __m128 v_y0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[3]));
+        __m128 v_y1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[3]));
+        __m128 v_z0 = _mm_mul_ps(v_r0, _mm_set1_ps(coeffs[6]));
+        __m128 v_z1 = _mm_mul_ps(v_r1, _mm_set1_ps(coeffs[6]));
+
+        v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[1])));
+        v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[1])));
+        v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[4])));
+        v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[4])));
+        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_g0, _mm_set1_ps(coeffs[7])));
+        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_g1, _mm_set1_ps(coeffs[7])));
+
+        v_x0 = _mm_add_ps(v_x0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[2])));
+        v_x1 = _mm_add_ps(v_x1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[2])));
+        v_y0 = _mm_add_ps(v_y0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[5])));
+        v_y1 = _mm_add_ps(v_y1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[5])));
+        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_b0, _mm_set1_ps(coeffs[8])));
+        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_b1, _mm_set1_ps(coeffs[8])));
+
+        __m128 v_l0 = _mm_mul_ps(v_y0, _mm_set1_ps(LabCbrtTabScale));
+        __m128 v_l1 = _mm_mul_ps(v_y1, _mm_set1_ps(LabCbrtTabScale));
+        splineInterpolate(v_l0, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+        splineInterpolate(v_l1, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+
+        v_l0 = _mm_mul_ps(v_l0, _mm_set1_ps(116.0f));
+        v_l1 = _mm_mul_ps(v_l1, _mm_set1_ps(116.0f));
+        v_r0 = _mm_sub_ps(v_l0, _mm_set1_ps(16.0f)); // L = 116*lookup - 16, returned in v_r*
+        v_r1 = _mm_sub_ps(v_l1, _mm_set1_ps(16.0f));
+
+        v_z0 = _mm_mul_ps(v_z0, _mm_set1_ps(3.0f)); // denominator X + 15*Y + 3*Z, built in v_z*
+        v_z1 = _mm_mul_ps(v_z1, _mm_set1_ps(3.0f));
+        v_z0 = _mm_add_ps(v_z0, v_x0);
+        v_z1 = _mm_add_ps(v_z1, v_x1);
+        v_z0 = _mm_add_ps(v_z0, _mm_mul_ps(v_y0, _mm_set1_ps(15.0f)));
+        v_z1 = _mm_add_ps(v_z1, _mm_mul_ps(v_y1, _mm_set1_ps(15.0f)));
+        v_z0 = _mm_max_ps(v_z0, _mm_set1_ps(FLT_EPSILON)); // avoid division by zero
+        v_z1 = _mm_max_ps(v_z1, _mm_set1_ps(FLT_EPSILON));
+        __m128 v_d0 = _mm_div_ps(_mm_set1_ps(52.0f), v_z0); // 52 == 4*13, as in the scalar path
+        __m128 v_d1 = _mm_div_ps(_mm_set1_ps(52.0f), v_z1);
+
+        v_x0 = _mm_mul_ps(v_x0, v_d0);
+        v_x1 = _mm_mul_ps(v_x1, v_d1);
+        v_x0 = _mm_sub_ps(v_x0, _mm_set1_ps(un));
+        v_x1 = _mm_sub_ps(v_x1, _mm_set1_ps(un));
+        v_g0 = _mm_mul_ps(v_x0, v_r0); // u = L*(X*d - un), returned in v_g*
+        v_g1 = _mm_mul_ps(v_x1, v_r1);
+
+        v_y0 = _mm_mul_ps(v_y0, v_d0);
+        v_y1 = _mm_mul_ps(v_y1, v_d1);
+        v_y0 = _mm_mul_ps(v_y0, _mm_set1_ps(2.25f)); // 2.25 == 9*0.25
+        v_y1 = _mm_mul_ps(v_y1, _mm_set1_ps(2.25f));
+        v_y0 = _mm_sub_ps(v_y0, _mm_set1_ps(vn));
+        v_y1 = _mm_sub_ps(v_y1, _mm_set1_ps(vn));
+        v_b0 = _mm_mul_ps(v_y0, v_r0); // v = L*(2.25*Y*d - vn), returned in v_b*
+        v_b1 = _mm_mul_ps(v_y1, v_r1);
+    }
+    #endif
+
     void operator()(const float* src, float* dst, int n) const
     {
-        int i, scn = srccn;
+        int i = 0, scn = srccn;
         float gscale = GammaTabScale;
         const float* gammaTab = srgb ? sRGBGammaTab : 0;
         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-        float _un = 13*un, _vn = 13*vn;
         n *= 3;
 
-        for( i = 0; i < n; i += 3, src += scn )
+        #if CV_SSE2
+        if (haveSIMD)
+        {
+            for( ; i <= n - 24; i += 24, src += scn * 8 ) // 8 pixels (24 output floats) per iteration
+            {
+                __m128 v_r0 = _mm_loadu_ps(src +  0);
+                __m128 v_r1 = _mm_loadu_ps(src +  4);
+                __m128 v_g0 = _mm_loadu_ps(src +  8);
+                __m128 v_g1 = _mm_loadu_ps(src + 12);
+                __m128 v_b0 = _mm_loadu_ps(src + 16);
+                __m128 v_b1 = _mm_loadu_ps(src + 20);
+
+                if (scn == 3)
+                {
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+                }
+                else
+                {
+                    __m128 v_a0 = _mm_loadu_ps(src + 24); // 4th channel: loaded only to de-interleave, not used afterwards
+                    __m128 v_a1 = _mm_loadu_ps(src + 28);
+
+                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
+                }
+
+                if ( gammaTab )
+                {
+                    __m128 v_gscale = _mm_set1_ps(gscale);
+                    v_r0 = _mm_mul_ps(v_r0, v_gscale);
+                    v_r1 = _mm_mul_ps(v_r1, v_gscale);
+                    v_g0 = _mm_mul_ps(v_g0, v_gscale);
+                    v_g1 = _mm_mul_ps(v_g1, v_gscale);
+                    v_b0 = _mm_mul_ps(v_b0, v_gscale);
+                    v_b1 = _mm_mul_ps(v_b1, v_gscale);
+
+                    splineInterpolate(v_r0, gammaTab, GAMMA_TAB_SIZE);
+                    splineInterpolate(v_r1, gammaTab, GAMMA_TAB_SIZE);
+                    splineInterpolate(v_g0, gammaTab, GAMMA_TAB_SIZE);
+                    splineInterpolate(v_g1, gammaTab, GAMMA_TAB_SIZE);
+                    splineInterpolate(v_b0, gammaTab, GAMMA_TAB_SIZE);
+                    splineInterpolate(v_b1, gammaTab, GAMMA_TAB_SIZE);
+                }
+
+                process(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
+
+                _mm_storeu_ps(dst + i +  0, v_r0);
+                _mm_storeu_ps(dst + i +  4, v_r1);
+                _mm_storeu_ps(dst + i +  8, v_g0);
+                _mm_storeu_ps(dst + i + 12, v_g1);
+                _mm_storeu_ps(dst + i + 16, v_b0);
+                _mm_storeu_ps(dst + i + 20, v_b1);
+            }
+        }
+        #endif
+        for( ; i < n; i += 3, src += scn ) // scalar path: everything the SIMD loop did not cover
         {
             float R = src[0], G = src[1], B = src[2];
             if( gammaTab )
@@ -5801,8 +5956,8 @@ struct RGB2Luv_f
             L = 116.f*L - 16.f;
 
             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
-            float u = L*(X*d - _un);
-            float v = L*((9*0.25f)*Y*d - _vn);
+            float u = L*(X*d - un);
+            float v = L*((9*0.25f)*Y*d - vn);
 
             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
         }
@@ -5811,6 +5966,9 @@ struct RGB2Luv_f
     int srccn;
     float coeffs[9], un, vn;
     bool srgb;
+    #if CV_SSE2
+    bool haveSIMD;
+    #endif
 };