From 4001e310f5c20b6d336118a49177660aeaa6267f Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 23 Apr 2018 18:06:06 +0300 Subject: [PATCH] improved performance of v_load_deinterleave(8uC3) & v_store_interleave(8uC3) intrinsics when using SSSE3 instructions. --- .../include/opencv2/core/hal/intrin_sse.hpp | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index c91b05de93..91d37ee00d 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1607,6 +1607,28 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) { +#if CV_SSSE3 + static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); + static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); + static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); + + __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); + __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); + __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); + + __m128i s0 = _mm_shuffle_epi8(t0, m0); + __m128i s1 = _mm_shuffle_epi8(t1, m1); + __m128i s2 = _mm_shuffle_epi8(t2, m2); + + t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5); + a.val = _mm_alignr_epi8(s2, t0, 5); + + t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6); + b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5); + + t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11); + c.val = _mm_alignr_epi8(t2, s0, 11); +#else __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32)); @@ -1626,6 +1648,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); +#endif } inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) @@ -1840,6 +1863,27 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c ) { +#if CV_SSSE3 + static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); + static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); + static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); + + __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5); + t0 = _mm_alignr_epi8(c.val, t0, 5); + __m128i s0 = _mm_shuffle_epi8(t0, m0); + + __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6); + t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5); + __m128i s1 = _mm_shuffle_epi8(t1, m1); + + __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11); + t2 = _mm_alignr_epi8(t2, a.val, 11); + __m128i s2 = _mm_shuffle_epi8(t2, m2); + + _mm_storeu_si128((__m128i*)ptr, s0); + _mm_storeu_si128((__m128i*)(ptr + 16), s1); + _mm_storeu_si128((__m128i*)(ptr + 32), s2); +#else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val); @@ -1881,6 +1925,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 _mm_storeu_si128((__m128i*)(ptr), v0); _mm_storeu_si128((__m128i*)(ptr + 16), v1); _mm_storeu_si128((__m128i*)(ptr + 32), v2); +#endif } inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,