From 37a5af367375794e4c9980ef9bbc235d7aca391a Mon Sep 17 00:00:00 2001
From: winice <31055295+winice-test@users.noreply.github.com>
Date: Mon, 4 Feb 2019 23:59:00 +0800
Subject: [PATCH] Merge pull request #13737 from winice-test:master

* Optical Flow rework to use wide universal intrinsics

* remove if (useSIMD) check as review requested
---
 modules/video/src/optflowgf.cpp | 87 ++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 46 deletions(-)

diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp
index e06dbbf379..83ad47fc0e 100644
--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_video.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 #if defined __APPLE__ || defined __ANDROID__
 #define SMALL_LOCALSIZE
@@ -433,13 +434,11 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
     for( i = 0; i <= m; i++ )
         kernel[i] = (float)(kernel[i]*s);
 
-#if CV_SSE2
+#if CV_SIMD128
     float* simd_kernel = alignPtr(kernel + m+1, 16);
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
-    if( useSIMD )
     {
         for( i = 0; i <= m; i++ )
-            _mm_store_ps(simd_kernel + i*4, _mm_set1_ps(kernel[i]));
+            v_store(simd_kernel + i*4, v_setall_f32(kernel[i]));
     }
 #endif
 
@@ -457,54 +456,53 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
         }
 
         x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
         {
             for( ; x <= width*5 - 16; x += 16 )
             {
                 const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0, s1, s2, s3;
-                s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
-                s1 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 4), g4);
-                s2 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 8), g4);
-                s3 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 12), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0, s1, s2, s3;
+                s0 = v_load(sptr0 + x) * g4;
+                s1 = v_load(sptr0 + x + 4) * g4;
+                s2 = v_load(sptr0 + x + 8) * g4;
+                s3 = v_load(sptr0 + x + 12) * g4;
 
                 for( i = 1; i <= m; i++ )
                 {
-                    __m128 x0, x1;
+                    v_float32x4 x0, x1;
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 4), _mm_loadu_ps(sptr1 + x + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 8), _mm_loadu_ps(sptr1 + x + 8));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 12), _mm_loadu_ps(sptr1 + x + 12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, g4));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
+                    x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
+                    x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
+                    s2 = v_muladd(x0, g4, s2);
+                    s3 = v_muladd(x1, g4, s3);
                 }
 
-                _mm_store_ps(vsum + x, s0);
-                _mm_store_ps(vsum + x + 4, s1);
-                _mm_store_ps(vsum + x + 8, s2);
-                _mm_store_ps(vsum + x + 12, s3);
+                v_store(vsum + x, s0);
+                v_store(vsum + x + 4, s1);
+                v_store(vsum + x + 8, s2);
+                v_store(vsum + x + 12, s3);
             }
 
             for( ; x <= width*5 - 4; x += 4 )
             {
                 const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(sptr0 + x) * g4;
 
                 for( i = 1; i <= m; i++ )
                 {
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    s0 = v_muladd(x0, g4, s0);
                 }
 
-                _mm_store_ps(vsum + x, s0);
+                v_store(vsum + x, s0);
             }
         }
 #endif
@@ -525,28 +523,25 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
 
         // horizontal blur
         x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
        {
             for( ; x <= width*5 - 8; x += 8 )
             {
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(vsum + x), g4);
-                __m128 s1 = _mm_mul_ps(_mm_loadu_ps(vsum + x + 4), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(vsum + x) * g4;
+                v_float32x4 s1 = v_load(vsum + x + 4) * g4;
 
                 for( i = 1; i <= m; i++ )
                 {
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5),
-                                           _mm_loadu_ps(vsum + x + i*5));
-                    __m128 x1 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5 + 4),
-                                           _mm_loadu_ps(vsum + x + i*5 + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x+ i*5);
+                    v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x+ i*5 + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
                 }
 
-                _mm_store_ps(hsum + x, s0);
-                _mm_store_ps(hsum + x + 4, s1);
+                v_store(hsum + x, s0);
+                v_store(hsum + x + 4, s1);
             }
         }
 #endif
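
For readers unfamiliar with OpenCV's wide universal intrinsics, the sketch below (not part of the patch) illustrates the pattern the diff applies three times: __m128 / _mm_loadu_ps / _mm_add_ps / _mm_mul_ps become v_float32x4 / v_load / operator+ / operator*, and the multiply-accumulate is folded into v_muladd, followed by a scalar tail for leftover elements. It assumes OpenCV >= 3.4 built with CV_SIMD128 available; the function symmetricRowFilter and the kernel values are illustrative only, not code from optflowgf.cpp.

// Standalone sketch, assuming OpenCV >= 3.4 with CV_SIMD128 enabled.
// symmetricRowFilter() is a hypothetical helper: it applies a symmetric 1-D
// kernel the same way the vertical-blur loop in the patch does -- centre tap,
// folded symmetric taps, v_muladd accumulation, then a scalar tail.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <iostream>
#include <vector>

// dst[x] = kernel[0]*src[x] + sum_{i=1..m} kernel[i]*(src[x-i] + src[x+i])
static void symmetricRowFilter(const float* src, float* dst, int width,
                               const float* kernel, int m)
{
    int x = m;
#if CV_SIMD128
    for( ; x <= width - m - cv::v_float32x4::nlanes; x += cv::v_float32x4::nlanes )
    {
        cv::v_float32x4 g = cv::v_setall_f32(kernel[0]);
        cv::v_float32x4 s = cv::v_load(src + x) * g;               // centre tap
        for( int i = 1; i <= m; i++ )
        {
            g = cv::v_setall_f32(kernel[i]);
            // fold the two symmetric taps, then fused multiply-add into s
            cv::v_float32x4 t = cv::v_load(src + x - i) + cv::v_load(src + x + i);
            s = cv::v_muladd(t, g, s);
        }
        cv::v_store(dst + x, s);
    }
#endif
    for( ; x < width - m; x++ )                                    // scalar tail
    {
        float s = src[x] * kernel[0];
        for( int i = 1; i <= m; i++ )
            s += (src[x - i] + src[x + i]) * kernel[i];
        dst[x] = s;
    }
}

int main()
{
    std::vector<float> src(64, 1.f), dst(64, 0.f);
    const float kernel[3] = { 0.5f, 0.2f, 0.05f };   // arbitrary symmetric taps
    symmetricRowFilter(src.data(), dst.data(), (int)src.size(), kernel, 2);
    std::cout << dst[10] << std::endl;               // 0.5 + 2*(0.2 + 0.05) = 1
    return 0;
}

One difference from the sketch: the patch pre-broadcasts each kernel tap once into the aligned simd_kernel buffer, so the inner loops issue only loads instead of repeated v_setall_f32 calls. On SSE-capable x86 builds the v_* calls compile down to essentially the instructions the deleted code used, while the same source now also targets other 128-bit backends (e.g. NEON, VSX), and v_muladd can map to a fused multiply-add where the target provides one.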