diff --git a/CMakeLists.txt b/CMakeLists.txt
index c19689f8e4..30e424b505 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -323,7 +323,6 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add
 OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CV_GCC )
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC )
 OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
-OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF ((CV_GCC OR CV_CLANG) AND PPC64LE) )
 OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) )
 if(NOT IOS AND (NOT ANDROID OR OPENCV_ANDROID_USE_LEGACY_FLAGS)) # Use CPU_BASELINE instead
 OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 7deb127e65..4c115f0e53 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -5,6 +5,10 @@
 # AVX / AVX2 / AVX_512F
 # FMA3
 
+# ppc64le arch:
+# VSX (always available on Power8)
+# VSX3 (always available on Power9)
+
 # CPU_{opt}_SUPPORTED=ON/OFF - compiler support (possibly with additional flag)
 # CPU_{opt}_IMPLIES=<list>
 # CPU_{opt}_FORCE=<list> - subset of "implies" list
@@ -29,7 +33,7 @@
 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F;AVX512_SKX")
 list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
-list(APPEND CPU_ALL_OPTIMIZATIONS VSX)
+list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
 
 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -81,7 +85,7 @@
 ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
 ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
 ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
-ocv_optimization_process_obsolete_option(ENABLE_VSX VSX OFF)
+ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON)
 
 macro(ocv_is_optimization_in_list resultvar check_opt)
   set(__checked "")
@@ -289,14 +293,24 @@ elseif(ARM OR AARCH64)
     set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
   endif()
 elseif(PPC64LE)
-  ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX")
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX;VSX3")
   ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
+  ocv_update(CPU_VSX3_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx3.cpp")
+
+  if(NOT OPENCV_CPU_OPT_IMPLIES_IGNORE)
+    ocv_update(CPU_VSX3_IMPLIES "VSX")
+  endif()
 
   if(CV_CLANG AND (NOT ${CMAKE_CXX_COMPILER} MATCHES "xlc"))
     ocv_update(CPU_VSX_FLAGS_ON "-mvsx -maltivec")
+    ocv_update(CPU_VSX3_FLAGS_ON "-mpower9-vector")
   else()
     ocv_update(CPU_VSX_FLAGS_ON "-mcpu=power8")
+    ocv_update(CPU_VSX3_FLAGS_ON "-mcpu=power9 -mtune=power9")
   endif()
+
+  set(CPU_DISPATCH "VSX3" CACHE STRING "${HELP_CPU_DISPATCH}")
+  set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}")
 endif()
 
 # Helper values for cmake-gui
diff --git a/cmake/checks/cpu_vsx.cpp b/cmake/checks/cpu_vsx.cpp
index 6d744825b6..8d0cd574ce 100644
--- a/cmake/checks/cpu_vsx.cpp
+++ b/cmake/checks/cpu_vsx.cpp
@@ -1,8 +1,12 @@
-# if defined(__VSX__)
-#   include <altivec.h>
-# else
-#   error "VSX is not supported"
-# endif
+#if defined(__VSX__)
+    #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+        #include <altivec.h>
+    #else
+        #error "OpenCV only supports little-endian mode"
mode" + #endif +#else + #error "VSX is not supported" +#endif int main() { diff --git a/cmake/checks/cpu_vsx3.cpp b/cmake/checks/cpu_vsx3.cpp new file mode 100644 index 0000000000..31d7014a7c --- /dev/null +++ b/cmake/checks/cpu_vsx3.cpp @@ -0,0 +1,17 @@ +#if defined(__VSX__) + #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) + #include + #else + #error "OpenCV only supports little-endian mode" + #endif +#else + #error "VSX3 is not supported" +#endif + +int main() +{ + __vector unsigned char a = vec_splats((unsigned char)1); + __vector unsigned char b = vec_splats((unsigned char)2); + __vector unsigned char r = vec_absd(a, b); + return 0; +} diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 20fdf96c59..0f6f78d4ed 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -2,7 +2,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) ocv_add_dispatched_file(stat SSE4_2 AVX2) -ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) # dispatching for accuracy tests ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 4c686af97f..57aa0ce2fb 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -107,7 +107,7 @@ # include #endif -#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) +#ifdef CV_CPU_COMPILE_VSX # include # undef vector # undef pixel @@ -115,6 +115,10 @@ # define CV_VSX 1 #endif +#ifdef CV_CPU_COMPILE_VSX3 +# define CV_VSX3 1 +#endif + #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX @@ -237,3 +241,7 @@ struct VZeroUpperGuard { #ifndef CV_VSX # define CV_VSX 0 #endif + +#ifndef CV_VSX3 +# define CV_VSX3 0 +#endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 84f489bab4..ad1339796d 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -315,5 +315,26 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...) CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3 +# define CV_TRY_VSX3 1 +# define CV_CPU_FORCE_VSX3 1 +# define CV_CPU_HAS_SUPPORT_VSX3 1 +# define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3 +# define CV_TRY_VSX3 1 +# define CV_CPU_FORCE_VSX3 0 +# define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3)) +# define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args) +# define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args) +#else +# define CV_TRY_VSX3 0 +# define CV_CPU_FORCE_VSX3 0 +# define CV_CPU_HAS_SUPPORT_VSX3 0 +# define CV_CPU_CALL_VSX3(fn, args) +# define CV_CPU_CALL_VSX3_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) 
+
 #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
 #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 6623a1c2d4..88aa11bed4 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -226,9 +226,10 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 #define CV_CPU_AVX_512VBMI 20
 #define CV_CPU_AVX_512VL 21
 
-#define CV_CPU_NEON 100
+#define CV_CPU_NEON   100
 
-#define CV_CPU_VSX 200
+#define CV_CPU_VSX    200
+#define CV_CPU_VSX3   201
 
 // CPU features groups
 #define CV_CPU_AVX512_SKX 256
@@ -266,6 +267,7 @@ enum CpuFeatures {
     CPU_NEON            = 100,
 
     CPU_VSX             = 200,
+    CPU_VSX3            = 201,
 
     CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index f8cc7a4d00..30377048ae 100644
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -905,6 +905,11 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
 
+inline v_float32x8 v_not_nan(const v_float32x8& a)
+{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
+inline v_float64x4 v_not_nan(const v_float64x4& a)
+{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 5712f167a8..1cfb14ae06 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -683,6 +683,25 @@ OPENCV_HAL_IMPL_CMP_OP(==)
 For all types except 64-bit integer values. */
 OPENCV_HAL_IMPL_CMP_OP(!=)
 
+template<int n>
+inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
+{
+    typedef typename V_TypeTraits<float>::int_type itype;
+    v_reg<float, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+template<int n>
+inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
+{
+    typedef typename V_TypeTraits<double>::int_type itype;
+    v_reg<double, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 50c9b154ee..2de4e45283 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -764,6 +764,13 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
 #endif
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
+#if CV_SIMD128_64F
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
+#endif
+
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index c49d0de377..283c5158d7 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1041,6 +1041,11 @@ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
+
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index b23e19950e..fe4a5db5df 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -607,6 +607,11 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index 3aabfa138d..b1fa676e5e 100644
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@@ -1941,8 +1941,11 @@ Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Size_<_Tp>& b )
 template<typename _Tp> static inline
 Rect_<_Tp>& operator -= ( Rect_<_Tp>& a, const Size_<_Tp>& b )
 {
-    a.width -= b.width;
-    a.height -= b.height;
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    a.width = width;
+    a.height = height;
     return a;
 }
 
@@ -2007,6 +2010,15 @@ Rect_<_Tp> operator + (const Rect_<_Tp>& a, const Size_<_Tp>& b)
     return Rect_<_Tp>( a.x, a.y, a.width + b.width, a.height + b.height );
 }
 
+template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& a, const Size_<_Tp>& b)
+{
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    return Rect_<_Tp>( a.x, a.y, width, height );
+}
+
 template<typename _Tp> static inline
 Rect_<_Tp> operator & (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
 {
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index d74e377494..accfabb6d5 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -454,7 +454,7 @@ static inline int _initMaxThreads()
     {
         omp_set_dynamic(maxThreads);
     }
-    return numThreads;
+    return maxThreads;
 }
 static int numThreadsMax = _initMaxThreads();
 #elif defined HAVE_GCD
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 9f82a15654..408a673659 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -107,15 +107,14 @@ void* allocSingletonBuffer(size_t size) { return fastMalloc(size); }
 # include
 #endif
 
-#ifndef __VSX__
-# if defined __PPC64__ && defined __linux__
-# include "sys/auxv.h"
-# ifndef AT_HWCAP2
-# define AT_HWCAP2 26
-# endif
-# ifndef PPC_FEATURE2_ARCH_2_07
-# define PPC_FEATURE2_ARCH_2_07 0x80000000
-# endif
+
+#if CV_VSX && defined __linux__
+# include "sys/auxv.h"
+# ifndef AT_HWCAP2
+#   define AT_HWCAP2 26
+# endif
+# ifndef PPC_FEATURE2_ARCH_3_00
+#   define PPC_FEATURE2_ARCH_3_00 0x00800000
 # endif
 #endif
@@ -359,6 +358,7 @@ struct HWFeatures
         g_hwFeatureNames[CPU_NEON] = "NEON";
 
         g_hwFeatureNames[CPU_VSX] = "VSX";
+        g_hwFeatureNames[CPU_VSX3] = "VSX3";
 
         g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
     }
@@ -513,14 +513,14 @@ struct HWFeatures
     #endif
     #endif
 
-    #ifdef __VSX__
-        have[CV_CPU_VSX] = true;
-    #elif (defined __PPC64__ && defined __linux__)
-        uint64 hwcaps = getauxval(AT_HWCAP);
+    // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
+    have[CV_CPU_VSX] = (CV_VSX);
+    // TODO: Check VSX3 availability in runtime for other platforms
+    #if CV_VSX && defined __linux__
         uint64 hwcap2 = getauxval(AT_HWCAP2);
-        have[CV_CPU_VSX] = (hwcaps & PPC_FEATURE_PPC_LE && hwcaps & PPC_FEATURE_HAS_VSX && hwcap2 & PPC_FEATURE2_ARCH_2_07);
+        have[CV_CPU_VSX3] = (hwcap2 & PPC_FEATURE2_ARCH_3_00);
     #else
-        have[CV_CPU_VSX] = false;
+        have[CV_CPU_VSX3] = (CV_VSX3);
     #endif
 
     int baseline_features[] = { CV_CPU_BASELINE_FEATURES };
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index e72400c7fa..e0a2c99991 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -972,6 +972,13 @@ bool CV_OperationsTest::operations1()
         if (sz.width != 10 || sz.height != 20) throw test_excep();
         if (cvSize(sz).width != 10 || cvSize(sz).height != 20) throw test_excep();
 
+        Rect r1(0, 0, 10, 20);
+        Size sz1(5, 10);
+        r1 -= sz1;
+        if (r1.size().width != 5 || r1.size().height != 10) throw test_excep();
+        Rect r2 = r1 - sz1;
+        if (r2.size().width != 0 || r2.size().height != 0) throw test_excep();
+
         Vec<double, 5> v5d(1, 1, 1, 1, 1);
         Vec<double, 6> v6d(1, 1, 1, 1, 1, 1);
         Vec<double, 7> v7d(1, 1, 1, 1, 1, 1, 1);
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index bb8e761311..b1d81b895b 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -83,7 +83,9 @@ CV__DNN_INLINE_NS_BEGIN
         DNN_TARGET_OPENCL,
         DNN_TARGET_OPENCL_FP16,
         DNN_TARGET_MYRIAD,
-        DNN_TARGET_VULKAN
+        DNN_TARGET_VULKAN,
+        //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+ DNN_TARGET_FPGA }; /** @brief This class provides all data needed to initialize layer. @@ -497,6 +499,7 @@ CV__DNN_INLINE_NS_BEGIN * | DNN_TARGET_OPENCL | + | + | + | * | DNN_TARGET_OPENCL_FP16 | + | + | | * | DNN_TARGET_MYRIAD | | + | | + * | DNN_TARGET_FPGA | | + | | */ CV_WRAP void setPreferableTarget(int targetId); diff --git a/modules/dnn/include/opencv2/dnn/version.hpp b/modules/dnn/include/opencv2/dnn/version.hpp index ff9faa0602..62f0dd44d6 100644 --- a/modules/dnn/include/opencv2/dnn/version.hpp +++ b/modules/dnn/include/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. -#define OPENCV_DNN_API_VERSION 20180917 +#define OPENCV_DNN_API_VERSION 20181121 #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION) diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 1647db3b31..03d7a233f3 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -42,7 +42,7 @@ public: } if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) { - if (!checkMyriadTarget()) + if (!checkIETarget(DNN_TARGET_MYRIAD)) { throw SkipTestException("Myriad is not available/disabled in OpenCV"); } diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index a8b1c40b4b..cb38fafd77 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1104,7 +1104,8 @@ struct Net::Impl preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16 || - preferableTarget == DNN_TARGET_MYRIAD); + preferableTarget == DNN_TARGET_MYRIAD || + preferableTarget == DNN_TARGET_FPGA); CV_Assert(preferableBackend != DNN_BACKEND_VKCOM || preferableTarget == DNN_TARGET_VULKAN); if (!netWasAllocated || this->blobsToKeep != blobsToKeep_) @@ -1609,7 +1610,9 @@ struct Net::Impl ieNode->net = net; auto weightableLayer = std::dynamic_pointer_cast(ieNode->layer); - if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD) && !fused) + if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || + preferableTarget == DNN_TARGET_MYRIAD || + preferableTarget == DNN_TARGET_FPGA) && !fused) { ieNode->layer->precision = InferenceEngine::Precision::FP16; if (weightableLayer) diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 178a2a4f2d..1eb149b3d1 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -119,8 +119,8 @@ public: lp.precision = InferenceEngine::Precision::FP32; std::shared_ptr ieLayer(new InferenceEngine::SplitLayer(lp)); #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R3) - ieLayer->params["axis"] = format("%d", input->dims.size() - 1); - ieLayer->params["out_sizes"] = format("%d", input->dims[0]); + ieLayer->params["axis"] = format("%d", (int)input->dims.size() - 1); + ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]); #endif return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 9af8f436ac..1d574b7b91 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -220,9 +220,14 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE { +#ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) - return preferableTarget != 
DNN_TARGET_MYRIAD || dilation.width == dilation.height; + { + return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R4) || + (preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height); + } else +#endif return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || (backendId == DNN_BACKEND_VKCOM && haveVulkan()); diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 81d6c67dcc..d8d13bfa47 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -302,7 +302,8 @@ void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device) { if (device != InferenceEngine::TargetDevice::eCPU && device != InferenceEngine::TargetDevice::eGPU && - device != InferenceEngine::TargetDevice::eMYRIAD) + device != InferenceEngine::TargetDevice::eMYRIAD && + device != InferenceEngine::TargetDevice::eFPGA) CV_Error(Error::StsNotImplemented, ""); targetDevice = device; } @@ -314,7 +315,8 @@ InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() CV_NOEXCEPT InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() const CV_NOEXCEPT { - return targetDevice; + return targetDevice == InferenceEngine::TargetDevice::eFPGA ? + InferenceEngine::TargetDevice::eHETERO : targetDevice; } InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) CV_NOEXCEPT @@ -466,6 +468,11 @@ void InfEngineBackendNet::init(int targetId) setPrecision(InferenceEngine::Precision::FP16); setTargetDevice(InferenceEngine::TargetDevice::eMYRIAD); break; } + case DNN_TARGET_FPGA: + { + setPrecision(InferenceEngine::Precision::FP16); + setTargetDevice(InferenceEngine::TargetDevice::eFPGA); break; + } default: CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId)); } @@ -489,10 +496,15 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) } else { - enginePtr = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice); + auto dispatcher = InferenceEngine::PluginDispatcher({""}); + if (targetDevice == InferenceEngine::TargetDevice::eFPGA) + enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU"); + else + enginePtr = dispatcher.getSuitablePlugin(targetDevice); sharedPlugins[targetDevice] = enginePtr; - if (targetDevice == InferenceEngine::TargetDevice::eCPU) + if (targetDevice == InferenceEngine::TargetDevice::eCPU || + targetDevice == InferenceEngine::TargetDevice::eFPGA) { std::string suffixes[] = {"_avx2", "_sse4", ""}; bool haveFeature[] = { diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index 987a68116e..058a7150cb 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -68,6 +68,7 @@ static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os) case DNN_TARGET_OPENCL_FP16: *os << "OCL_FP16"; return; case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return; case DNN_TARGET_VULKAN: *os << "VULKAN"; return; + case DNN_TARGET_FPGA: *os << "FPGA"; return; } // don't use "default:" to emit compiler warnings *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")"; } @@ -190,7 +191,7 @@ static inline void normAssertDetections(cv::Mat ref, cv::Mat out, const char *co testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff); } -static inline bool checkMyriadTarget() +static inline bool checkIETarget(int target) { #ifndef HAVE_INF_ENGINE return false; @@ -199,7 +200,7 @@ static inline bool checkMyriadTarget() cv::dnn::LayerParams lp; net.addLayerToPrev("testLayer", "Identity", lp); 
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE); - net.setPreferableTarget(cv::dnn::DNN_TARGET_MYRIAD); + net.setPreferableTarget(target); static int inpDims[] = {1, 2, 3, 4}; net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0))); try @@ -267,7 +268,7 @@ testing::internal::ParamGenerator > dnnBackendsAndTargets targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16)); } #endif - if (checkMyriadTarget()) + if (checkIETarget(DNN_TARGET_MYRIAD)) targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD)); } #endif @@ -351,7 +352,7 @@ public: } if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) { - if (!checkMyriadTarget()) + if (!checkIETarget(DNN_TARGET_MYRIAD)) { throw SkipTestException("Myriad is not available/disabled in OpenCV"); } diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp index a50fed8d58..a8404e09a2 100644 --- a/modules/dnn/test/test_ie_models.cpp +++ b/modules/dnn/test/test_ie_models.cpp @@ -57,28 +57,29 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath InferencePlugin plugin; ExecutableNetwork netExec; InferRequest infRequest; - TargetDevice targetDevice; - switch (target) - { - case DNN_TARGET_CPU: - targetDevice = TargetDevice::eCPU; - break; - case DNN_TARGET_OPENCL: - case DNN_TARGET_OPENCL_FP16: - targetDevice = TargetDevice::eGPU; - break; - case DNN_TARGET_MYRIAD: - targetDevice = TargetDevice::eMYRIAD; - break; - default: - CV_Error(Error::StsNotImplemented, "Unknown target"); - }; - try { - enginePtr = PluginDispatcher({""}).getSuitablePlugin(targetDevice); + auto dispatcher = InferenceEngine::PluginDispatcher({""}); + switch (target) + { + case DNN_TARGET_CPU: + enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU); + break; + case DNN_TARGET_OPENCL: + case DNN_TARGET_OPENCL_FP16: + enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU); + break; + case DNN_TARGET_MYRIAD: + enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD); + break; + case DNN_TARGET_FPGA: + enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU"); + break; + default: + CV_Error(Error::StsNotImplemented, "Unknown target"); + }; - if (targetDevice == TargetDevice::eCPU) + if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA) { std::string suffixes[] = {"_avx2", "_sse4", ""}; bool haveFeature[] = { @@ -255,8 +256,10 @@ static testing::internal::ParamGenerator dnnDLIETargets() targets.push_back(DNN_TARGET_OPENCL_FP16); } #endif - if (checkMyriadTarget()) + if (checkIETarget(DNN_TARGET_MYRIAD)) targets.push_back(DNN_TARGET_MYRIAD); + if (checkIETarget(DNN_TARGET_FPGA)) + targets.push_back(DNN_TARGET_FPGA); return testing::ValuesIn(targets); } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index cf94fad701..2b2148573b 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -351,7 +351,7 @@ TEST_P(Test_Caffe_layers, Conv_Elu) { if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) { - if (!checkMyriadTarget()) + if (!checkIETarget(DNN_TARGET_MYRIAD)) throw SkipTestException("Myriad is not available/disabled in OpenCV"); } diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index 859a47b26c..2d8ceef577 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -157,7 +157,7 @@ TEST_P(setInput, normalization) const int target = get<1>(get<3>(GetParam())); const bool kSwapRB 
= true; - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkMyriadTarget()) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkIETarget(DNN_TARGET_MYRIAD)) throw SkipTestException("Myriad is not available/disabled in OpenCV"); if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F) throw SkipTestException(""); diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp index 2132d17b42..e791596476 100644 --- a/modules/features2d/src/draw.cpp +++ b/modules/features2d/src/draw.cpp @@ -117,7 +117,7 @@ void drawKeypoints( InputArray image, const std::vector& keypoints, In end = keypoints.end(); for( ; it != end; ++it ) { - Scalar color = isRandColor ? Scalar(rng(256), rng(256), rng(256)) : _color; + Scalar color = isRandColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : _color; _drawKeypoint( outImage, *it, color, flags ); } } @@ -173,7 +173,7 @@ static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1 { RNG& rng = theRNG(); bool isRandMatchColor = matchColor == Scalar::all(-1); - Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256) ) : matchColor; + Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : matchColor; _drawKeypoint( outImg1, kp1, color, flags ); _drawKeypoint( outImg2, kp2, color, flags ); diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.cpp index 5e39fa4de5..e9181f2182 100644 --- a/modules/imgproc/src/bilateral_filter.cpp +++ b/modules/imgproc/src/bilateral_filter.cpp @@ -82,7 +82,84 @@ public: memset(buf.data(), 0, buf.size() * sizeof(float)); float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH); float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH); - for( k = 0; k < maxk; k++ ) + k = 0; + for(; k <= maxk-4; k+=4) + { + const uchar* ksptr0 = sptr + space_ofs[k]; + const uchar* ksptr1 = sptr + space_ofs[k+1]; + const uchar* ksptr2 = sptr + space_ofs[k+2]; + const uchar* ksptr3 = sptr + space_ofs[k+3]; + j = 0; +#if CV_SIMD + v_float32 kweight0 = vx_setall_f32(space_weight[k]); + v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); + v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); + v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + { + v_uint32 rval = vx_load_expand_q(sptr + j); + + v_uint32 val = vx_load_expand_q(ksptr0 + j); + v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_float32 v_wsum = vx_load_aligned(wsum + j) + w; + v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j)); + + val = vx_load_expand_q(ksptr1 + j); + w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_wsum += w; + v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); + + val = vx_load_expand_q(ksptr2 + j); + w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_wsum += w; + v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); + + val = vx_load_expand_q(ksptr3 + j); + w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + v_wsum += w; + v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum); + + v_store_aligned(wsum + j, v_wsum); + v_store_aligned(sum + j, v_sum); + } +#endif +#if CV_SIMD128 + v_float32x4 kweight4 = v_load(space_weight + k); +#endif + for (; j < size.width; j++) + { +#if 
CV_SIMD128 + v_uint32x4 rval = v_setall_u32(sptr[j]); + v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); + v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval))); + wsum[j] += v_reduce_sum(w); + sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w); +#else + int rval = sptr[j]; + + int val = ksptr0[j]; + float w = space_weight[k] * color_weight[std::abs(val - rval)]; + wsum[j] += w; + sum[j] += val * w; + + val = ksptr1[j]; + w = space_weight[k+1] * color_weight[std::abs(val - rval)]; + wsum[j] += w; + sum[j] += val * w; + + val = ksptr2[j]; + w = space_weight[k+2] * color_weight[std::abs(val - rval)]; + wsum[j] += w; + sum[j] += val * w; + + val = ksptr3[j]; + w = space_weight[k+3] * color_weight[std::abs(val - rval)]; + wsum[j] += w; + sum[j] += val * w; +#endif + } + } + for(; k < maxk; k++) { const uchar* ksptr = sptr + space_ofs[k]; j = 0; @@ -126,7 +203,232 @@ public: float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH); float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH); float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH); - for(k = 0; k < maxk; k++ ) + k = 0; + for(; k <= maxk-4; k+=4) + { + const uchar* ksptr0 = sptr + space_ofs[k]; + const uchar* ksptr1 = sptr + space_ofs[k+1]; + const uchar* ksptr2 = sptr + space_ofs[k+2]; + const uchar* ksptr3 = sptr + space_ofs[k+3]; + const uchar* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight0 = vx_setall_f32(space_weight[k]); + v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); + v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); + v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); + for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes, + ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes) + { + v_uint8 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(rsptr, rb, rg, rr); + + v_load_deinterleave(ksptr0, kb, kg, kr); + v_uint16 val0, val1, val2, val3, val4; + v_expand(v_absdiff(kb, rb), val0, val1); + v_expand(v_absdiff(kg, rg), val2, val3); + val0 += val2; val1 += val3; + v_expand(v_absdiff(kr, rr), val2, val3); + val0 += val2; val1 += val3; + + v_uint32 vall, valh; + v_expand(val0, vall, valh); + v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_expand(kb, val0, val2); + v_expand(val0, vall, valh); + v_store_aligned(sum_b + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_expand(kg, val0, val3); + v_expand(val0, vall, valh); + v_store_aligned(sum_g + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_expand(kr, val0, val4); + v_expand(val0, vall, valh); + v_store_aligned(sum_r + j , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 
v_float32::nlanes))); + + v_expand(val1, vall, valh); + w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); + v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + v_expand(val2, vall, valh); + v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_expand(val3, vall, valh); + v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_expand(val4, vall, valh); + v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes))); + v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes))); + + v_load_deinterleave(ksptr1, kb, kg, kr); + v_expand(v_absdiff(kb, rb), val0, val1); + v_expand(v_absdiff(kg, rg), val2, val3); + val0 += val2; val1 += val3; + v_expand(v_absdiff(kr, rr), val2, val3); + val0 += val2; val1 += val3; + + v_expand(val0, vall, valh); + w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_expand(kb, val0, val2); + v_expand(val0, vall, valh); + v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_expand(kg, val0, val3); + v_expand(val0, vall, valh); + v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_expand(kr, val0, val4); + v_expand(val0, vall, valh); + v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + + v_expand(val1, vall, valh); + w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); + v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + v_expand(val2, vall, valh); + v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); + 
v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_expand(val3, vall, valh); + v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_expand(val4, vall, valh); + v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + + v_load_deinterleave(ksptr2, kb, kg, kr); + v_expand(v_absdiff(kb, rb), val0, val1); + v_expand(v_absdiff(kg, rg), val2, val3); + val0 += val2; val1 += val3; + v_expand(v_absdiff(kr, rr), val2, val3); + val0 += val2; val1 += val3; + + v_expand(val0, vall, valh); + w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_expand(kb, val0, val2); + v_expand(val0, vall, valh); + v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_expand(kg, val0, val3); + v_expand(val0, vall, valh); + v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_expand(kr, val0, val4); + v_expand(val0, vall, valh); + v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + + v_expand(val1, vall, valh); + w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); + v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + v_expand(val2, vall, valh); + v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_expand(val3, vall, valh); + v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_expand(val4, vall, valh); + v_store_aligned(sum_r + j + 2 * v_float32::nlanes, 
v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + + v_load_deinterleave(ksptr3, kb, kg, kr); + v_expand(v_absdiff(kb, rb), val0, val1); + v_expand(v_absdiff(kg, rg), val2, val3); + val0 += val2; val1 += val3; + v_expand(v_absdiff(kr, rr), val2, val3); + val0 += val2; val1 += val3; + + v_expand(val0, vall, valh); + w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j)); + v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes)); + v_expand(kb, val0, val2); + v_expand(val0, vall, valh); + v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j))); + v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes))); + v_expand(kg, val0, val3); + v_expand(val0, vall, valh); + v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j))); + v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes))); + v_expand(kr, val0, val4); + v_expand(val0, vall, valh); + v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j))); + v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes))); + + v_expand(val1, vall, valh); + w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall)); + w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh)); + v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes)); + v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes)); + v_expand(val2, vall, valh); + v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes))); + v_expand(val3, vall, valh); + v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes))); + v_expand(val4, vall, valh); + v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes))); + v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes))); + } +#endif +#if CV_SIMD128 + v_float32x4 kweight4 = v_load(space_weight + k); +#endif + for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) + { +#if CV_SIMD128 + v_uint32x4 rb = v_setall_u32(rsptr[0]); + v_uint32x4 rg = v_setall_u32(rsptr[1]); + v_uint32x4 rr = v_setall_u32(rsptr[2]); + v_uint32x4 
b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); + v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); + v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); + v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr))); + wsum[j] += v_reduce_sum(w); + sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w); + sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w); + sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w); +#else + int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; + + int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; + float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; + + b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; + w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; + + b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; + w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; + + b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; + w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)]; + wsum[j] += w; + sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w; +#endif + } + } + for(; k < maxk; k++) { const uchar* ksptr = sptr + space_ofs[k]; const uchar* rsptr = sptr; @@ -421,7 +723,130 @@ public: v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif - for( k = 0; k < maxk; k++ ) + k = 0; + for(; k <= maxk - 4; k+=4) + { + const float* ksptr0 = sptr + space_ofs[k]; + const float* ksptr1 = sptr + space_ofs[k + 1]; + const float* ksptr2 = sptr + space_ofs[k + 2]; + const float* ksptr3 = sptr + space_ofs[k + 3]; + j = 0; +#if CV_SIMD + v_float32 kweight0 = vx_setall_f32(space_weight[k]); + v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); + v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); + v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) + { + v_float32 rval = vx_load(sptr + j); + + v_float32 val = vx_load(ksptr0 + j); + v_float32 knan = v_not_nan(val); + v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; + v_float32 v_wsum = vx_load_aligned(wsum + j) + w; + v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j)); + + val = vx_load(ksptr1 + j); + knan = v_not_nan(val); + alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum = v_muladd(val & knan, w, v_sum); + + val = vx_load(ksptr2 + j); + knan = v_not_nan(val); + alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum = v_muladd(val & knan, w, v_sum); + + val = vx_load(ksptr3 + j); + knan = v_not_nan(val); + alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; + 
idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum = v_muladd(val & knan, w, v_sum); + + v_store_aligned(wsum + j, v_wsum); + v_store_aligned(sum + j, v_sum); + } +#endif +#if CV_SIMD128 + v_float32x4 v_one4 = v_setall_f32(1.f); + v_float32x4 sindex4 = v_setall_f32(scale_index); + v_float32x4 kweight4 = v_load(space_weight + k); +#endif + for (; j < size.width; j++) + { +#if CV_SIMD128 + v_float32x4 rval = v_setall_f32(sptr[j]); + v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]); + v_float32x4 knan = v_not_nan(val); + v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan; + v_int32x4 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + wsum[j] += v_reduce_sum(w); + sum[j] += v_reduce_sum((val & knan) * w); +#else + float rval = sptr[j]; + + float val = ksptr0[j]; + float alpha = std::abs(val - rval) * scale_index; + int idx = cvFloor(alpha); + alpha -= idx; + if (!cvIsNaN(val)) + { + float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum[j] += val * w; + } + + val = ksptr1[j]; + alpha = std::abs(val - rval) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!cvIsNaN(val)) + { + float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum[j] += val * w; + } + + val = ksptr2[j]; + alpha = std::abs(val - rval) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!cvIsNaN(val)) + { + float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum[j] += val * w; + } + + val = ksptr3[j]; + alpha = std::abs(val - rval) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!cvIsNaN(val)) + { + float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum[j] += val * w; + } +#endif + } + } + for(; k < maxk; k++) { const float* ksptr = sptr + space_ofs[k]; j = 0; @@ -430,36 +855,44 @@ public: for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) { v_float32 val = vx_load(ksptr + j); - - v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex; + v_float32 rval = vx_load(sptr + j); + v_float32 knan = v_not_nan(val); + v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan; v_int32 idx = v_trunc(alpha); alpha -= v_cvt_f32(idx); - v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha)); + v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan; v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w); - v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j))); + v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j))); } #endif for (; j < size.width; j++) { float val = ksptr[j]; - float alpha = std::abs(val - sptr[j]) * scale_index; + float rval = sptr[j]; + float alpha = std::abs(val - rval) * scale_index; int idx = cvFloor(alpha); alpha -= idx; - float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); - wsum[j] += w; - sum[j] += val * w; + if (!cvIsNaN(val)) + { + float w = space_weight[k] * (cvIsNaN(rval) ? 
1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum[j] += val * w; + } } } j = 0; #if CV_SIMD for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes) - v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j)); + { + v_float32 v_val = vx_load(sptr + j); + v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val)))); + } #endif for (; j < size.width; j++) { - CV_DbgAssert(fabs(wsum[j]) > 0); - dptr[j] = sum[j] / wsum[j]; + CV_DbgAssert(fabs(wsum[j]) >= 0); + dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f); } } else @@ -475,7 +908,162 @@ public: v_float32 v_one = vx_setall_f32(1.f); v_float32 sindex = vx_setall_f32(scale_index); #endif - for (k = 0; k < maxk; k++) + k = 0; + for (; k <= maxk-4; k+=4) + { + const float* ksptr0 = sptr + space_ofs[k]; + const float* ksptr1 = sptr + space_ofs[k+1]; + const float* ksptr2 = sptr + space_ofs[k+2]; + const float* ksptr3 = sptr + space_ofs[k+3]; + const float* rsptr = sptr; + j = 0; +#if CV_SIMD + v_float32 kweight0 = vx_setall_f32(space_weight[k]); + v_float32 kweight1 = vx_setall_f32(space_weight[k+1]); + v_float32 kweight2 = vx_setall_f32(space_weight[k+2]); + v_float32 kweight3 = vx_setall_f32(space_weight[k+3]); + for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes, + ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes) + { + v_float32 kb, kg, kr, rb, rg, rr; + v_load_deinterleave(rsptr, rb, rg, rr); + + v_load_deinterleave(ksptr0, kb, kg, kr); + v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); + v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_int32 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_float32 v_wsum = vx_load_aligned(wsum + j) + w; + v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)); + v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)); + v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)); + + v_load_deinterleave(ksptr1, kb, kg, kr); + knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); + alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum_b = v_muladd(kb & knan, w, v_sum_b); + v_sum_g = v_muladd(kg & knan, w, v_sum_g); + v_sum_r = v_muladd(kr & knan, w, v_sum_r); + + v_load_deinterleave(ksptr2, kb, kg, kr); + knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); + alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum_b = v_muladd(kb & knan, w, v_sum_b); + v_sum_g = v_muladd(kg & knan, w, v_sum_g); + v_sum_r = v_muladd(kr & knan, w, v_sum_r); + + v_load_deinterleave(ksptr3, kb, kg, kr); + knan = v_not_nan(kb) & v_not_nan(kg) & 
v_not_nan(kr); + alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan; + v_wsum += w; + v_sum_b = v_muladd(kb & knan, w, v_sum_b); + v_sum_g = v_muladd(kg & knan, w, v_sum_g); + v_sum_r = v_muladd(kr & knan, w, v_sum_r); + + v_store_aligned(wsum + j, v_wsum); + v_store_aligned(sum_b + j, v_sum_b); + v_store_aligned(sum_g + j, v_sum_g); + v_store_aligned(sum_r + j, v_sum_r); + } +#endif +#if CV_SIMD128 + v_float32x4 v_one4 = v_setall_f32(1.f); + v_float32x4 sindex4 = v_setall_f32(scale_index); + v_float32x4 kweight4 = v_load(space_weight + k); +#endif + for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3) + { +#if CV_SIMD128 + v_float32x4 rb = v_setall_f32(rsptr[0]); + v_float32x4 rg = v_setall_f32(rsptr[1]); + v_float32x4 rr = v_setall_f32(rsptr[2]); + v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]); + v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]); + v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]); + v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr); + v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan; + v_int32x4 idx = v_trunc(alpha); + alpha -= v_cvt_f32(idx); + v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan; + wsum[j] += v_reduce_sum(w); + sum_b[j] += v_reduce_sum((kb & knan) * w); + sum_g[j] += v_reduce_sum((kg & knan) * w); + sum_r[j] += v_reduce_sum((kr & knan) * w); +#else + float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2]; + bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr); + + float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2]; + bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); + float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; + int idx = cvFloor(alpha); + alpha -= idx; + if (!v_NAN) + { + float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum_b[j] += b*w; + sum_g[j] += g*w; + sum_r[j] += r*w; + } + + b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2]; + v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); + alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!v_NAN) + { + float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum_b[j] += b*w; + sum_g[j] += g*w; + sum_r[j] += r*w; + } + + b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2]; + v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); + alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!v_NAN) + { + float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]))); + wsum[j] += w; + sum_b[j] += b*w; + sum_g[j] += g*w; + sum_r[j] += r*w; + } + + b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2]; + v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r); + alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index; + idx = cvFloor(alpha); + alpha -= idx; + if (!v_NAN) + { + float w = space_weight[k+3] * (r_NAN ? 
+                    wsum[j] += w;
+                    sum_b[j] += b*w;
+                    sum_g[j] += g*w;
+                    sum_r[j] += r*w;
+                }
+#endif
+            }
+        }
+        for (; k < maxk; k++)
        {
            const float* ksptr = sptr + space_ofs[k];
            const float* rsptr = sptr;
@@ -488,45 +1076,68 @@ public:
                v_load_deinterleave(ksptr, kb, kg, kr);
                v_load_deinterleave(rsptr, rb, rg, rr);
-                v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
+                v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
                v_int32 idx = v_trunc(alpha);
                alpha -= v_cvt_f32(idx);
-                v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
+                v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
                v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
-                v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
-                v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
+                v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
+                v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
+                v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
            }
#endif
            for (; j < size.width; j++, ksptr += 3, rsptr += 3)
            {
                float b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
+                bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+                float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
                int idx = cvFloor(alpha);
                alpha -= idx;
-                float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
-                wsum[j] += w;
-                sum_b[j] += b*w;
-                sum_g[j] += g*w;
-                sum_r[j] += r*w;
+                if (!v_NAN)
+                {
+                    float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
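The vector paths above cannot branch per pixel, so the same rules are expressed with masks: v_not_nan yields an all-ones lane mask for finite values, and ANDing the weight and the channel values with it zeroes every contribution from a NaN pixel; masking alpha with the centre's mask likewise forces the LUT index to 0, reproducing the scalar r_NAN rule since the first LUT entry is the zero-distance weight. A branch-free NumPy analogue of that masked accumulation, with illustrative array names (the hunk's scalar tail continues below):

    import numpy as np

    def masked_accumulate(wsum, sum_b, sum_g, sum_r, w, kb, kg, kr):
        valid = ~(np.isnan(kb) | np.isnan(kg) | np.isnan(kr))   # the "knan" mask
        w = np.where(valid, w, 0.0)
        # the values are masked too: NaN * 0 would still be NaN otherwise
        wsum  += w
        sum_b += np.where(valid, kb, 0.0) * w
        sum_g += np.where(valid, kg, 0.0) * w
        sum_r += np.where(valid, kr, 0.0) * w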
+                    wsum[j] += w;
+                    sum_b[j] += b*w;
+                    sum_g[j] += g*w;
+                    sum_r[j] += r*w;
+                }
            }
        }
        j = 0;
#if CV_SIMD
-        for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
+        for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
        {
-            v_float32 w = v_one / vx_load_aligned(wsum + j);
-            v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
+            v_float32 b, g, r;
+            v_load_deinterleave(sptr, b, g, r);
+            v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
+            v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
+            v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
        }
#endif
        for (; j < size.width; j++)
        {
-            CV_DbgAssert(fabs(wsum[j]) > 0);
-            wsum[j] = 1.f / wsum[j];
-            *(dptr++) = sum_b[j] * wsum[j];
-            *(dptr++) = sum_g[j] * wsum[j];
-            *(dptr++) = sum_r[j] * wsum[j];
+            CV_DbgAssert(fabs(wsum[j]) >= 0);
+            float b = *(sptr++);
+            float g = *(sptr++);
+            float r = *(sptr++);
+            if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r))
+            {
+                wsum[j] = 1.f / wsum[j];
+                *(dptr++) = sum_b[j] * wsum[j];
+                *(dptr++) = sum_g[j] * wsum[j];
+                *(dptr++) = sum_r[j] * wsum[j];
+            }
+            else
+            {
+                wsum[j] = 1.f / (wsum[j] + 1.f);
+                *(dptr++) = (sum_b[j] + b) * wsum[j];
+                *(dptr++) = (sum_g[j] + g) * wsum[j];
+                *(dptr++) = (sum_r[j] + r) * wsum[j];
+            }
        }
    }
}
@@ -585,9 +1196,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
    // temporary copy of the image with borders for easy processing
    Mat temp;
    copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    minValSrc -= 5. * sigma_color;
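For three-channel rows the final pass applies the same centre-pixel rule per pixel, with the extra convention that the centre counts as invalid if any one of its channels is NaN; the SIMD variant builds that mask once and reuses it for the weight and all three channel sums. A scalar Python restatement of the per-pixel rule, with illustrative names (the bilateralFilter_32f hunk continues below):

    import math

    def finalize_bgr_pixel(sum_b, sum_g, sum_r, wsum, b, g, r):
        if math.isnan(b) or math.isnan(g) or math.isnan(r):
            inv = 1.0 / wsum                 # centre ignored, neighbours only
            return sum_b * inv, sum_g * inv, sum_r * inv
        inv = 1.0 / (wsum + 1.0)             # centre folded in with weight 1
        return (sum_b + b) * inv, (sum_g + g) * inv, (sum_r + r) * inv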
-    patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative
-    // TODO: make replacement parameter avalible in the outside function interface
+
    // allocate lookup tables
    std::vector<float> _space_weight(d*d);
    std::vector<int> _space_ofs(d*d);
@@ -620,7 +1229,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
        for( j = -radius; j <= radius; j++ )
        {
            double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
+            if( r > radius || ( i == 0 && j == 0 ) )
                continue;
            space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
            space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
diff --git a/samples/dnn/tf_text_graph_mask_rcnn.py b/samples/dnn/tf_text_graph_mask_rcnn.py
index b92d4623b8..aaefe456ad 100644
--- a/samples/dnn/tf_text_graph_mask_rcnn.py
+++ b/samples/dnn/tf_text_graph_mask_rcnn.py
@@ -38,6 +38,8 @@ aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
width_stride = float(grid_anchor_generator['width_stride'][0])
height_stride = float(grid_anchor_generator['height_stride'][0])
features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
+first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
+first_stage_max_proposals = int(config['first_stage_max_proposals'][0])

print('Number of classes: %d' % num_classes)
print('Scales: %s' % str(scales))
@@ -53,7 +55,8 @@ graph_def = parseTextGraph(args.output)
removeIdentity(graph_def)

def to_remove(name, op):
-    return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep)
+    return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
+           (name.startswith('CropAndResize') and op != 'CropAndResize')

removeUnusedNodesAndAttrs(to_remove, graph_def)
@@ -123,20 +126,22 @@ detectionOut.input.append('proposals')
detectionOut.addAttr('num_classes', 2)
detectionOut.addAttr('share_location', True)
detectionOut.addAttr('background_label_id', 0)
-detectionOut.addAttr('nms_threshold', 0.7)
+detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
detectionOut.addAttr('top_k', 6000)
detectionOut.addAttr('code_type', "CENTER_SIZE")
-detectionOut.addAttr('keep_top_k', 100)
+detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
detectionOut.addAttr('clip', True)

graph_def.node.extend([detectionOut])

# Save as text.
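Two smaller changes ride along here: the spatial kernel now skips the centre tap (i == 0 and j == 0), because the centre is added explicitly during normalisation as shown earlier, and the Mask R-CNN script reads the first-stage NMS threshold and proposal count from the pipeline config instead of hard-coding 0.7 and 100. A Python sketch of the spatial-table construction under those assumptions (row_step stands in for temp.step / sizeof(float) and is a made-up placeholder; the hunk continues below):

    import math

    def build_space_kernel(radius, sigma_space, cn, row_step):
        gauss_space_coeff = -0.5 / (sigma_space * sigma_space)
        space_weight, space_ofs = [], []
        for i in range(-radius, radius + 1):
            for j in range(-radius, radius + 1):
                r = math.hypot(i, j)
                if r > radius or (i == 0 and j == 0):   # centre tap excluded
                    continue
                space_weight.append(math.exp(r * r * gauss_space_coeff))
                space_ofs.append(i * row_step + j * cn)
        return space_weight, space_ofs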
+cropAndResizeNodesNames = []
for node in reversed(topNodes):
    if node.op != 'CropAndResize':
        graph_def.node.extend([node])
        topNodes.pop()
    else:
+        cropAndResizeNodesNames.append(node.name)
        if numCropAndResize == 1:
            break
        else:
@@ -166,11 +171,15 @@
for i in reversed(range(len(graph_def.node))):
    if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
                                  'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
-                                  'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape']:
+                                  'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
        del graph_def.node[i]

for node in graph_def.node:
-    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
+    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
+       node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
        node.op = 'Flatten'
        node.input.pop()

@@ -178,6 +187,12 @@ for node in graph_def.node:
                     'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
        node.addAttr('loc_pred_transposed', True)

+    if node.name.startswith('MaxPool2D'):
+        assert(node.op == 'MaxPool')
+        assert(len(cropAndResizeNodesNames) == 2)
+        node.input = [cropAndResizeNodesNames[0]]
+        del cropAndResizeNodesNames[0]
+
################################################################################
### Postprocessing
################################################################################
@@ -223,6 +238,11 @@ graph_def.node.extend([detectionOut])

for node in reversed(topNodes):
    graph_def.node.extend([node])
+    if node.name.startswith('MaxPool2D'):
+        assert(node.op == 'MaxPool')
+        assert(len(cropAndResizeNodesNames) == 1)
+        node.input = [cropAndResizeNodesNames[0]]
+
for i in reversed(range(len(graph_def.node))):
    if graph_def.node[i].op == 'CropAndResize':
        graph_def.node[i].input.insert(1, 'detection_out_final')
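Finally, the script records the names of the CropAndResize nodes it keeps and feeds them, in order, to the two MaxPool2D nodes of the box and mask branches. The sketch below shows that rewiring idea on deliberately simplified stand-in node objects; the real script operates on the parsed text graph (graph_def) and its helper API, which is not reproduced here.

    class Node:
        """Minimal stand-in for a text-graph node: name, op, list of inputs."""
        def __init__(self, name, op, inputs=None):
            self.name, self.op, self.input = name, op, list(inputs or [])

    def rewire_maxpools(nodes, crop_and_resize_names):
        """Give every MaxPool2D node the next kept CropAndResize output as its only input."""
        names = list(crop_and_resize_names)
        for node in nodes:
            if node.name.startswith('MaxPool2D'):
                assert node.op == 'MaxPool'
                node.input = [names.pop(0)]
        return nodes

    # Hypothetical usage: two CropAndResize names were collected while walking the
    # graph, and two MaxPool2D nodes need to consume their outputs in that order.
    nodes = [Node('MaxPool2D/MaxPool', 'MaxPool'), Node('MaxPool2D_1/MaxPool', 'MaxPool')]
    rewire_maxpools(nodes, ['CropAndResize', 'CropAndResize_1'])

Consuming the collected names strictly in order is what the length asserts and the del in the patched loops enforce: the first MaxPool2D is fed by the first-stage CropAndResize, the second by the mask-branch one.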