From d354ad1c34c3b83427ca67bedc89b6c7b8784ce7 Mon Sep 17 00:00:00 2001 From: Tatsuro Shibamura Date: Sun, 27 Feb 2022 02:35:03 +0900 Subject: [PATCH] Merge pull request #21630 from shibayan:arm64-msvc-neon * Added NEON support in builds for Windows on ARM * Fixed `HAVE_CPU_NEON_SUPPORT` display broken during compiler test * Fixed a build error prior to Visual Studio 2022 --- cmake/OpenCVCompilerOptions.cmake | 4 +++ cmake/checks/cpu_neon.cpp | 1 + .../include/opencv2/core/hal/intrin_neon.hpp | 30 ++++++++++--------- modules/core/src/system.cpp | 3 ++ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index bcb8a3e203..4f5c353980 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -314,6 +314,10 @@ if(MSVC) set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} /FS") set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} /FS") endif() + + if(AARCH64 AND NOT MSVC_VERSION LESS 1930) + set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /D _ARM64_DISTINCT_NEON_TYPES") + endif() endif() if(PROJECT_NAME STREQUAL "OpenCV") diff --git a/cmake/checks/cpu_neon.cpp b/cmake/checks/cpu_neon.cpp index c309e85049..bb103ec366 100644 --- a/cmake/checks/cpu_neon.cpp +++ b/cmake/checks/cpu_neon.cpp @@ -1,6 +1,7 @@ #include #if defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) +# define _ARM64_DISTINCT_NEON_TYPES # include # include # define CV_NEON 1 diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index e17972a3fc..28cf813379 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -591,28 +591,26 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { +#if CV_NEON_AARCH64 + int32x4_t c = vmull_high_s16(a.val, b.val); +#else // #if CV_NEON_AARCH64 + int32x4_t c = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); +#endif // #if CV_NEON_AARCH64 return v_int16x8(vcombine_s16( vshrn_n_s32(vmull_s16( vget_low_s16(a.val), vget_low_s16(b.val)), 16), - vshrn_n_s32( -#if CV_NEON_AARCH64 - vmull_high_s16(a.val, b.val) -#else // #if CV_NEON_AARCH64 - vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)) -#endif // #if CV_NEON_AARCH64 - , 16) + vshrn_n_s32(c, 16) )); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { +#if CV_NEON_AARCH64 + uint32x4_t c = vmull_high_u16(a.val, b.val); +#else // #if CV_NEON_AARCH64 + uint32x4_t c = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); +#endif // #if CV_NEON_AARCH64 return v_uint16x8(vcombine_u16( vshrn_n_u32(vmull_u16( vget_low_u16(a.val), vget_low_u16(b.val)), 16), - vshrn_n_u32( -#if CV_NEON_AARCH64 - vmull_high_u16(a.val, b.val) -#else // #if CV_NEON_AARCH64 - vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)) -#endif // #if CV_NEON_AARCH64 - , 16) + vshrn_n_u32(c, 16) )); } @@ -1937,10 +1935,14 @@ inline v_int32x4 v_round(const v_float32x4& a) { float32x4_t a_ = a.val; int32x4_t result; +#if defined _MSC_VER + result = vcvtnq_s32_f32(a_); +#else __asm__ ("fcvtns %0.4s, %1.4s" : "=w"(result) : "w"(a_) : /* No clobbers */); +#endif return v_int32x4(result); } #else diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index ebafee59e0..d2231fe952 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -615,6 +615,9 @@ struct HWFeatures #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800) have[CV_CPU_NEON] = true; #endif + #if defined _M_ARM64 + have[CV_CPU_NEON] = true; + #endif #ifdef __riscv_vector have[CV_CPU_RISCVV] = true; #endif