From ff4c3873f2f805107c2053b001245c521d63ec3a Mon Sep 17 00:00:00 2001 From: Zhangyin Date: Mon, 29 Jun 2020 15:50:24 +0800 Subject: [PATCH] Added cmake toolchain for RISC-V with clang. - Added cross compile cmake file for target riscv64-clang - Extended cmake for RISC-V and added instruction checks - Created intrin_rvv.hpp with C++ version universal intrinsics --- cmake/OpenCVCompilerOptimizations.cmake | 11 + cmake/checks/cpu_rvv.cpp | 23 + .../include/opencv2/core/cv_cpu_dispatch.h | 8 + .../core/include/opencv2/core/cv_cpu_helper.h | 21 + modules/core/include/opencv2/core/cvdef.h | 4 + .../core/include/opencv2/core/hal/intrin.hpp | 4 + .../include/opencv2/core/hal/intrin_rvv.hpp | 1678 +++++++++++++++++ modules/core/src/system.cpp | 6 + platforms/linux/riscv64-clang.toolchain.cmake | 27 + 9 files changed, 1782 insertions(+) create mode 100644 cmake/checks/cpu_rvv.cpp create mode 100644 modules/core/include/opencv2/core/hal/intrin_rvv.hpp create mode 100644 platforms/linux/riscv64-clang.toolchain.cmake diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 970dd28903..6fda0c84eb 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -49,6 +49,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SK list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16) list(APPEND CPU_ALL_OPTIMIZATIONS MSA) list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3) +list(APPEND CPU_ALL_OPTIMIZATIONS RVV) list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS) ocv_update(CPU_VFPV3_FEATURE_ALIAS "") @@ -102,6 +103,8 @@ ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF) ocv_optimization_process_obsolete_option(ENABLE_VSX VSX ON) +ocv_optimization_process_obsolete_option(ENABLE_RVV RVV OFF) + macro(ocv_is_optimization_in_list resultvar check_opt) set(__checked "") set(__queue ${ARGN}) @@ -366,6 +369,14 @@ elseif(PPC64LE) set(CPU_DISPATCH "VSX3" CACHE STRING "${HELP_CPU_DISPATCH}") set(CPU_BASELINE "VSX" CACHE STRING "${HELP_CPU_BASELINE}") + +elseif(RISCV) + ocv_update(CPU_RVV_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_rvv.cpp") + ocv_update(CPU_KNOWN_OPTIMIZATIONS "RVV") + ocv_update(CPU_RVV_FLAGS_ON "") + set(CPU_DISPATCH "RVV" CACHE STRING "${HELP_CPU_DISPATCH}") + set(CPU_BASELINE "RVV" CACHE STRING "${HELP_CPU_BASELINE}") + endif() # Helper values for cmake-gui diff --git a/cmake/checks/cpu_rvv.cpp b/cmake/checks/cpu_rvv.cpp new file mode 100644 index 0000000000..a3eab2abc4 --- /dev/null +++ b/cmake/checks/cpu_rvv.cpp @@ -0,0 +1,23 @@ +#include + +#if defined(__riscv) +# include +# define CV_RVV 1 +#endif + +#if defined CV_RVV +int test() +{ + const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f }; + vfloat32m1_t val = vle32_v_f32m1((const float*)(src)); + return (int)vfmv_f_s_f32m1_f32(val); +} +#else +#error "RISC-V vector extension(RVV) is not supported" +#endif + +int main() +{ + printf("%d\n", test()); + return 0; +} diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index 42651aed5e..eb3f8693c2 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -168,6 +168,10 @@ # include #endif +#if defined CV_CPU_COMPILE_RVV +# define CV_RVV 1 +#endif + #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX @@ -343,3 +347,7 @@ struct VZeroUpperGuard { #ifndef CV_WASM_SIMD # define CV_WASM_SIMD 0 #endif + +#ifndef CV_RVV +# define CV_RVV 0 +#endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index aaa89ed415..39ae0b91f7 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -483,5 +483,26 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV +# define CV_TRY_RVV 1 +# define CV_CPU_FORCE_RVV 1 +# define CV_CPU_HAS_SUPPORT_RVV 1 +# define CV_CPU_CALL_RVV(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_RVV_(fn, args) return (opt_RVV::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV +# define CV_TRY_RVV 1 +# define CV_CPU_FORCE_RVV 0 +# define CV_CPU_HAS_SUPPORT_RVV (cv::checkHardwareSupport(CV_CPU_RVV)) +# define CV_CPU_CALL_RVV(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args) +# define CV_CPU_CALL_RVV_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args) +#else +# define CV_TRY_RVV 0 +# define CV_CPU_FORCE_RVV 0 +# define CV_CPU_HAS_SUPPORT_RVV 0 +# define CV_CPU_CALL_RVV(fn, args) +# define CV_CPU_CALL_RVV_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index e66a646710..de89425eef 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -260,6 +260,8 @@ namespace cv { #define CV_CPU_VSX 200 #define CV_CPU_VSX3 201 +#define CV_CPU_RVV 210 + // CPU features groups #define CV_CPU_AVX512_SKX 256 #define CV_CPU_AVX512_COMMON 257 @@ -312,6 +314,8 @@ enum CpuFeatures { CPU_VSX = 200, CPU_VSX3 = 201, + CPU_RVV = 210, + CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512 CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 52f6b5d552..0ffcb49cea 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -199,6 +199,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_VSX # undef CV_FP16 # undef CV_MSA +# undef CV_RVV #endif #if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP) @@ -226,6 +227,9 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP) #include "opencv2/core/hal/intrin_wasm.hpp" +#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) +#include "opencv2/core/hal/intrin_rvv.hpp" + #else #include "opencv2/core/hal/intrin_cpp.hpp" diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp new file mode 100644 index 0000000000..eca787c7fd --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp @@ -0,0 +1,1678 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_RVV_HPP +#define OPENCV_HAL_INTRIN_RVV_HPP + +#include +#include +#include +#include "opencv2/core/saturate.hpp" + +#define CV_SIMD128_CPP 1 +#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN) +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 +#endif + +namespace cv +{ + +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN +#endif + + +template struct v_reg +{ + typedef _Tp lane_type; + enum { nlanes = n }; + + explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } + + v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } + + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } + + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, + _Tp s4, _Tp s5, _Tp s6, _Tp s7) + { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; + } + + v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, + _Tp s4, _Tp s5, _Tp s6, _Tp s7, + _Tp s8, _Tp s9, _Tp s10, _Tp s11, + _Tp s12, _Tp s13, _Tp s14, _Tp s15) + { + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; + s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; + s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; + s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; + } + + v_reg() {} + + v_reg(const v_reg<_Tp, n> & r) + { + for( int i = 0; i < n; i++ ) + s[i] = r.s[i]; + } + _Tp get0() const { return s[0]; } + + _Tp get(const int i) const { return s[i]; } + v_reg<_Tp, n> high() const + { + v_reg<_Tp, n> c; + int i; + for( i = 0; i < n/2; i++ ) + { + c.s[i] = s[i+(n/2)]; + c.s[i+(n/2)] = 0; + } + return c; + } + + static v_reg<_Tp, n> zero() + { + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (_Tp)0; + return c; + } + + static v_reg<_Tp, n> all(_Tp s) + { + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = s; + return c; + } + + template v_reg<_Tp2, n2> reinterpret_as() const + { + size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); + v_reg<_Tp2, n2> c; + std::memcpy(&c.s[0], &s[0], bytes); + return c; + } + + v_reg& operator=(const v_reg<_Tp, n> & r) + { + for( int i = 0; i < n; i++ ) + s[i] = r.s[i]; + return *this; + } + + _Tp s[n]; +}; + +typedef v_reg v_uint8x16; +typedef v_reg v_int8x16; +typedef v_reg v_uint16x8; +typedef v_reg v_int16x8; +typedef v_reg v_uint32x4; +typedef v_reg v_int32x4; +typedef v_reg v_float32x4; +typedef v_reg v_float64x2; +typedef v_reg v_uint64x2; +typedef v_reg v_int64x2; + +template CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + + +template CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); + +template CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); + + +#ifndef CV_DOXYGEN + +#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \ +__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(short, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(int, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \ + +#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \ +__CV_EXPAND(macro_name(float, __VA_ARGS__)) \ +__CV_EXPAND(macro_name(double, __VA_ARGS__)) \ + +#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ + +#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ +template inline \ +v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return c; \ +} \ +template inline \ +v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ + return a; \ +} + +#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) + +CV__HAL_INTRIN_IMPL_BIN_OP(+) +CV__HAL_INTRIN_IMPL_BIN_OP(-) +CV__HAL_INTRIN_IMPL_BIN_OP(*) +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) + +#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ +template CV_INLINE \ +v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return c; \ +} \ +template CV_INLINE \ +v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + for( int i = 0; i < n; i++ ) \ + a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ + V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ + return a; \ +} + +#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ + + +CV__HAL_INTRIN_IMPL_BIT_OP(&) +CV__HAL_INTRIN_IMPL_BIT_OP(|) +CV__HAL_INTRIN_IMPL_BIT_OP(^) + +#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ +template CV_INLINE \ +v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \ + return c; \ +} \ + +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) + +#endif + + +#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ +template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ +{ \ + v_reg<_Tp2, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i]); \ + return c; \ +} + +#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \ +inline v_reg func(const v_reg& a) \ +{ \ + v_reg c; \ + for( int i = 0; i < 4; i++ ) \ + c.s[i] = cfunc(a.s[i]); \ + return c; \ +} \ +inline v_reg func(const v_reg& a) \ +{ \ + v_reg c; \ + for( int i = 0; i < 2; i++ ) \ + { \ + c.s[i] = cfunc(a.s[i]); \ + c.s[i + 2] = 0; \ + } \ + return c; \ +} + +OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) + +OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) +OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) + +OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, + typename V_TypeTraits<_Tp>::abs_type) + +OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound) + +OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor) + +OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil) + +OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int) + +#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ +template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cfunc(a.s[i], b.s[i]); \ + return c; \ +} + +#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ +template inline _Tp func(const v_reg<_Tp, n>& a) \ +{ \ + _Tp c = a.s[0]; \ + for( int i = 1; i < n; i++ ) \ + c = cfunc(c, a.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) + +OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) + +OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) + +OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) + +static const unsigned char popCountTable[] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +template +inline v_reg::abs_type, n> v_popcount(const v_reg<_Tp, n>& a) +{ + v_reg::abs_type, n> b = v_reg::abs_type, n>::zero(); + for (int i = 0; i < n*(int)sizeof(_Tp); i++) + b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]]; + return b; +} + + +template +inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) +{ + for( int i = 0; i < n; i++ ) + { + minval.s[i] = std::min(a.s[i], b.s[i]); + maxval.s[i] = std::max(a.s[i], b.s[i]); + } +} + +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +template \ +inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef typename V_TypeTraits<_Tp>::int_type itype; \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ + return c; \ +} + +OPENCV_HAL_IMPL_CMP_OP(<) + +OPENCV_HAL_IMPL_CMP_OP(>) + +OPENCV_HAL_IMPL_CMP_OP(<=) + +OPENCV_HAL_IMPL_CMP_OP(>=) + +OPENCV_HAL_IMPL_CMP_OP(==) + +OPENCV_HAL_IMPL_CMP_OP(!=) + +template +inline v_reg v_not_nan(const v_reg& a) +{ + typedef typename V_TypeTraits::int_type itype; + v_reg c; + for (int i = 0; i < n; i++) + c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); + return c; +} +template +inline v_reg v_not_nan(const v_reg& a) +{ + typedef typename V_TypeTraits::int_type itype; + v_reg c; + for (int i = 0; i < n; i++) + c.s[i] = V_TypeTraits::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i])); + return c; +} + +#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \ +template \ +inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + typedef _Tp2 rtype; \ + v_reg c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ + return c; \ +} + +OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp) + +OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp) + +OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp) + +template inline T _absdiff(T a, T b) +{ + return a > b ? a - b : b - a; +} + +template +inline v_reg::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) +{ + typedef typename V_TypeTraits<_Tp>::abs_type rtype; + v_reg c; + const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0); + for( int i = 0; i < n; i++ ) + { + rtype ua = a.s[i] ^ mask; + rtype ub = b.s[i] ^ mask; + c.s[i] = _absdiff(ua, ub); + } + return c; +} + +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 c; + for( int i = 0; i < c.nlanes; i++ ) + c.s[i] = _absdiff(a.s[i], b.s[i]); + return c; +} + +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ + v_float64x2 c; + for( int i = 0; i < c.nlanes; i++ ) + c.s[i] = _absdiff(a.s[i], b.s[i]); + return c; +} + +template +inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++) + c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i])); + return c; +} + +template +inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = 1.f/std::sqrt(a.s[i]); + return c; +} + +template +inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); + return c; +} + +template +inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; + return c; +} + +template +inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + v_reg<_Tp, n> d; + for( int i = 0; i < n; i++ ) + d.s[i] = a.s[i]*b.s[i] + c.s[i]; + return d; +} + +template +inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<_Tp, n>& c) +{ + return v_fma(a, b, c); +} + +template inline v_reg::w_type, n/2> +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg c; + for( int i = 0; i < (n/2); i++ ) + c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; + return c; +} + +template inline v_reg::w_type, n/2> +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg::w_type, n / 2>& c) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg s; + for( int i = 0; i < (n/2); i++ ) + s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i]; + return s; +} + +template inline v_reg::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod(a, b); } + +template inline v_reg::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg::w_type, n / 2>& c) +{ return v_dotprod(a, b, c); } + +template inline v_reg::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3]; + return s; +} + +template inline v_reg::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg::q_type, n / 4>& c) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i]; + return s; +} + +template inline v_reg::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod_expand(a, b); } + +template inline v_reg::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg::q_type, n / 4>& c) +{ return v_dotprod_expand(a, b, c); } + +template inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg::w_type, n/2>& c, + v_reg::w_type, n/2>& d) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = (w_type)a.s[i]*b.s[i]; + d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; + } +} + +template inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg<_Tp, n> c; + for (int i = 0; i < n; i++) + c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8); + return c; +} + +template inline void v_hsum(const v_reg<_Tp, n>& a, + v_reg::w_type, n/2>& c) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; + } +} + +#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ +template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +{ \ + v_reg<_Tp, n> c; \ + for( int i = 0; i < n; i++ ) \ + c.s[i] = (_Tp)(a.s[i] shift_op imm); \ + return c; \ +} + +OPENCV_HAL_IMPL_SHIFT_OP(<< ) + +OPENCV_HAL_IMPL_SHIFT_OP(>> ) + +#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \ +template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \ +{ \ + v_reg<_Tp, n> b; \ + for (int i = 0; i < n; i++) \ + { \ + int sIndex = i opA imm; \ + if (0 <= sIndex && sIndex < n) \ + { \ + b.s[i] = a.s[sIndex]; \ + } \ + else \ + { \ + b.s[i] = 0; \ + } \ + } \ + return b; \ +} \ +template inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +{ \ + v_reg<_Tp, n> c; \ + for (int i = 0; i < n; i++) \ + { \ + int aIndex = i opA imm; \ + int bIndex = i opA imm opB n; \ + if (0 <= bIndex && bIndex < n) \ + { \ + c.s[i] = b.s[bIndex]; \ + } \ + else if (0 <= aIndex && aIndex < n) \ + { \ + c.s[i] = a.s[aIndex]; \ + } \ + else \ + { \ + c.s[i] = 0; \ + } \ + } \ + return c; \ +} + +OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +) + +OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -) + +template inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) +{ + typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; + for( int i = 1; i < n; i++ ) + c += a.s[i]; + return c; +} + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v_float32x4 r; + r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3]; + r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3]; + r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3]; + r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3]; + return r; +} + +template inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]); + for (int i = 1; i < n; i++) + c += _absdiff(a.s[i], b.s[i]); + return c; +} + +template inline int v_signmask(const v_reg<_Tp, n>& a) +{ + int mask = 0; + for( int i = 0; i < n; i++ ) + mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; + return mask; +} + +template inline int v_scan_forward(const v_reg<_Tp, n>& a) +{ + for (int i = 0; i < n; i++) + if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) + return i; + return 0; +} + +template inline bool v_check_all(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) + return false; + return true; +} + +template inline bool v_check_any(const v_reg<_Tp, n>& a) +{ + for( int i = 0; i < n; i++ ) + if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) + return true; + return false; +} + +template inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, + const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef V_TypeTraits<_Tp> Traits; + typedef typename Traits::int_type int_type; + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + { + int_type m = Traits::reinterpret_int(mask.s[i]); + CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc + c.s[i] = m ? a.s[i] : b.s[i]; + } + return c; +} + +template inline void v_expand(const v_reg<_Tp, n>& a, + v_reg::w_type, n/2>& b0, + v_reg::w_type, n/2>& b1) +{ + for( int i = 0; i < (n/2); i++ ) + { + b0.s[i] = a.s[i]; + b1.s[i] = a.s[i+(n/2)]; + } +} + +template +inline v_reg::w_type, n/2> +v_expand_low(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i]; + return b; +} + +template +inline v_reg::w_type, n/2> +v_expand_high(const v_reg<_Tp, n>& a) +{ + v_reg::w_type, n/2> b; + for( int i = 0; i < (n/2); i++ ) + b.s[i] = a.s[i+(n/2)]; + return b; +} + +template inline v_reg::int_type, n> + v_reinterpret_as_int(const v_reg<_Tp, n>& a) +{ + v_reg::int_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); + return c; +} + +template inline v_reg::uint_type, n> + v_reinterpret_as_uint(const v_reg<_Tp, n>& a) +{ + v_reg::uint_type, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); + return c; +} + +template inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, + v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) +{ + int i; + for( i = 0; i < n/2; i++ ) + { + b0.s[i*2] = a0.s[i]; + b0.s[i*2+1] = a1.s[i]; + } + for( ; i < n; i++ ) + { + b1.s[i*2-n] = a0.s[i]; + b1.s[i*2-n+1] = a1.s[i]; + } +} + +template +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); +} + +template +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr) +{ + CV_Assert(isAligned::nlanes128>)>(ptr)); + return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr); +} + +template +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for( int i = 0; i < c.nlanes/2; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template +inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(loptr)); + CV_Assert(isAligned(hiptr)); +#endif + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for( int i = 0; i < c.nlanes/2; i++ ) + { + c.s[i] = loptr[i]; + c.s[i+c.nlanes/2] = hiptr[i]; + } + return c; +} + +template +inline v_reg::w_type, V_TypeTraits<_Tp>::nlanes128 / 2> +v_load_expand(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg::nlanes128> c; + for( int i = 0; i < c.nlanes; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template +inline v_reg::q_type, V_TypeTraits<_Tp>::nlanes128 / 4> +v_load_expand_q(const _Tp* ptr) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg::nlanes128> c; + for( int i = 0; i < c.nlanes; i++ ) + { + c.s[i] = ptr[i]; + } + return c; +} + +template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i2; + for( i = i2 = 0; i < n; i++, i2 += 2 ) + { + a.s[i] = ptr[i2]; + b.s[i] = ptr[i2+1]; + } +} + +template inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + a.s[i] = ptr[i3]; + b.s[i] = ptr[i3+1]; + c.s[i] = ptr[i3+2]; + } +} + +template +inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, + v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, + v_reg<_Tp, n>& d) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + a.s[i] = ptr[i4]; + b.s[i] = ptr[i4+1]; + c.s[i] = ptr[i4+2]; + d.s[i] = ptr[i4+3]; + } +} + +template +inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i2; + for( i = i2 = 0; i < n; i++, i2 += 2 ) + { + ptr[i2] = a.s[i]; + ptr[i2+1] = b.s[i]; + } +} + +template +inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i3; + for( i = i3 = 0; i < n; i++, i3 += 3 ) + { + ptr[i3] = a.s[i]; + ptr[i3+1] = b.s[i]; + ptr[i3+2] = c.s[i]; + } +} + +template inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, + const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, + const v_reg<_Tp, n>& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + int i, i4; + for( i = i4 = 0; i < n; i++, i4 += 4 ) + { + ptr[i4] = a.s[i]; + ptr[i4+1] = b.s[i]; + ptr[i4+2] = c.s[i]; + ptr[i4+3] = d.s[i]; + } +} + +template +inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + for( int i = 0; i < n; i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + v_store(ptr, a); +} + +template +inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i]; +} + +template +inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) +{ +#if CV_STRONG_ALIGNMENT + CV_Assert(isAligned(ptr)); +#endif + for( int i = 0; i < (n/2); i++ ) + ptr[i] = a.s[i+(n/2)]; +} + +template +inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + CV_Assert(isAligned)>(ptr)); + v_store(ptr, a); +} + +template +inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a) +{ + CV_Assert(isAligned)>(ptr)); + v_store(ptr, a); +} + +template +inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/) +{ + CV_Assert(isAligned)>(ptr)); + v_store(ptr, a); +} + +template +inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = a.s[i]; + c.s[i+(n/2)] = b.s[i]; + } + return c; +} + +template +inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < (n/2); i++ ) + { + c.s[i] = a.s[i+(n/2)]; + c.s[i+(n/2)] = b.s[i+(n/2)]; + } + return c; +} + +template +inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) +{ + for( int i = 0; i < (n/2); i++ ) + { + low.s[i] = a.s[i]; + low.s[i+(n/2)] = b.s[i]; + high.s[i] = a.s[i+(n/2)]; + high.s[i+(n/2)] = b.s[i+(n/2)]; + } +} + +template +inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a) +{ + v_reg<_Tp, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = a.s[n-i-1]; + return c; +} + +template +inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + v_reg<_Tp, n> r; + const int shift = n - s; + int i = 0; + for (; i < shift; ++i) + r.s[i] = a.s[i+s]; + for (; i < n; ++i) + r.s[i] = b.s[i-shift]; + return r; +} + +template +inline _Tp v_extract_n(const v_reg<_Tp, n>& v) +{ + CV_DbgAssert(s >= 0 && s < n); + return v.s[s]; +} + +template +inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a) +{ + CV_DbgAssert(i >= 0 && i < n); + return v_reg<_Tp, n>::all(a.s[i]); +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvRound(a.s[i]); + return c; +} + +template inline v_reg v_round(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = cvRound(b.s[i]); + } + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvFloor(a.s[i]); + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = cvCeil(a.s[i]); + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (int)(a.s[i]); + return c; +} + +template inline v_reg v_round(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvRound(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_floor(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvFloor(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_ceil(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_trunc(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = cvCeil(a.s[i]); + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_cvt_f32(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (float)a.s[i]; + return c; +} + +template inline v_reg v_cvt_f32(const v_reg& a) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = (float)a.s[i]; + c.s[i+n] = 0; + } + return c; +} + +template inline v_reg v_cvt_f32(const v_reg& a, const v_reg& b) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + { + c.s[i] = (float)a.s[i]; + c.s[i+n] = (float)b.s[i]; + } + return c; +} + +CV_INLINE v_reg v_cvt_f64(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i + 2]; + return c; +} + +CV_INLINE v_reg v_cvt_f64(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i + 2]; + return c; +} + +CV_INLINE v_reg v_cvt_f64(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + +CV_INLINE v_reg v_cvt_f64_high(const v_reg& a) +{ + enum { n = 2 }; + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + + +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i]]; + return c; +} +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i / 2] + i % 2]; + return c; +} +template inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx) +{ + v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; + for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++) + c.s[i] = tab[idx[i / 4] + i % 4]; + return c; +} + +template inline v_reg v_lut(const int* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const unsigned* tab, const v_reg& idx) +{ + v_reg c; + for (int i = 0; i < n; i++) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const float* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + +template inline v_reg v_lut(const double* tab, const v_reg& idx) +{ + v_reg c; + for( int i = 0; i < n; i++ ) + c.s[i] = tab[idx.s[i]]; + return c; +} + + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + return v_lut(tab, idxvec.s); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_lut(tab, idxvec.s); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + return v_lut(tab, idxvec.s); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + return v_lut(tab, idxvec.s); +} + + +template inline void v_lut_deinterleave(const float* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + +template inline void v_lut_deinterleave(const double* tab, const v_reg& idx, + v_reg& x, v_reg& y) +{ + for( int i = 0; i < n; i++ ) + { + int j = idx.s[i]; + x.s[i] = tab[j]; + y.s[i] = tab[j+1]; + } +} + +template inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec) +{ + v_reg<_Tp, n> c; + for (int i = 0; i < n/4; i++) + { + c.s[4*i ] = vec.s[4*i ]; + c.s[4*i+1] = vec.s[4*i+2]; + c.s[4*i+2] = vec.s[4*i+1]; + c.s[4*i+3] = vec.s[4*i+3]; + } + return c; +} + +template inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec) +{ + v_reg<_Tp, n> c; + for (int i = 0; i < n/8; i++) + { + c.s[8*i ] = vec.s[8*i ]; + c.s[8*i+1] = vec.s[8*i+4]; + c.s[8*i+2] = vec.s[8*i+1]; + c.s[8*i+3] = vec.s[8*i+5]; + c.s[8*i+4] = vec.s[8*i+2]; + c.s[8*i+5] = vec.s[8*i+6]; + c.s[8*i+6] = vec.s[8*i+3]; + c.s[8*i+7] = vec.s[8*i+7]; + } + return c; +} + +template inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec) +{ + v_reg<_Tp, n> c; + for (int i = 0; i < n/4; i++) + { + c.s[3*i ] = vec.s[4*i ]; + c.s[3*i+1] = vec.s[4*i+1]; + c.s[3*i+2] = vec.s[4*i+2]; + } + return c; +} + +template +inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, + const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, + v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, + v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) +{ + b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); + b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); + b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); + b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); +} + +#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } + +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) + +#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } + +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) + +#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ +template inline _Tpvec \ + v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ +{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } + +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) +OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) +OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) + +#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ return a << n; } + +OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) +OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) +OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ return a >> n; } + +OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) +OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) +OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ +template inline _Tpvec v_rshr(const _Tpvec& a) \ +{ \ + _Tpvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + return c; \ +} + +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) +OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) +OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \ +inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpnvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + { \ + c.s[i] = cast<_Tpn>(a.s[i]); \ + c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \ + } \ + return c; \ +} + +OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast) + +#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ +template inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpnvec c; \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + { \ + c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ + } \ + return c; \ +} + +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) + +#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ +inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +{ \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + ptr[i] = cast<_Tpn>(a.s[i]); \ +} + +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) + +#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \ +template inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ +{ \ + for( int i = 0; i < _Tpvec::nlanes; i++ ) \ + ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ +} + +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast) +OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast) + +template +inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + for (int i = 0; i < n; ++i) + { + mptr[i] = (_Tpm)a.s[i]; + mptr[i + n] = (_Tpm)b.s[i]; + } +} + + + +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + return mask; +} + + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 8, c, d); + return mask; +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 4, c, d); + _pack_b(mask.s + 8, e, f); + _pack_b(mask.s + 12, g, h); + return mask; +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], + v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], + v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], + v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0], + v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1], + v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2], + v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]); +} + + +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } + +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +////// FP16 support /////// + +inline v_reg::nlanes128> +v_load_expand(const float16_t* ptr) +{ + v_reg::nlanes128> v; + for( int i = 0; i < v.nlanes; i++ ) + { + v.s[i] = ptr[i]; + } + return v; +} + +inline void +v_pack_store(float16_t* ptr, const v_reg::nlanes128>& v) +{ + for( int i = 0; i < v.nlanes; i++ ) + { + ptr[i] = float16_t(v.s[i]); + } +} + +inline void v_cleanup() {} + + +#ifndef CV_DOXYGEN +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END +#endif +} + +#endif diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 872019dd9e..2fecc7c299 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -399,6 +399,8 @@ struct HWFeatures g_hwFeatureNames[CPU_AVX512_CNL] = "AVX512-CNL"; g_hwFeatureNames[CPU_AVX512_CLX] = "AVX512-CLX"; g_hwFeatureNames[CPU_AVX512_ICL] = "AVX512-ICL"; + + g_hwFeatureNames[CPU_RVV] = "RVV"; } void initialize(void) @@ -597,6 +599,10 @@ struct HWFeatures have[CV_CPU_VSX3] = (CV_VSX3); #endif + #if defined __riscv && defined __riscv_vector + have[CV_CPU_RVV] = true; + #endif + bool skip_baseline_check = false; #ifndef NO_GETENV if (getenv("OPENCV_SKIP_CPU_BASELINE_CHECK")) diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake new file mode 100644 index 0000000000..58e06ddf7b --- /dev/null +++ b/platforms/linux/riscv64-clang.toolchain.cmake @@ -0,0 +1,27 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(RISCV_CLANG_BUILD_ROOT /opt/rvv-llvm CACHE PATH "Path to CLANG for RISC-V cross compiler build directory") +set(RISCV_GCC_INSTALL_ROOT /opt/RISCV CACHE PATH "Path to GCC for RISC-V cross compiler installation directory") +set(CMAKE_SYSROOT ${RISCV_GCC_INSTALL_ROOT}/sysroot CACHE PATH "RISC-V sysroot") + +set(CLANG_TARGET_TRIPLE riscv64-unknown-linux-gnu) + +set(CMAKE_C_COMPILER ${RISCV_CLANG_BUILD_ROOT}/bin/clang) +set(CMAKE_C_COMPILER_TARGET ${CLANG_TARGET_TRIPLE}) +set(CMAKE_CXX_COMPILER ${RISCV_CLANG_BUILD_ROOT}/bin/clang++) +set(CMAKE_CXX_COMPILER_TARGET ${CLANG_TARGET_TRIPLE}) +set(CMAKE_ASM_COMPILER ${RISCV_CLANG_BUILD_ROOT}/bin/clang) +set(CMAKE_ASM_COMPILER_TARGET ${CLANG_TARGET_TRIPLE}) + +# Don't run the linker on compiler check +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + +set(CMAKE_C_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w ${CXX_FLAGS}") + +set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT}) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)