RISC-V: support RVV 0.7 in mainline RVV intrinsics
This commit is contained in:
parent
47293f28cf
commit
903ec0ec60
@ -10,13 +10,27 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
|
// Building for T-Head C906 core with RVV 0.7.1 using toolchain
|
||||||
|
// https://github.com/T-head-Semi/xuantie-gnu-toolchain
|
||||||
|
// with option '-march=rv64gcv0p7'
|
||||||
|
#ifdef __THEAD_VERSION__
|
||||||
|
# if __riscv_v == 7000
|
||||||
|
# include <fenv.h>
|
||||||
|
# define CV_RVV_THEAD_0_7
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace cv
|
namespace cv
|
||||||
{
|
{
|
||||||
|
|
||||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
#define CV_SIMD128 1
|
#define CV_SIMD128 1
|
||||||
#define CV_SIMD128_64F 1
|
#ifndef CV_RVV_THEAD_0_7
|
||||||
|
# define CV_SIMD128_64F 1
|
||||||
|
#else
|
||||||
|
# define CV_SIMD128_64F 0
|
||||||
|
#endif
|
||||||
|
|
||||||
//////////// Unsupported native intrinsics in C++ ////////////
|
//////////// Unsupported native intrinsics in C++ ////////////
|
||||||
// The following types have been defined in clang, but not in GCC yet.
|
// The following types have been defined in clang, but not in GCC yet.
|
||||||
@ -1001,14 +1015,17 @@ OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 6
|
|||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
|
||||||
|
#if CV_SIMD128_64F
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
|
||||||
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
|
OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
|
||||||
|
#endif
|
||||||
|
|
||||||
// Three times reinterpret
|
// Three times reinterpret
|
||||||
|
#if CV_SIMD128_64F
|
||||||
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) \
|
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) \
|
||||||
{ \
|
{ \
|
||||||
return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));\
|
return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));\
|
||||||
@ -1017,6 +1034,7 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) \
|
|||||||
{ \
|
{ \
|
||||||
return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));\
|
return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));\
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
////////////// Extract //////////////
|
////////////// Extract //////////////
|
||||||
|
|
||||||
@ -1920,13 +1938,15 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
|
|||||||
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
|
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
|
||||||
inline bool v_check_all(const _Tpvec& a) \
|
inline bool v_check_all(const _Tpvec& a) \
|
||||||
{ \
|
{ \
|
||||||
v_uint64x2 v = v_uint64x2(vreinterpret_v_##suffix##m1_u64m1(vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl))); \
|
auto v0 = vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl); \
|
||||||
return (v.val[0] | v.val[1]) == 0; \
|
v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
|
||||||
|
return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) == 0; \
|
||||||
} \
|
} \
|
||||||
inline bool v_check_any(const _Tpvec& a) \
|
inline bool v_check_any(const _Tpvec& a) \
|
||||||
{ \
|
{ \
|
||||||
v_uint64x2 v = v_uint64x2(vreinterpret_v_##suffix##m1_u64m1(vsrl_vx_##suffix##m1(a, shift, vl))); \
|
auto v0 = vsrl_vx_##suffix##m1(a, shift, vl); \
|
||||||
return (v.val[0] | v.val[1]) != 0; \
|
v_uint32x4 v = v_uint32x4(v_reinterpret_as_u32(_Tpvec(v0))); \
|
||||||
|
return (v.val[0] | v.val[1] | v.val[2] | v.val[3]) != 0; \
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
|
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
|
||||||
@ -2042,28 +2062,18 @@ OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
|
|||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
|
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
|
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
|
||||||
|
|
||||||
// use reinterpret instead of c-style casting.
|
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(ivec, uvec, itype, utype, isuf, usuf, vlen) \
|
||||||
#ifndef __clang__
|
inline uvec v_absdiff(const ivec& a, const ivec& b) \
|
||||||
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \
|
|
||||||
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
|
|
||||||
{ \
|
{ \
|
||||||
return _rTpvec(rshr(vreinterpret_v_i##width##m2_u##width##m2(sub(v_max(a, b), v_min(a, b), vl)), 0, vl)); \
|
itype max = vmax_vv_##isuf(a, b, vlen); \
|
||||||
|
itype min = vmin_vv_##isuf(a, b, vlen); \
|
||||||
|
return uvec(vreinterpret_v_##isuf##_##usuf(vsub_vv_##isuf(max, min, vlen))); \
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16, 16)
|
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vint8m1_t, vuint8m1_t, i8m1, u8m1, 16)
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 32, 8)
|
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vint16m1_t, vuint16m1_t, i16m1, u16m1, 8)
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 64, 4)
|
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vint32m1_t, vuint32m1_t, i32m1, u32m1, 4)
|
||||||
#else
|
|
||||||
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \
|
|
||||||
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
|
|
||||||
{ \
|
|
||||||
return _rTpvec(rshr(vreinterpret_u##width##m2(sub(v_max(a, b), v_min(a, b), vl)), 0, vl)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16, 16)
|
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 32, 8)
|
|
||||||
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 64, 4)
|
|
||||||
#endif
|
|
||||||
#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
|
#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
|
||||||
inline _Tprvec v_abs(const _Tpvec& a) \
|
inline _Tprvec v_abs(const _Tpvec& a) \
|
||||||
{ \
|
{ \
|
||||||
@ -2902,7 +2912,14 @@ inline v_int32x4 v_ceil(const v_float32x4& a)
|
|||||||
|
|
||||||
inline v_int32x4 v_trunc(const v_float32x4& a)
|
inline v_int32x4 v_trunc(const v_float32x4& a)
|
||||||
{
|
{
|
||||||
|
#ifndef CV_RVV_THEAD_0_7
|
||||||
return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
|
return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
|
||||||
|
#else
|
||||||
|
const int old_round = fesetround(FE_TOWARDZERO);
|
||||||
|
vint32m1_t val = vfcvt_x_f_v_i32m1(a, 4);
|
||||||
|
fesetround(old_round);
|
||||||
|
return v_int32x4(val);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#if CV_SIMD128_64F
|
#if CV_SIMD128_64F
|
||||||
#ifndef __clang__
|
#ifndef __clang__
|
||||||
@ -2938,7 +2955,14 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
|
|||||||
{
|
{
|
||||||
double arr[4] = {a.val[0], a.val[1], 0, 0};
|
double arr[4] = {a.val[0], a.val[1], 0, 0};
|
||||||
vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
|
vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
|
||||||
|
#ifndef CV_RVV_THEAD_0_7
|
||||||
return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
|
return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
|
||||||
|
#else
|
||||||
|
const int old_round = fesetround(FE_TOWARDZERO);
|
||||||
|
vint32m1_t val = vfncvt_x_f_w_i32m1(tmp, 4);
|
||||||
|
fesetround(old_round);
|
||||||
|
return v_int32x4(val);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|||||||
@ -19,7 +19,7 @@ namespace cv
|
|||||||
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
#define CV_SIMD128 1
|
#define CV_SIMD128 1
|
||||||
#define CV_SIMD128_64F 1
|
#define CV_SIMD128_64F 0
|
||||||
//////////// Types ////////////
|
//////////// Types ////////////
|
||||||
struct v_uint8x16
|
struct v_uint8x16
|
||||||
{
|
{
|
||||||
@ -2021,23 +2021,18 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
|
|||||||
c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \
|
c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \
|
||||||
return v_int32x4(vget_i32m2_i32m1(c, 0));
|
return v_int32x4(vget_i32m2_i32m1(c, 0));
|
||||||
}
|
}
|
||||||
#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
|
#define VITL_16 (vuint32m2_t){0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E}
|
||||||
#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
|
#define VITL_8 (vuint32m2_t){0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007}
|
||||||
#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
|
#define VITL_4 (vuint32m2_t){0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007}
|
||||||
#define VITL_2 (vuint64m2_t){0, 2, 1, 3}
|
#define VITL_2 (vuint32m2_t){0, 0, 2, 0, 1, 0, 3, 0}
|
||||||
#define LOW_4 0x0000000100000000, 0x0000000500000004
|
|
||||||
#define LOW_8 0x0003000200010000, 0x000B000A00090008
|
|
||||||
#define LOW_16 0x0706050403020100, 0x1716151413121110
|
|
||||||
#define HIGH_4 0x0000000300000002, 0x0000000700000006
|
|
||||||
#define HIGH_8 0x0007000600050004, 0x000F000E000D000C
|
|
||||||
#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918
|
|
||||||
#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
|
#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
|
||||||
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
|
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
|
||||||
{ \
|
{ \
|
||||||
v##_Tp##m2_t tmp = vundefined_##_T##m2();\
|
v##_Tp##m2_t tmp = vundefined_##_T##m2();\
|
||||||
tmp = vset_##_T##m2(tmp, 0, a0.val); \
|
tmp = vset_##_T##m2(tmp, 0, a0.val); \
|
||||||
tmp = vset_##_T##m2(tmp, 1, a1.val); \
|
tmp = vset_##_T##m2(tmp, 1, a1.val); \
|
||||||
vuint64m2_t mask = VITL_##num; \
|
vuint32m2_t mask = VITL_##num; \
|
||||||
tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
|
tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
|
||||||
b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
|
b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
|
||||||
b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
|
b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user