Merge pull request #22353 from hanliutong:more-rvv-intrin

[GSoC] Add more universal intrinsic implementations for RVV.
2022-08-23 12:50:01 +03:00
parent 7831aae5dd b9a1039566
commit d10832074e
3 changed files with 1141 additions and 36 deletions
@@ -537,7 +537,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
    inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
    inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
 #endif
    //! @}
@@ -554,7 +554,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
    inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
    inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
 #endif
    //! @}
@@ -571,7 +571,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
    inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
    inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
 #endif
    //! @}
@@ -588,7 +588,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
    inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
    inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
 #endif
    //! @}
@@ -605,7 +605,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
    inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
    inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
 #endif
    //! @}
@@ -622,7 +622,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
    inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
    inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
 #endif
    //! @}
@@ -639,7 +639,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
    inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
    inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
 #endif
    //! @}
@@ -656,7 +656,7 @@ namespace CV__SIMD_NAMESPACE {
    inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
    inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
    inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
-#if CV_SIMD_64F
+#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
    inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
 #endif
    //! @}
@@ -5,6 +5,7 @@
 #include <initializer_list>
 #include <assert.h>
 #include <vector>
+#include <opencv2/core/check.hpp>

 #ifndef CV_RVV_MAX_VLEN
 #define CV_RVV_MAX_VLEN 1024
@@ -284,6 +285,64 @@ inline v_float64 v_reinterpret_as_f64(const v_float32& v) \
 }
 #endif

+//////////// Extract //////////////
+
+// Generates, for one integer vector type:
+//  - v_extract(a, b, i): slides `a` down by `i` lanes over a zero vector, then slides `b`
+//    up by `vlanes() - i` lanes into that result;
+//  - v_extract_n(v, i): slides `v` down by `i` lanes and reads lane 0 with vmv_x.
+// The template argument `s` only supplies the default value of the run-time index `i`.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vl) \
+template <int s = 0> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
+{ \
+    _Tpvec lowered = vslidedown(v_setzero_##suffix(), a, i, vl); \
+    return vslideup(lowered, b, VTraits<_Tpvec>::vlanes() - i, vl); \
+} \
+template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
+{ \
+    _Tpvec lowered = vslidedown(v_setzero_##suffix(), v, i, vl); \
+    return vmv_x(lowered); \
+}
+
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, s8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, s16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, s32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, s64, VTraits<v_int64>::vlanes())
+
+// Floating-point counterpart of the integer extract macro: same slide-down / slide-up
+// sequence, but v_extract_n reads lane 0 with vfmv_f instead of vmv_x.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vl) \
+template <int s = 0> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \
+{ \
+    _Tpvec lowered = vslidedown(v_setzero_##suffix(), a, i, vl); \
+    return vslideup(lowered, b, VTraits<_Tpvec>::vlanes() - i, vl); \
+} \
+template<int s = 0> inline _Tp v_extract_n(_Tpvec v, int i = s) \
+{ \
+    _Tpvec lowered = vslidedown(v_setzero_##suffix(), v, i, vl); \
+    return vfmv_f(lowered); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+// Generates v_extract_highest(v): the element at index `vl - 1`, obtained through
+// v_extract_n with a run-time index.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \
+inline _Tp v_extract_highest(_Tpvec v) \
+{ \
+    const int last_lane = vl - 1; \
+    return v_extract_n(v, last_lane); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits<v_int64>::vlanes())
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits<v_float64>::vlanes())
+#endif
+

 ////////////// Load/Store //////////////
 #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
@@ -387,6 +446,9 @@ OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2)
 OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1)
 OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2)
 OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, mf2)
+#endif

 inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
@@ -401,6 +463,189 @@ inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_
 inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
 inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); }

+////////////// Pack boolean ////////////////////
+/* TODO */
+
+////////////// Arithmetics //////////////
+// Generates v_<ocv_intrin>(a, b) as a direct call of the RVV intrinsic `rvv_intrin`
+// with the vector length set to the lane count of _Tpvec.
+#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \
+inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    return rvv_intrin(a, b, lanes); \
+}
+
+// 8- and 16-bit integers map add/sub to the saturating intrinsics (vsadd[u] / vssub[u]).
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, vssubu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, vssub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, vssubu)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, vssub)
+// 32-bit integers use the plain vadd / vsub / vmul intrinsics.
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, vmul)
+// Single-precision floating point.
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, vfsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, vfmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, vfdiv)
+// 64-bit integers: add/sub only.
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, vsub)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, vfsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, vfmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, vfdiv)
+#endif
+
+// Variadic v_add: combines the first two operands with `rvv_add`, then hands the partial
+// result and the remaining operands back to v_add.
+#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \
+template<typename... Args> \
+inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+    _Tpvec partial = rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()); \
+    return v_add(partial, vf...); \
+}
+// Variadic v_mul: same folding scheme as the variadic v_add, using `rvv_mul`.
+#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \
+template<typename... Args> \
+inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \
+    _Tpvec partial = rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()); \
+    return v_mul(partial, vf...); \
+}
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, vsaddu)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, vsadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, vadd)
+
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, vfmul)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, vfadd)
+OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, vfmul)
+#endif
+
+// Generates v_mul_expand(a, b, c, d): a widening multiply (`wmul`) produces a double-width
+// m2 register group, whose m1 parts 0 and 1 are stored to `c` and `d` respectively.
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \
+inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
+{ \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    _TpwvecM2 wide_product = wmul(a, b, lanes); \
+    c = vget_##suffix##m1(wide_product, 0); \
+    d = vget_##suffix##m1(wide_product, 1); \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m2_t, u16, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m2_t, i16, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m2_t, u32, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m2_t, i32, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m2_t, u64, vwmulu)
+
+// High half of the lane-wise 16-bit product: vmulh for signed lanes.
+inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b)
+{
+    const auto lanes = VTraits<v_int16>::vlanes();
+    return vmulh(a, b, lanes);
+}
+// High half of the lane-wise 16-bit product: vmulhu for unsigned lanes.
+inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b)
+{
+    const auto lanes = VTraits<v_uint16>::vlanes();
+    return vmulhu(a, b, lanes);
+}
+
+////////////// Arithmetics (wrap)//////////////
+// v_add_wrap / v_sub_wrap / v_mul_wrap for 8- and 16-bit lanes use the non-saturating
+// vadd / vsub / vmul intrinsics. Instantiations are grouped per vector type.
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, vmul)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, vadd)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, vsub)
+OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, vmul)
+
+//////// Saturating Multiply ////////
+// Generates v_mul for 8/16-bit lanes: a widening multiply (`_wmul`) followed by a narrowing
+// clip (`_clip`) with shift amount 0. The variadic overload applies the same step to the
+// first two operands and passes the result on to v_mul with the remaining ones.
+#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \
+inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    auto wide_product = _wmul(a, b, lanes); \
+    return _clip(wide_product, 0, lanes); \
+} \
+template<typename... Args> \
+inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... va) { \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    auto wide_product = _wmul(a1, a2, lanes); \
+    return v_mul(_clip(wide_product, 0, lanes), va...); \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, vnclipu, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, vnclip, vwmul)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, vnclipu, vwmulu)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, vnclip, vwmul)
+
+////////////// Bitwise logic //////////////
+
+// Generates lane-wise v_and / v_or / v_xor / v_not for one integer vector type; each
+// forwards to the RVV intrinsic vand / vor / vxor / vnot with vector length `vl`.
+#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \
+inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) { return vand(a, b, vl); } \
+inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b)  { return vor(a, b, vl); } \
+inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) { return vxor(a, b, vl); } \
+inline _Tpvec v_not(const _Tpvec& a)                  { return vnot(a, vl); }
+
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits<v_int64>::vlanes())
+
+
+
+////////////// Bitwise shifts //////////////
+
+// Compile-time shifts by `n` bits for unsigned lanes: v_shl uses vsll, v_shr uses vsrl.
+// The shift amount is passed to the intrinsic as a uint8_t.
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vsll(a, static_cast<uint8_t>(n), vl)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(vsrl(a, static_cast<uint8_t>(n), vl)); }
+
+// Compile-time shifts by `n` bits for signed lanes: v_shl uses vsll, v_shr uses vsra.
+#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vsll(a, static_cast<uint8_t>(n), vl)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(vsra(a, static_cast<uint8_t>(n), vl)); }
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits<v_int64>::vlanes())
+
+////////////// Comparison //////////////
+// TODO

 ////////////// Min/Max //////////////

@@ -424,15 +669,378 @@ OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, VTraits<v_int32>::vlanes())
 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits<v_int32>::vlanes())
 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits<v_float32>::vlanes())
 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits<v_float32>::vlanes())
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits<v_uint64>::vlanes())
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits<v_uint64>::vlanes())
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits<v_int64>::vlanes())
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits<v_int64>::vlanes())
 #if CV_SIMD_SCALABLE_64F
 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits<v_float64>::vlanes())
 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits<v_float64>::vlanes())
 #endif

+////////////// Reduce //////////////
+
+// Generates v_reduce_sum for integer lanes using a widening reduction (`v<red>`): the
+// destination and the initial value are both zero vectors of the double-width element
+// type, and element 0 of the result is returned, cast to `scalartype`.
+// The `_wTpvec` parameter is not used by the macro body.
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec initial = vmv_v_x_##wsuffix##m1(0, vl); \
+    _nwTpvec total = v##red(vmv_v_x_##wsuffix##m1(0, vl), a, initial, vl); \
+    return static_cast<scalartype>(v_get0(total)); \
+}
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits<v_uint8>::vlanes(), wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits<v_int8>::vlanes(), wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits<v_uint16>::vlanes(), wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits<v_int16>::vlanes(), wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits<v_uint32>::vlanes(), wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits<v_int32>::vlanes(), wredsum)
+
+// Generates v_reduce_sum for floating-point lanes with the ordered-sum reduction
+// vfredosum, starting from a zero vector; element 0 of the result is returned.
+// The `_wTpvec` parameter is not used by the macro body.
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec initial = vfmv_v_f_##wsuffix##m1(0, vl); \
+    _nwTpvec total = vfredosum(vfmv_v_f_##wsuffix##m1(0, vl), a, initial, vl); \
+    return static_cast<scalartype>(v_get0(total)); \
+}
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
+
+// Generates v_reduce_min / v_reduce_max. The reduction intrinsic `v<red>` receives `a` as
+// destination, source and initial value, so the initial element is itself one of the
+// lanes being reduced; element 0 of the result is returned.
+// The `suffix` parameter is not used by the macro body.
+#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
+inline scalartype v_reduce_##func(const _Tpvec& a)  \
+{ \
+    _Tpvec folded = _Tpvec(v##red(a, a, a, vl)); \
+    return static_cast<scalartype>(v_get0(folded)); \
+}
+
+// Minimum
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, min, uchar, u8, VTraits<v_uint8>::vlanes(), redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, min, schar, i8, VTraits<v_int8>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, min, ushort, u16, VTraits<v_uint16>::vlanes(), redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, min, short, i16, VTraits<v_int16>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, min, unsigned, u32, VTraits<v_uint32>::vlanes(), redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, min, int, i32, VTraits<v_int32>::vlanes(), redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, min, float, f32, VTraits<v_float32>::vlanes(), fredmin)
+// Maximum
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, max, uchar, u8, VTraits<v_uint8>::vlanes(), redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, max, schar, i8, VTraits<v_int8>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, max, ushort, u16, VTraits<v_uint16>::vlanes(), redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, max, short, i16, VTraits<v_int16>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, max, unsigned, u32, VTraits<v_uint32>::vlanes(), redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, max, int, i32, VTraits<v_int32>::vlanes(), redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32, max, float, f32, VTraits<v_float32>::vlanes(), fredmax)
+
+//TODO: v_reduce_sum4
+
+////////////// Square-Root //////////////
+
+// Lane-wise square root (vfsqrt) of a single-precision vector.
+inline v_float32 v_sqrt(const v_float32& x)
+{
+    const auto lanes = VTraits<v_float32>::vlanes();
+    return vfsqrt(x, lanes);
+}
+
+// Lane-wise 1 / sqrt(x), computed as a full division of 1.0f by v_sqrt(x).
+inline v_float32 v_invsqrt(const v_float32& x)
+{
+    return v_div(v_setall_f32(1.0f), v_sqrt(x));
+}
+
+#if CV_SIMD_SCALABLE_64F
+// Lane-wise square root (vfsqrt) of a double-precision vector.
+inline v_float64 v_sqrt(const v_float64& x)
+{
+    return vfsqrt(x, VTraits<v_float64>::vlanes());
+}
+
+// Lane-wise 1 / sqrt(x), computed as a full division of 1.0 by v_sqrt(x).
+inline v_float64 v_invsqrt(const v_float64& x)
+{
+    // Use a double literal: the single-precision `1.0f` only reached v_setall_f64
+    // through an implicit float -> double promotion.
+    v_float64 one = v_setall_f64(1.0);
+    return v_div(one, v_sqrt(x));
+}
+#endif
+
+// Lane-wise a*a + b*b: vfmul forms a*a, vfmacc accumulates b*b onto it.
+inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b)
+{
+    const auto lanes = VTraits<v_float32>::vlanes();
+    v_float32 a_squared = vfmul(a, a, lanes);
+    return v_float32(vfmacc(a_squared, b, b, lanes));
+}
+
+// Lane-wise sqrt(a*a + b*b).
+inline v_float32 v_magnitude(const v_float32& a, const v_float32& b)
+{
+    return v_sqrt(v_sqr_magnitude(a, b));
+}
+
+#if CV_SIMD_SCALABLE_64F
+// Lane-wise a*a + b*b for double precision: vfmul forms a*a, vfmacc accumulates b*b onto it.
+inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b)
+{
+    const auto lanes = VTraits<v_float64>::vlanes();
+    v_float64 a_squared = vfmul(a, a, lanes);
+    return vfmacc(a_squared, b, b, lanes);
+}
+
+// Lane-wise sqrt(a*a + b*b) for double precision.
+inline v_float64 v_magnitude(const v_float64& a, const v_float64& b)
+{
+    return v_sqrt(v_sqr_magnitude(a, b));
+}
+#endif
+
+////////////// Multiply-Add //////////////
+
+// Lane-wise a * b + c for single-precision floats (vfmacc accumulates a*b onto c).
+inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c)
+{
+    return vfmacc(c, a, b, VTraits<v_float32>::vlanes());
+}
+// Lane-wise a * b + c for 32-bit signed integers (vmacc accumulates a*b onto c).
+inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c)
+{
+    // The vector length comes from the int32 traits; the previous code queried
+    // VTraits<v_float32>, which did not match the operand type.
+    return vmacc(c, a, b, VTraits<v_int32>::vlanes());
+}
+
+// v_muladd is an alias of v_fma.
+inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c)
+{
+    return v_fma(a, b, c);
+}
+
+#if CV_SIMD_SCALABLE_64F
+// Lane-wise a * b + c for double precision (vfmacc accumulates a*b onto c).
+inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c)
+{
+    const auto lanes = VTraits<v_float64>::vlanes();
+    return vfmacc(c, a, b, lanes);
+}
+
+// v_muladd is an alias of v_fma.
+inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c)
+{ return v_fma(a, b, c); }
+#endif
+
+////////////// Check all/any //////////////
+
+// Generates v_check_all / v_check_any for signed integer lanes. A lane counts as "set"
+// when it is negative (vmslt against 0); vcpop counts the set lanes.
+//  - v_check_all: true when the count equals `vl`.
+//  - v_check_any: true when the count is non-zero.
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    /* vcpop yields an unsigned count; cast it so the comparison with the int lane */ \
+    /* count `vl` is not a mixed signed/unsigned comparison */ \
+    return static_cast<int>(vcpop(vmslt(a, 0, vl), vl)) == vl; \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    return vcpop(vmslt(a, 0, vl), vl) != 0; \
+}
+
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits<v_int64>::vlanes())
+
+
+// Unsigned and floating-point overloads: the lanes are reinterpreted as signed integers
+// of the same width and the signed v_check_all / v_check_any is applied.
+inline bool v_check_all(const v_uint8& a)
+{
+    return v_check_all(v_reinterpret_as_s8(a));
+}
+inline bool v_check_any(const v_uint8& a)
+{
+    return v_check_any(v_reinterpret_as_s8(a));
+}
+
+inline bool v_check_all(const v_uint16& a)
+{
+    return v_check_all(v_reinterpret_as_s16(a));
+}
+inline bool v_check_any(const v_uint16& a)
+{
+    return v_check_any(v_reinterpret_as_s16(a));
+}
+
+inline bool v_check_all(const v_uint32& a)
+{
+    return v_check_all(v_reinterpret_as_s32(a));
+}
+inline bool v_check_any(const v_uint32& a)
+{
+    return v_check_any(v_reinterpret_as_s32(a));
+}
+
+inline bool v_check_all(const v_float32& a)
+{
+    return v_check_all(v_reinterpret_as_s32(a));
+}
+inline bool v_check_any(const v_float32& a)
+{
+    return v_check_any(v_reinterpret_as_s32(a));
+}
+
+inline bool v_check_all(const v_uint64& a)
+{
+    return v_check_all(v_reinterpret_as_s64(a));
+}
+inline bool v_check_any(const v_uint64& a)
+{
+    return v_check_any(v_reinterpret_as_s64(a));
+}
+
+#if CV_SIMD_SCALABLE_64F
+inline bool v_check_all(const v_float64& a)
+{
+    return v_check_all(v_reinterpret_as_s64(a));
+}
+inline bool v_check_any(const v_float64& a)
+{
+    return v_check_any(v_reinterpret_as_s64(a));
+}
+#endif
+
+////////////// abs //////////////
+
+// Generates v_<abs>(a, b) as v_sub(v_max(a, b), v_min(a, b)), returning the same vector
+// type as the inputs. It is instantiated as v_absdiff for unsigned and floating-point
+// types and as v_absdiffs for v_int8 / v_int16.
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
+inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec larger = v_max(a, b); \
+    _Tpvec smaller = v_min(a, b); \
+    return v_sub(larger, smaller); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff)
+#endif
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs)
+
+// Generates v_absdiff for signed inputs with an unsigned result of the same lane width:
+// max - min is formed with a widening subtract (vwsub_vv), reinterpreted as unsigned
+// `width`-bit lanes, and narrowed back with vnclipu using shift amount 0.
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \
+inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    auto wide_diff = vwsub_vv(v_max(a, b), v_min(a, b), lanes); \
+    return vnclipu(vreinterpret_u##width##m2(wide_diff), 0, lanes); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64)
+
+// Generates v_abs(a) as the absolute difference between `a` and an all-zero vector;
+// `_Tprvec` is the result type, which is the unsigned type for the signed integer inputs.
+#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
+inline _Tprvec v_abs(const _Tpvec& a) \
+{ \
+    _Tpvec zeros = v_setzero_##suffix(); \
+    return v_absdiff(a, zeros); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32)
+OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64)
+#endif
+
+
+// Generates v_reduce_sad(a, b): the sum over all lanes of v_absdiff(a, b).
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
+inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    auto lane_diffs = v_absdiff(a, b); \
+    return v_reduce_sum(lane_diffs); \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float)
+
+////////////// Select //////////////
+
+// Generates v_select(mask, a, b) for integer lanes: a predicate is built from the lanes
+// of `mask` that differ from 0 (vmsne) and passed to vmerge together with `b` and `a`.
+#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    auto lane_is_set = vmsne(mask, 0, vl); \
+    return vmerge(lane_is_set, b, a, vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits<v_int32>::vlanes())
+
+// v_select for single-precision lanes: the predicate is built with the floating-point
+// not-equal compare (vmfne) of `mask` against 0 and passed to vmerge with `b` and `a`.
+// These are plain functions, not macro bodies, so no line continuations are needed.
+inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b)
+{
+    return vmerge(vmfne(mask, 0, VTraits<v_float32>::vlanes()), b, a, VTraits<v_float32>::vlanes());
+}
+
+#if CV_SIMD_SCALABLE_64F
+// Double-precision counterpart of the v_select overload for v_float32.
+inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b)
+{
+    return vmerge(vmfne(mask, 0, VTraits<v_float64>::vlanes()), b, a, VTraits<v_float64>::vlanes());
+}
+#endif
+
+////////////// Rotate shift //////////////
+
+// Generates v_rotate_right<n> / v_rotate_left<n> for one integer vector type.
+//  - One-operand forms slide `a` down (right) or up (left) by `n` lanes over a zero vector.
+//  - Two-operand forms combine a slide-down of one operand with a slide-up of the other,
+//    using offsets `n` and `vlanes() - n`.
+//  - v_rotate_left<0> is specialized to return `a` unchanged.
+#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    _Tpvec zeros = vmv_v_x_##suffix##m1(0, vl); \
+    return vslidedown(zeros, a, n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    _Tpvec zeros = vmv_v_x_##suffix##m1(0, vl); \
+    return vslideup(zeros, a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec zeros = vmv_v_x_##suffix##m1(0, vl); \
+    _Tpvec low_part = vslidedown(zeros, a, n, vl); \
+    return vslideup(low_part, b, VTraits<_Tpvec>::vlanes() - n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec zeros = vmv_v_x_##suffix##m1(0, vl); \
+    _Tpvec low_part = vslidedown(zeros, b, VTraits<_Tpvec>::vlanes() - n, vl); \
+    return vslideup(low_part, a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16,  VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits<v_int64>::vlanes())
+
+// Floating-point counterpart of the integer rotate macro; the only difference is that
+// the zero vector is created with vfmv_v_f instead of vmv_v_x.
+#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    _Tpvec zeros = vfmv_v_f_##suffix##m1(0, vl); \
+    return vslidedown(zeros, a, n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    _Tpvec zeros = vfmv_v_f_##suffix##m1(0, vl); \
+    return vslideup(zeros, a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec zeros = vfmv_v_f_##suffix##m1(0, vl); \
+    _Tpvec low_part = vslidedown(zeros, a, n, vl); \
+    return vslideup(low_part, b, VTraits<_Tpvec>::vlanes() - n, vl); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec zeros = vfmv_v_f_##suffix##m1(0, vl); \
+    _Tpvec low_part = vslidedown(zeros, b, VTraits<_Tpvec>::vlanes() - n, vl); \
+    return vslideup(low_part, a, n, vl); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64,  VTraits<v_float64>::vlanes())
+#endif
+
+////////////// Convert to float //////////////
+// TODO
+
+//////////// Broadcast //////////////
+
+// Generates v_broadcast_element(v, i) and v_broadcast_highest(v): the selected element
+// is read with v_extract_n and replicated to every lane with v_setall_<suffix>.
+// The template argument `s` only supplies the default value of the run-time index `i`.
+#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
+template<int s = 0> inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \
+{ \
+    auto picked = v_extract_n(v, i); \
+    return v_setall_##suffix(picked); \
+} \
+inline _Tpvec v_broadcast_highest(_Tpvec v) \
+{ \
+    auto picked = v_extract_n(v, VTraits<_Tpvec>::vlanes()-1); \
+    return v_setall_##suffix(picked); \
+}
+
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32)
+
+
+////////////// Reverse //////////////
+// Generates v_reverse(a): builds the index vector (vlanes - 1) - lane_id from vid and
+// vrsub, then gathers the lanes of `a` through it with vrgather.
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \
+inline _Tpvec v_reverse(const _Tpvec& a)  \
+{ \
+    const auto lanes = VTraits<_Tpvec>::vlanes(); \
+    vuint##width##m1_t lane_ids = vid_v_u##width##m1(lanes); \
+    vuint##width##m1_t mirrored = vrsub(lane_ids, lanes - 1, lanes); \
+    return vrgather(a, mirrored, lanes); \
+}
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64)
+#endif

 //////////// Value reordering ////////////

@@ -476,6 +1084,189 @@ inline v_int32 v_load_expand_q(const schar* ptr)
 }


+/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+  Interleaves the lanes of a0 and a1:
+  a0 = {A1 A2 A3 A4}
+  a1 = {B1 B2 B3 B4}
+---------------
+  b0 = {A1 B1 A2 B2} and b1 = {A3 B3 A4 B4}
+
+  How the macro gets there:
+   - a0 is zero-extended to double-width lanes (vzext_vf2), so each wide lane
+     holds one a0 element in its low half;
+   - a1 is zero-extended the same way, viewed as narrow lanes again and moved up
+     by one narrow lane (vslide1up inserting 0), so each wide lane holds one a1
+     element in its high half;
+   - the two are OR-ed and the m2 result is split into its two m1 halves
+     (vget ..., 0 -> b0 and vget ..., 1 -> b1).
+  convert2um1 / convert2um2 reinterpret signed and float vectors as unsigned for
+  the integer operations (OPENCV_HAL_NOP for types that are already unsigned).
+*/
+
+#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2um2( \
+        vor(vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \
+            vreinterpret_u##width2##m2(vslide1up(vreinterpret_u##width##m2(vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \
+            VTraits<_Tpvec>::vlanes()))); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(temp, 1); \
+}
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m2_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m2_t, i8, 8, 16, vreinterpret_u8m2, vreinterpret_u8m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m2_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m2_t, i16, 16, 32, vreinterpret_u16m2, vreinterpret_u16m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m2_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m2_t, i32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m2_t, f32, 32, 64, vreinterpret_u32m2, vreinterpret_u32m1)
+
+// Generates v_combine_low / v_combine_high / v_recombine for _Tpvec.
+//   v_combine_low : lower half of a followed by lower half of b
+//   v_combine_high: upper half of a followed by upper half of b
+//   v_recombine   : c = v_combine_low(a, b), d = v_combine_high(a, b)
+// The `width` argument is not referenced by the macro body.
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    const int lanes = VTraits<_Tpvec>::vlanes(); \
+    /* keep a below lanes/2, place b[0..] from lanes/2 upwards */ \
+    return vslideup(a, b, lanes/2, lanes);\
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    const int lanes = VTraits<_Tpvec>::vlanes(); \
+    const int half = lanes/2; \
+    /* bring the upper half of each input down to lane 0 first */ \
+    const _Tpvec upper_a = vslidedown(a, a, half, lanes); \
+    const _Tpvec upper_b = vslidedown(b, b, half, lanes); \
+    return vslideup(upper_a, upper_b, half, lanes); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c = v_combine_low(a, b); \
+    d = v_combine_high(a, b); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64)
+#endif
+
+// Gather-index tables for v_interleave_pairs / v_interleave_quads.
+// Each 64-bit word packs eight byte indices, lowest byte first; 16 words give 128
+// indices. The tables are only ever read (loaded with vle64_v_u64m1), so they are const.
+//   pairs: within every group of 4 lanes the order is 0, 2, 1, 3
+static const uint64_t idx_interleave_pairs[] = {
+    0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18,
+    0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38,
+    0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58,
+    0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78};
+
+//   quads: within every group of 8 lanes the order is 0, 4, 1, 5, 2, 6, 3, 7
+static const uint64_t idx_interleave_quads[] = {
+    0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18,
+    0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38,
+    0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58,
+    0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78};
+
+// Generates v_interleave_<func> for 8-bit lane types: the idx_interleave_<func> table
+// is loaded as 64-bit words, viewed as bytes, and used directly as gather indices.
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    const vuint64m1_t packed_idx = vle64_v_u64m1(idx_interleave_##func, 16); \
+    const vuint8m1_t byte_idx = vreinterpret_u8m1(packed_idx); \
+    return vrgather(vec, byte_idx, VTraits<v_uint8>::vlanes()); \
+}
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads)
+
+// Generates v_interleave_<func> for lane types wider than 8 bits. The byte indices of
+// idx_interleave_<func> are zero-extended to `width`-bit indices (vzext_vfx is vzext_vf2
+// or vzext_vf4 depending on the lane width) and the first m1 part is used for the gather.
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \
+inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \
+    CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \
+    const vuint8m1_t byte_idx = vreinterpret_u8m1(vle64_v_u64m1(idx_interleave_##func, 16)); \
+    const vuint##width##m1_t lane_idx = vget_u##width##m1(vzext_vfx(byte_idx, VTraits<v_uint8>::vlanes()), 0); \
+    return vrgather(vec, lane_idx, VTraits<_Tpvec>::vlanes()); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, pairs)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, pairs)
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, vzext_vf2, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, vzext_vf4, quads)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, vzext_vf4, quads)
+
+////////////// Transpose4x4 //////////////
+// File-local helper for v_transpose4x4: a v_zip variant that works with a fixed
+// vector length of 4 lanes instead of the full register.
+//   - `temp` is built the same way as in v_zip (zero-extend a0, zero-extend a1 and
+//     slide it up by one 32-bit lane, OR the two), using vl = 4 wide lanes;
+//   - b0 is the first m1 part of `temp`;
+//   - b1 is obtained by gathering elements 4..7 of `temp` (indices vid + 4) into
+//     positions 0..3 and taking the first m1 part of that result.
+// `convert` / `convert2u` reinterpret signed and float vectors as unsigned
+// (OPENCV_HAL_NOP for v_uint32).
+#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \
+static inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \
+    int vl = 4; \
+    _wTpvec temp = vreinterpret_##suffix##m2(convert2u( \
+        vor(vzext_vf2(convert(a0), vl), \
+            vreinterpret_u64m2(vslide1up(vreinterpret_u32m2(vzext_vf2(convert(a1), vl)), 0, vl*2)), \
+            vl))); \
+    b0 = vget_##suffix##m1(temp, 0); \
+    b1 = vget_##suffix##m1(vrgather(temp, vadd(vid_v_u32m2(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \
+}
+
+OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m2_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m2_t, i32, vreinterpret_u32m2, vreinterpret_u32m1)
+OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m2_t, f32, vreinterpret_u32m2, vreinterpret_u32m1)
+
+// Generates v_transpose4x4 for _Tpvec: transposes the 4x4 block formed by the first
+// four lanes of a0..a3 into b0..b3, using two rounds of v_zip4.
+//   round 1: zip rows (a0, a2) and (a1, a3) into temporaries t0..t3
+//   round 2: zip (t0, t1) -> (b0, b1) and (t2, t3) -> (b2, b3)
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \
+    /* One declarator per temporary: a combined "t0,t1,t2,t3 = x" would initialize only t3. */ \
+    _Tpvec t0 = vundefined_##suffix##m1(); \
+    _Tpvec t1 = vundefined_##suffix##m1(); \
+    _Tpvec t2 = vundefined_##suffix##m1(); \
+    _Tpvec t3 = vundefined_##suffix##m1(); \
+    v_zip4(a0, a2, t0, t2); \
+    v_zip4(a1, a3, t1, t3); \
+    v_zip4(t0, t1, b0, b1); \
+    v_zip4(t2, t3, b2, b3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32)
+
+//////////// PopCount //////////
+// TODO
+
+//////////// SignMask ////////////
+// Generates v_signmask / v_scan_forward for a signed integer vector type.
+//   v_signmask    : int whose bit i is set when lane i compares < 0. Only the first 32
+//                   lanes fit in the result; bits for lanes at or above vlanes() are 0.
+//   v_scan_forward: value returned by vfirst on the same "< 0" mask, i.e. the index of
+//                   the first negative lane.
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    /* vsm stores ceil(vl/8) bytes: up to 16 for 8-bit lanes at the largest supported */ \
+    /* VLEN (1024), so the buffer must be that large to avoid a stack overrun. */ \
+    uint8_t ans[16] = {0}; \
+    const int vl = VTraits<_Tpvec>::vlanes(); \
+    vsm(ans, vmslt(a, 0, vl), vl); \
+    /* Assemble the low 32 mask bits byte by byte; this yields the value a little-endian */ \
+    /* int load would, without casting the byte buffer to int*. */ \
+    unsigned int bits = (unsigned int)ans[0] | ((unsigned int)ans[1] << 8) | \
+                        ((unsigned int)ans[2] << 16) | ((unsigned int)ans[3] << 24); \
+    /* Bits past the last lane inside the final stored byte are tail bits with no */ \
+    /* defined value; clear them. */ \
+    if (vl < 32) \
+        bits &= (1u << vl) - 1u; \
+    return (int)bits; \
+} \
+inline int v_scan_forward(const _Tpvec& a) \
+{ \
+    const int vl = VTraits<_Tpvec>::vlanes(); \
+    return (int)vfirst(vmslt(a, 0, vl), vl); \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64)
+
+// v_signmask for unsigned and floating-point vectors: the bits are reinterpreted as the
+// signed integer type of the same lane width and the signed overload does the work.
+// All overloads return int, the type the signed overloads actually produce.
+inline int v_signmask(const v_uint8& a)
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+inline int v_signmask(const v_uint16& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+inline int v_signmask(const v_uint32& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_uint64& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#if CV_SIMD_SCALABLE_64F
+inline int v_signmask(const v_float64& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#endif
+
+//////////// Scan forward ////////////
+// v_scan_forward for unsigned and floating-point vectors: reinterpret the bits as the
+// signed integer type of the same lane width, then defer to the signed overload.
+inline int v_scan_forward(const v_uint8& a)
+{
+    const v_int8 as_signed = v_reinterpret_as_s8(a);
+    return v_scan_forward(as_signed);
+}
+inline int v_scan_forward(const v_uint16& a)
+{
+    const v_int16 as_signed = v_reinterpret_as_s16(a);
+    return v_scan_forward(as_signed);
+}
+inline int v_scan_forward(const v_uint32& a)
+{
+    const v_int32 as_signed = v_reinterpret_as_s32(a);
+    return v_scan_forward(as_signed);
+}
+inline int v_scan_forward(const v_float32& a)
+{
+    const v_int32 as_signed = v_reinterpret_as_s32(a);
+    return v_scan_forward(as_signed);
+}
+inline int v_scan_forward(const v_uint64& a)
+{
+    const v_int64 as_signed = v_reinterpret_as_s64(a);
+    return v_scan_forward(as_signed);
+}
+#if CV_SIMD_SCALABLE_64F
+inline int v_scan_forward(const v_float64& a)
+{
+    const v_int64 as_signed = v_reinterpret_as_s64(a);
+    return v_scan_forward(as_signed);
+}
+#endif
+
+//////////// Pack triplets ////////////
+// TODO
+
+
 ////// FP16 support ///////

 inline v_float32 v_load_expand(const float16_t* ptr)
@@ -484,6 +1275,15 @@ inline v_float32 v_load_expand(const float16_t* ptr)
    return vundefined_f32m1();
 }

+////////////// Rounding //////////////
+// TODO
+
+//////// Dot Product ////////
+// TODO
+
+//////// Fast Dot Product ////////
+// TODO
+
 inline void v_cleanup() {}

 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
@@ -503,6 +503,34 @@ template<typename R> struct TheTest

        return *this;
    }
+    // Checks v_interleave_pairs and v_interleave_quads against a default-constructed
+    // Data<R> input.
+    //   pairs: in every group of 4 lanes the expected source order is 0, 2, 1, 3
+    //   quads: in every group of 8 lanes the expected source order is 0, 4, 1, 5, 2, 6, 3, 7
+    // Returns *this so the test can be chained with the other TheTest checks.
+    TheTest & test_interleave_pq()
+    {
+        Data<R> dataA;
+        R a = dataA;
+        Data<R> resP = v_interleave_pairs(a);
+        Data<R> resQ = v_interleave_quads(a);
+        // One iteration per complete group of 4 lanes.
+        for (int i = 0; i < VTraits<R>::vlanes()/4; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ(resP[4*i],     dataA[4*i  ]);
+            EXPECT_EQ(resP[4*i + 1], dataA[4*i+2]);
+            EXPECT_EQ(resP[4*i + 2], dataA[4*i+1]);
+            EXPECT_EQ(resP[4*i + 3], dataA[4*i+3]);
+        }
+        // One iteration per complete group of 8 lanes.
+        for (int i = 0; i < VTraits<R>::vlanes()/8; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ(resQ[8*i],     dataA[8*i  ]);
+            EXPECT_EQ(resQ[8*i + 1], dataA[8*i+4]);
+            EXPECT_EQ(resQ[8*i + 2], dataA[8*i+1]);
+            EXPECT_EQ(resQ[8*i + 3], dataA[8*i+5]);
+            EXPECT_EQ(resQ[8*i + 4], dataA[8*i+2]);
+            EXPECT_EQ(resQ[8*i + 5], dataA[8*i+6]);
+            EXPECT_EQ(resQ[8*i + 6], dataA[8*i+3]);
+            EXPECT_EQ(resQ[8*i + 7], dataA[8*i+7]);
+        }
+        return *this;
+    }

    // float32x4 only
    TheTest & test_interleave_2channel()
@@ -578,16 +606,18 @@ template<typename R> struct TheTest

    TheTest & test_addsub()
    {
-        Data<R> dataA, dataB;
+        Data<R> dataA, dataB, dataC;
        dataB.reverse();
-        R a = dataA, b = dataB;
+        dataA[1] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
+        R a = dataA, b = dataB, c = dataC;

-        Data<R> resC = v_add(a, b), resD = v_sub(a, b);
+        Data<R> resD = v_add(a, b), resE = v_add(a, b, c), resF = v_sub(a, b);
        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resC[i]);
-            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resD[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i]), resD[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] + dataB[i] + dataC[i]), resE[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] - dataB[i]), resF[i]);
        }

        return *this;
@@ -614,16 +644,18 @@ template<typename R> struct TheTest

    TheTest & test_mul()
    {
-        Data<R> dataA, dataB;
+        Data<R> dataA, dataB, dataC;
        dataA[1] = static_cast<LaneType>(std::numeric_limits<LaneType>::max());
        dataB.reverse();
-        R a = dataA, b = dataB;
+        R a = dataA, b = dataB, c = dataC;

-        Data<R> resC = v_mul(a, b);
+        Data<R> resD = v_mul(a, b);
+        Data<R> resE = v_mul(a, b, c);
        for (int i = 0; i < VTraits<R>::vlanes(); ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resC[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i]), resD[i]);
+            EXPECT_EQ(saturate_cast<LaneType>(dataA[i] * dataB[i] * dataC[i]), resE[i]);
        }

        return *this;
@@ -699,7 +731,7 @@ template<typename R> struct TheTest
        for (int i = 0; i < VTraits<Ru>::vlanes(); ++i)
        {
            SCOPED_TRACE(cv::format("i=%d", i));
-            R_type ssub = dataA[i] - dataB[i] < std::numeric_limits<R_type>::min() ? std::numeric_limits<R_type>::min() : dataA[i] - dataB[i];
+            R_type ssub = dataA[i] - dataB[i] < std::numeric_limits<R_type>::lowest() ? std::numeric_limits<R_type>::lowest() : dataA[i] - dataB[i];
            EXPECT_EQ((u_type)std::abs(ssub), resC[i]);
        }

@@ -1573,19 +1605,27 @@ template<typename R> struct TheTest
        v_transpose4x4(a, b, c, d,
                       e, f, g, h);

-        Data<R> res[4] = {e, f, g, h};
-        // for (int i = 0; i < VTraits<R>::vlanes(); i += 4)
-        // {
-            int i = 0;
-            for (int j = 0; j < 4; ++j)
-            {
-                SCOPED_TRACE(cv::format("i=%d j=%d", i, j));
-                EXPECT_EQ(dataA[i + j], res[j][i]);
-                EXPECT_EQ(dataB[i + j], res[j][i + 1]);
-                EXPECT_EQ(dataC[i + j], res[j][i + 2]);
-                EXPECT_EQ(dataD[i + j], res[j][i + 3]);
-            }
-        // }
+        // Data<R> res[4] = {e, f, g, h}; // Generates incorrect data in certain RVV case.
+        Data<R> res0 = e, res1 = f, res2 = g, res3 = h;
+        EXPECT_EQ(dataA[0], res0[0]);
+        EXPECT_EQ(dataB[0], res0[1]);
+        EXPECT_EQ(dataC[0], res0[2]);
+        EXPECT_EQ(dataD[0], res0[3]);
+
+        EXPECT_EQ(dataA[1], res1[0]);
+        EXPECT_EQ(dataB[1], res1[1]);
+        EXPECT_EQ(dataC[1], res1[2]);
+        EXPECT_EQ(dataD[1], res1[3]);
+
+        EXPECT_EQ(dataA[2], res2[0]);
+        EXPECT_EQ(dataB[2], res2[1]);
+        EXPECT_EQ(dataC[2], res2[2]);
+        EXPECT_EQ(dataD[2], res2[3]);
+
+        EXPECT_EQ(dataA[3], res3[0]);
+        EXPECT_EQ(dataB[3], res3[1]);
+        EXPECT_EQ(dataC[3], res3[2]);
+        EXPECT_EQ(dataD[3], res3[3]);
        return *this;
    }

@@ -1737,7 +1777,34 @@ void test_hal_intrin_uint8()
    // typedef v_uint8 R;
    TheTest<v_uint8>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
+#if 0 // not implemented in rvv backend yet.
+        .test_interleave()
+        .test_cmp()
+        .test_dotprod_expand()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
+        .test_pack_b()
+        .test_popcount()
+#endif
        ;
 }

@@ -1747,7 +1814,34 @@ void test_hal_intrin_int8()
    // typedef v_int8 R;
    TheTest<v_int8>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_expand_q()
+        .test_addsub()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_absdiffs()
+        .test_abs()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
+#if 0
+        .test_interleave()
+        .test_cmp()
+        .test_dotprod_expand()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
+        .test_popcount()
+#endif
        ;
 }

@@ -1759,7 +1853,35 @@ void test_hal_intrin_uint16()
    // typedef v_uint16 R;
    TheTest<v_uint16>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_addsub()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_mul_hi()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
+#if 0
+        .test_interleave()
+        .test_cmp()
+        .test_dotprod_expand()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
+        .test_popcount()
+#endif
        ;
 }

@@ -1769,7 +1891,38 @@ void test_hal_intrin_int16()
    // typedef v_int16 R;
    TheTest<v_int16>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_addsub()
+        .test_arithm_wrap()
+        .test_mul()
+        .test_mul_expand()
+        .test_mul_hi()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_absdiffs()
+        .test_abs()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_unpack()
+        .test_reverse()
+#if 0
+        .test_interleave()
+
+        .test_cmp()
+        .test_dotprod()
+        .test_dotprod_expand()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
+        .test_popcount()
+#endif
        ;
 }

@@ -1781,7 +1934,34 @@ void test_hal_intrin_uint32()
    // typedef v_uint32 R;
    TheTest<v_uint32>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_mul_expand()
+        .test_shift<1>()
+        .test_shift<8>()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_broadcast_element<0>().test_broadcast_element<1>()
+        .test_extract_highest()
+        .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
+#if 0
+        .test_interleave()
+        .test_cmp()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_popcount()
+#endif
        ;
 }

@@ -1791,7 +1971,37 @@ void test_hal_intrin_int32()
    // typedef v_int32 R;
    TheTest<v_int32>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_expand()
+        .test_addsub()
+        .test_mul()
+        .test_abs()
+        .test_shift<1>().test_shift<8>()
+        .test_dotprod_expand_f64()
+        .test_logic()
        .test_min_max()
+        .test_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_broadcast_element<0>().test_broadcast_element<1>()
+        .test_extract_highest()
+        .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
+#if 0
+        .test_interleave()
+        .test_cmp()
+        .test_dotprod()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
+        .test_float_cvt32()
+        .test_float_cvt64()
+        .test_popcount()
+#endif
        ;
 }

@@ -1803,7 +2013,20 @@ void test_hal_intrin_uint64()
    // typedef v_uint64 R;
    TheTest<v_uint64>()
        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_reverse()
        ;
+#if 0
+    #if CV_SIMD_64F
+       .test_cmp64()
+    #endif
+#endif
 }

 void test_hal_intrin_int64()
@@ -1812,7 +2035,21 @@ void test_hal_intrin_int64()
    // typedef v_int64 R;
    TheTest<v_int64>()
        .test_loadstore()
+        .test_addsub()
+        .test_shift<1>().test_shift<8>()
+        .test_logic()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_cvt64_double()
+        .test_reverse()
        ;
+#if 0
+    #if CV_SIMD_64F
+       .test_cmp64()
+    #endif
+#endif
 }

 //============= Floating point =====================================================================
@@ -1822,18 +2059,64 @@ void test_hal_intrin_float32()
    // typedef v_float32 R;
    TheTest<v_float32>()
        .test_loadstore()
+        .test_interleave_pq()
+        .test_addsub()
+        .test_abs()
+        .test_mul()
+        .test_div()
+        .test_sqrt_abs()
        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
+        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_broadcast_element<0>().test_broadcast_element<1>()
+        .test_extract_highest()
+        .test_broadcast_highest()
+        .test_unpack()
+        .test_transpose()
+        .test_reverse()
+#if 0
+        .test_interleave()
+        .test_interleave_2channel()
+        .test_cmp()
+        .test_reduce()
+        .test_reduce_sad()
+        .test_float_math()
+        .test_float_cvt64()
+        .test_matmul()
+        .test_reduce_sum4()
+#endif
        ;
 }

 void test_hal_intrin_float64()
 {
    DUMP_ENTRY(v_float64);
-#if CV_SIMD_64F
+#if CV_SIMD_SCALABLE_64F
    // typedef v_float64 R;
    TheTest<v_float64>()
        .test_loadstore()
+        .test_addsub()
+        .test_mul()
+        .test_div()
+        .test_abs()
+        .test_sqrt_abs()
        .test_min_max()
+        .test_float_absdiff()
+        .test_mask()
+        .test_extract<0>().test_extract<1>()
+        .test_rotate<0>().test_rotate<1>()
+        .test_extract_n<0>().test_extract_n<1>()
+        .test_extract_highest()
+        .test_reverse()
+#if 0
+        .test_cmp()
+        .test_unpack()
+        .test_float_cvt32()
+        .test_float_math()
+#endif
        ;

 #endif
@@ -1851,6 +2134,7 @@ void test_hal_intrin_uint8()
    TheTest<v_uint8>()
        .test_loadstore()
        .test_interleave()
+        .test_interleave_pq()
        .test_expand()
        .test_expand_q()
        .test_addsub()
@@ -1874,6 +2158,7 @@ void test_hal_intrin_uint8()
        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
 #if CV_SIMD_WIDTH == 32
        .test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
@@ -1891,6 +2176,7 @@ void test_hal_intrin_int8()
    TheTest<v_int8>()
        .test_loadstore()
        .test_interleave()
+        .test_interleave_pq()
        .test_expand()
        .test_expand_q()
        .test_addsub()
@@ -1914,6 +2200,7 @@ void test_hal_intrin_int8()
        .test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
        .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
        ;
 }
@@ -1927,6 +2214,7 @@ void test_hal_intrin_uint16()
    TheTest<v_uint16>()
        .test_loadstore()
        .test_interleave()
+        .test_interleave_pq()
        .test_expand()
        .test_addsub()
        .test_arithm_wrap()
@@ -1951,6 +2239,7 @@ void test_hal_intrin_uint16()
        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
        ;
 }
@@ -1962,6 +2251,7 @@ void test_hal_intrin_int16()
    TheTest<v_int16>()
        .test_loadstore()
        .test_interleave()
+        .test_interleave_pq()
        .test_expand()
        .test_addsub()
        .test_arithm_wrap()
@@ -1988,6 +2278,7 @@ void test_hal_intrin_int16()
        .test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
        .test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
        ;
 }
@@ -2001,6 +2292,7 @@ void test_hal_intrin_uint32()
    TheTest<v_uint32>()
        .test_loadstore()
        .test_interleave()
+        // .test_interleave_pq() //not implemented in AVX
        .test_expand()
        .test_addsub()
        .test_mul()
@@ -2022,6 +2314,8 @@ void test_hal_intrin_uint32()
        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
        .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
+        .test_extract_highest()
+        .test_broadcast_highest()
        .test_transpose()
        ;
 }
@@ -2033,6 +2327,7 @@ void test_hal_intrin_int32()
    TheTest<v_int32>()
        .test_loadstore()
        .test_interleave()
+        // .test_interleave_pq() //not implemented in AVX
        .test_expand()
        .test_addsub()
        .test_mul()
@@ -2058,6 +2353,8 @@ void test_hal_intrin_int32()
        .test_float_cvt32()
        .test_float_cvt64()
        .test_transpose()
+        .test_extract_highest()
+        .test_broadcast_highest()
        ;
 }

@@ -2079,6 +2376,7 @@ void test_hal_intrin_uint64()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
        ;
 }
@@ -2099,6 +2397,7 @@ void test_hal_intrin_int64()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
        .test_cvt64_double()
        ;
@@ -2113,9 +2412,11 @@ void test_hal_intrin_float32()
        .test_loadstore()
        .test_interleave()
        .test_interleave_2channel()
+        // .test_interleave_pq() //not implemented in AVX
        .test_addsub()
        .test_mul()
        .test_div()
+        .test_abs()
        .test_cmp()
        .test_sqrt_abs()
        .test_min_max()
@@ -2134,6 +2435,8 @@ void test_hal_intrin_float32()
        .test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
        .test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
+        .test_extract_highest()
+        .test_broadcast_highest()
 #if CV_SIMD_WIDTH == 32
        .test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
        .test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
@@ -2151,6 +2454,7 @@ void test_hal_intrin_float64()
        .test_addsub()
        .test_mul()
        .test_div()
+        .test_abs()
        .test_cmp()
        .test_sqrt_abs()
        .test_min_max()
@@ -2163,6 +2467,7 @@ void test_hal_intrin_float64()
        .test_extract<0>().test_extract<1>()
        .test_rotate<0>().test_rotate<1>()
        .test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
+        .test_extract_highest()
        //.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
 #if CV_SIMD_WIDTH == 32
        .test_extract<2>().test_extract<3>()