From 6d7f5871dbf9e1e9082c6a99f5358e413e844b80 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Mon, 10 Sep 2018 16:56:29 +0300 Subject: [PATCH] added basic support for CV_16F (the new datatype etc.) (#12463) * added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see * fixed just introduced bug in norm; reverted erroneous changes in Torch importer (need to find a better solution) * addressed some issues found during the PR review * restored the patch to fix some perf test failures --- modules/core/include/opencv2/core.hpp | 1 + modules/core/include/opencv2/core/cvdef.h | 9 +- modules/core/include/opencv2/core/hal/hal.hpp | 6 + .../core/include/opencv2/core/hal/interface.h | 7 + modules/core/include/opencv2/core/mat.hpp | 2 + .../core/include/opencv2/core/saturate.hpp | 16 ++ modules/core/include/opencv2/core/traits.hpp | 20 ++ modules/core/src/array.cpp | 3 + modules/core/src/check.cpp | 6 +- modules/core/src/convert.cpp | 57 +++-- modules/core/src/convert.hpp | 33 ++- modules/core/src/convert_scale.cpp | 37 ++- modules/core/src/merge.cpp | 6 +- modules/core/src/norm.cpp | 74 ++++-- modules/core/src/out.cpp | 14 +- modules/core/src/rand.cpp | 217 ++++-------------- modules/core/src/split.cpp | 6 +- modules/core/test/ocl/test_matrix_expr.cpp | 2 +- modules/core/test/test_arithm.cpp | 4 +- modules/ts/include/opencv2/ts/ocl_test.hpp | 1 + modules/ts/include/opencv2/ts/ts_perf.hpp | 2 +- modules/ts/src/ts_func.cpp | 30 ++- modules/ts/src/ts_perf.cpp | 6 +- 23 files changed, 281 insertions(+), 278 deletions(-) diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index e9f09e235b..1e271a6fd3 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -3009,6 +3009,7 @@ public: virtual Ptr format(const Mat& mtx) const = 0; + virtual void set16fPrecision(int p = 4) = 0; virtual void 
set32fPrecision(int p = 8) = 0; virtual void set64fPrecision(int p = 16) = 0; virtual void setMultiline(bool ml = true) = 0; diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 5c8b9f9b5c..21fce842a8 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -317,13 +317,10 @@ Cv64suf; #define CV_IS_SUBMAT(flags) ((flags) & CV_MAT_SUBMAT_FLAG) /** Size of each channel item, - 0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */ -#define CV_ELEM_SIZE1(type) \ - ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15) + 0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */ +#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) -/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */ -#define CV_ELEM_SIZE(type) \ - (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3)) +#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) #ifndef MIN # define MIN(a,b) ((a) > (b) ? 
(b) : (a)) diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp index 68900ec428..0d68078d98 100644 --- a/modules/core/include/opencv2/core/hal/hal.hpp +++ b/modules/core/include/opencv2/core/hal/hal.hpp @@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars ); CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars ); +CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len ); +CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len ); + +CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ); +CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ); + struct CV_EXPORTS DFT1D { static Ptr create(int len, int count, int depth, int flags, bool * useBuffer = 0); diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h index 8f640254c3..70de04d762 100644 --- a/modules/core/include/opencv2/core/hal/interface.h +++ b/modules/core/include/opencv2/core/hal/interface.h @@ -76,6 +76,7 @@ typedef signed char schar; #define CV_32F 5 #define CV_64F 6 #define CV_USRTYPE1 7 +#define CV_16F 7 #define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1) #define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK) @@ -124,6 +125,12 @@ typedef signed char schar; #define CV_64FC3 CV_MAKETYPE(CV_64F,3) #define CV_64FC4 CV_MAKETYPE(CV_64F,4) #define CV_64FC(n) CV_MAKETYPE(CV_64F,(n)) + +#define CV_16FC1 CV_MAKETYPE(CV_16F,1) +#define CV_16FC2 CV_MAKETYPE(CV_16F,2) +#define CV_16FC3 CV_MAKETYPE(CV_16F,3) +#define CV_16FC4 CV_MAKETYPE(CV_16F,4) +#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n)) //! @} //! 
@name Comparison operation diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index d918ee9652..2efcf17b6c 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -296,8 +296,10 @@ public: DEPTH_MASK_32S = 1 << CV_32S, DEPTH_MASK_32F = 1 << CV_32F, DEPTH_MASK_64F = 1 << CV_64F, + DEPTH_MASK_16F = 1 << CV_16F, DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1, DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S, + DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1, DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F }; diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp index 118599f8f9..ced5e66f43 100644 --- a/modules/core/include/opencv2/core/saturate.hpp +++ b/modules/core/include/opencv2/core/saturate.hpp @@ -158,6 +158,22 @@ template<> inline uint64 saturate_cast(int64 v) { return (uint64)st template<> inline int64 saturate_cast(uint64 v) { return (int64)std::min(v, (uint64)LLONG_MAX); } +/** @overload */ +template static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); } + +// in theory, we could use a LUT for 8u/8s->16f conversion, +// but with hardware support for FP32->FP16 conversion the current approach is preferable +template<> inline float16_t saturate_cast(uchar v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(schar v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(ushort v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(short v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(unsigned v){ return float16_t((float)v); } +template<> inline float16_t saturate_cast(int v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(uint64 v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(int64 v) { return float16_t((float)v); } +template<> 
inline float16_t saturate_cast(float v) { return float16_t(v); } +template<> inline float16_t saturate_cast(double v) { return float16_t((float)v); } + //! @} } // cv diff --git a/modules/core/include/opencv2/core/traits.hpp b/modules/core/include/opencv2/core/traits.hpp index 6cb10f44cf..52ab083ca4 100644 --- a/modules/core/include/opencv2/core/traits.hpp +++ b/modules/core/include/opencv2/core/traits.hpp @@ -261,6 +261,20 @@ public: }; }; +template<> class DataType +{ +public: + typedef float16_t value_type; + typedef float work_type; + typedef value_type channel_type; + typedef value_type vec_type; + enum { generic_type = 0, + depth = CV_16F, + channels = 1, + fmt = (int)'h', + type = CV_MAKETYPE(depth, channels) + }; +}; /** @brief A helper class for cv::DataType @@ -330,6 +344,12 @@ template<> class TypeDepth typedef double value_type; }; +template<> class TypeDepth +{ + enum { depth = CV_16F }; + typedef float16_t value_type; +}; + #endif //! @} diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp index dde8b2606f..49b533b8e9 100644 --- a/modules/core/src/array.cpp +++ b/modules/core/src/array.cpp @@ -3262,6 +3262,9 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to) case CV_64F: scalarToRawData_(s, (double*)_buf, cn, unroll_to); break; + case CV_16F: + scalarToRawData_(s, (float16_t*)_buf, cn, unroll_to); + break; default: CV_Error(CV_StsUnsupportedFormat,""); } diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp index 92a3b6006e..676f755d1d 100644 --- a/modules/core/src/check.cpp +++ b/modules/core/src/check.cpp @@ -43,15 +43,15 @@ static const char* getTestOpMath(unsigned testOp) const char* depthToString_(int depth) { - static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" }; - return (depth <= CV_USRTYPE1 && depth >= 0) ? 
depthNames[depth] : NULL; + static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_16F" }; + return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL; } const cv::String typeToString_(int type) { int depth = CV_MAT_DEPTH(type); int cn = CV_MAT_CN(type); - if (depth >= 0 && depth <= CV_USRTYPE1) + if (depth >= 0 && depth <= CV_16F) return cv::format("%sC%d", depthToString_(depth), cn); return cv::String(); } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index a54f4c1bcd..09a7d345d4 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -8,7 +8,7 @@ namespace cv { -/*namespace hal { +namespace hal { void cvt16f32f( const float16_t* src, float* dst, int len ) { @@ -50,21 +50,21 @@ void cvt32f16f( const float* src, float16_t* dst, int len ) dst[j] = float16_t(src[j]); } -/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ) +void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ) { // the loop is simple enough, so we let the compiler to vectorize it for( int i = 0; i < len; i++ ) - arr[i] = scaleBiasPairs[i*2 + 1]; + arr[i] += scaleBiasPairs[i*2 + 1]; } void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ) { // the loop is simple enough, so we let the compiler to vectorize it for( int i = 0; i < len; i++ ) - arr[i] = scaleBiasPairs[i*2 + 1]; + arr[i] += scaleBiasPairs[i*2 + 1]; } -}*/ +} template inline void cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size ) @@ -150,7 +150,7 @@ DEF_CVT_FUNC(8u16s, cvt_, uchar, short, v_int16) DEF_CVT_FUNC(8u32s, cvt_, uchar, int, v_int32) DEF_CVT_FUNC(8u32f, cvt_, uchar, float, v_float32) DEF_CVT_FUNC(8u64f, cvt_, uchar, double, v_int32) -//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32) +DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32) ////////////////////// 8s -> ... 
//////////////////////// @@ -160,7 +160,7 @@ DEF_CVT_FUNC(8s16s, cvt_, schar, short, v_int16) DEF_CVT_FUNC(8s32s, cvt_, schar, int, v_int32) DEF_CVT_FUNC(8s32f, cvt_, schar, float, v_float32) DEF_CVT_FUNC(8s64f, cvt_, schar, double, v_int32) -//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32) +DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32) ////////////////////// 16u -> ... //////////////////////// @@ -170,7 +170,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short, v_int32) DEF_CVT_FUNC(16u32s, cvt_, ushort, int, v_int32) DEF_CVT_FUNC(16u32f, cvt_, ushort, float, v_float32) DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32) -//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32) +DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32) ////////////////////// 16s -> ... //////////////////////// @@ -180,7 +180,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32) DEF_CVT_FUNC(16s32s, cvt_, short, int, v_int32) DEF_CVT_FUNC(16s32f, cvt_, short, float, v_float32) DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32) -//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32) +DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32) ////////////////////// 32s -> ... //////////////////////// @@ -190,7 +190,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32) DEF_CVT_FUNC(32s16s, cvt_, int, short, v_int32) DEF_CVT_FUNC(32s32f, cvt_, int, float, v_float32) DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32) -//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32) +DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32) ////////////////////// 32f -> ... //////////////////////// @@ -210,17 +210,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32) DEF_CVT_FUNC(64f16s, cvt_, double, short, v_int32) DEF_CVT_FUNC(64f32s, cvt_, double, int, v_int32) DEF_CVT_FUNC(64f32f, cvt_, double, float, v_float32) -//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32) +DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32) ////////////////////// 16f -> ... 
//////////////////////// -//DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32) -//DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32) -//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32) -//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32) -//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32) +DEF_CVT_FUNC(16f8u, cvt_, float16_t, uchar, v_float32) +DEF_CVT_FUNC(16f8s, cvt_, float16_t, schar, v_float32) +DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32) +DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short, v_float32) +DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int, v_float32) DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float, v_float32) -//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32) +DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32) ///////////// "conversion" w/o conversion /////////////// @@ -339,42 +339,41 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth) { (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u), (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u), - (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u) + (BinaryFunc)GET_OPTIMIZED(cvt64f8u), (BinaryFunc)(cvt16f8u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s), (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s), - (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s) + (BinaryFunc)GET_OPTIMIZED(cvt64f8s), (BinaryFunc)(cvt16f8s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u), - (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u) + (BinaryFunc)GET_OPTIMIZED(cvt64f16u), (BinaryFunc)(cvt16f16u) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u16s), 
(BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s), (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s), - (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s) + (BinaryFunc)GET_OPTIMIZED(cvt64f16s), (BinaryFunc)(cvt16f16s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s), (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s), - (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s) + (BinaryFunc)GET_OPTIMIZED(cvt64f32s), (BinaryFunc)(cvt16f32s) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f), (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s, - (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f) + (BinaryFunc)GET_OPTIMIZED(cvt64f32f), (BinaryFunc)(cvt16f32f) }, { (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f), (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f), - (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f) + (BinaryFunc)(cvt64s), (BinaryFunc)(cvt16f64f) }, { - 0, 0, 0, 0, 0, 0, 0, 0 - //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f), - //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u) + (BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f), + (BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u) } }; return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)]; @@ -481,7 +480,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst ) if(_dst.fixedType()) { ddepth = _dst.depth(); - CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/); + 
CV_Assert(ddepth == CV_16S || ddepth == CV_16F); CV_Assert(_dst.channels() == _src.channels()); } else @@ -489,7 +488,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst ) func = (BinaryFunc)cvt32f16f; break; case CV_16S: - //case CV_16F: + case CV_16F: ddepth = CV_32F; func = (BinaryFunc)cvt16f32f; break; diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp index 0d0aa3a770..4b9ddbb413 100644 --- a/modules/core/src/convert.hpp +++ b/modules/core/src/convert.hpp @@ -150,12 +150,11 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b) static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b) { a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); } -//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) -//{ -// a = vx_load_expand(ptr); -// b = vx_load_expand(ptr + v_float32::nlanes); -//} - +static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b) +{ + a = vx_load_expand(ptr); + b = vx_load_expand(ptr + v_float32::nlanes); +} static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b) { @@ -295,12 +294,12 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b b = vx_load(ptr + v_float64::nlanes); } -//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) -//{ -// v_float32 v0 = vx_load_expand(ptr); -// a = v_cvt_f64(v0); -// b = v_cvt_f64_high(v0); -//} +static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b) +{ + v_float32 v0 = vx_load_expand(ptr); + a = v_cvt_f64(v0); + b = v_cvt_f64_high(v0); +} static inline void v_store_as(double* ptr, const v_float32& a) { @@ -349,11 +348,11 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float v_store(ptr, v); } -//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b) -//{ -// v_float32 v = v_cvt_f32(a, b); 
-// v_pack_store(ptr, v); -//} +static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b) +{ + v_float32 v = v_cvt_f32(a, b); + v_pack_store(ptr, v); +} #else diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp index 0d4b5151a3..751f7fe626 100644 --- a/modules/core/src/convert_scale.cpp +++ b/modules/core/src/convert_scale.cpp @@ -222,7 +222,7 @@ DEF_CVT_SCALE_FUNC(16s8u, cvt_32f, short, uchar, float) DEF_CVT_SCALE_FUNC(32s8u, cvt_32f, int, uchar, float) DEF_CVT_SCALE_FUNC(32f8u, cvt_32f, float, uchar, float) DEF_CVT_SCALE_FUNC(64f8u, cvt_32f, double, uchar, float) -//DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float) +DEF_CVT_SCALE_FUNC(16f8u, cvt_32f, float16_t, uchar, float) DEF_CVT_SCALE_FUNC(8u8s, cvt_32f, uchar, schar, float) DEF_CVT_SCALE_FUNC(8s, cvt_32f, schar, schar, float) @@ -231,7 +231,7 @@ DEF_CVT_SCALE_FUNC(16s8s, cvt_32f, short, schar, float) DEF_CVT_SCALE_FUNC(32s8s, cvt_32f, int, schar, float) DEF_CVT_SCALE_FUNC(32f8s, cvt_32f, float, schar, float) DEF_CVT_SCALE_FUNC(64f8s, cvt_32f, double, schar, float) -//DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float) +DEF_CVT_SCALE_FUNC(16f8s, cvt_32f, float16_t, schar, float) DEF_CVT_SCALE_FUNC(8u16u, cvt_32f, uchar, ushort, float) DEF_CVT_SCALE_FUNC(8s16u, cvt_32f, schar, ushort, float) @@ -240,7 +240,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short, ushort, float) DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int, ushort, float) DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float, ushort, float) DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float) -//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float) +DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float) DEF_CVT_SCALE_FUNC(8u16s, cvt_32f, uchar, short, float) DEF_CVT_SCALE_FUNC(8s16s, cvt_32f, schar, short, float) @@ -249,7 +249,7 @@ DEF_CVT_SCALE_FUNC(16s, cvt_32f, short, short, float) DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int, short, float) DEF_CVT_SCALE_FUNC(32f16s, 
cvt_32f, float, short, float) DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float) -//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float) +DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float) DEF_CVT_SCALE_FUNC(8u32s, cvt_32f, uchar, int, float) DEF_CVT_SCALE_FUNC(8s32s, cvt_32f, schar, int, float) @@ -258,7 +258,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short, int, float) DEF_CVT_SCALE_FUNC(32s, cvt_64f, int, int, double) DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float, int, float) DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double) -//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float) +DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float) DEF_CVT_SCALE_FUNC(8u32f, cvt_32f, uchar, float, float) DEF_CVT_SCALE_FUNC(8s32f, cvt_32f, schar, float, float) @@ -267,7 +267,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short, float, float) DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int, float, float) DEF_CVT_SCALE_FUNC(32f, cvt_32f, float, float, float) DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double) -//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float) +DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float) DEF_CVT_SCALE_FUNC(8u64f, cvt_64f, uchar, double, double) DEF_CVT_SCALE_FUNC(8s64f, cvt_64f, schar, double, double) @@ -276,16 +276,16 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short, double, double) DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int, double, double) DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float, double, double) DEF_CVT_SCALE_FUNC(64f, cvt_64f, double, double, double) -//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double) +DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double) -/*DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float) +DEF_CVT_SCALE_FUNC(8u16f, cvt1_32f, uchar, float16_t, float) DEF_CVT_SCALE_FUNC(8s16f, cvt1_32f, schar, float16_t, float) DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float) DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short, float16_t, float) 
DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int, float16_t, float) DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float, float16_t, float) DEF_CVT_SCALE_FUNC(64f16f, cvt_64f, double, float16_t, double) -DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float)*/ +DEF_CVT_SCALE_FUNC(16f, cvt1_32f, float16_t, float16_t, float) static BinaryFunc getCvtScaleAbsFunc(int depth) { @@ -306,43 +306,42 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth) { (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u), - (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u + (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s), - (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s + (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u), - (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u + (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s), - (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s + (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), 
(BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s), - (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s + (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s }, { (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f), - (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f + (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f }, { (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f, (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f, - (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f + (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f }, { - 0, 0, 0, 0, 0, 0, 0, 0 - /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f, + (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f, (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f, - (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/ + (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f }, }; diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp index 300a718506..5f4eaf8f00 100644 --- a/modules/core/src/merge.cpp +++ b/modules/core/src/merge.cpp @@ -216,8 +216,10 @@ static MergeFunc getMergeFunc(int depth) { static MergeFunc mergeTab[] = { - (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), - (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0 + 
(MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u) }; return mergeTab[depth]; diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp index f2171a907a..dc49937412 100644 --- a/modules/core/src/norm.cpp +++ b/modules/core/src/norm.cpp @@ -723,7 +723,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) return result; } - NormFunc func = getNormFunc(normType >> 1, depth); + NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth); CV_Assert( func != 0 ); const Mat* arrays[] = {&src, &mask, 0}; @@ -737,19 +737,31 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) result; result.d = 0; NAryMatIterator it(arrays, ptrs); - int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0; - bool blockSum = (normType == NORM_L1 && depth <= CV_16S) || + int j, total = (int)it.size, blockSize = total; + bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) || ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S); int isum = 0; int *ibuf = &result.i; + AutoBuffer fltbuf_; + float* fltbuf = 0; size_t esz = 0; if( blockSum ) { - intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn; - blockSize = std::min(blockSize, intSumBlockSize); - ibuf = &isum; esz = src.elemSize(); + + if( depth == CV_16F ) + { + blockSize = std::min(blockSize, 1024); + fltbuf_.allocate(blockSize); + fltbuf = fltbuf_.data(); + } + else + { + int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? 
(1 << 23) : (1 << 15))/cn; + blockSize = std::min(blockSize, intSumBlockSize); + ibuf = &isum; + } } for( size_t i = 0; i < it.nplanes; i++, ++it ) @@ -757,13 +769,17 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) for( j = 0; j < total; j += blockSize ) { int bsz = std::min(total - j, blockSize); - func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn ); - count += bsz; - if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) + const uchar* data = ptrs[0]; + if( depth == CV_16F ) + { + hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz); + data = (const uchar*)fltbuf; + } + func( data, ptrs[1], (uchar*)ibuf, bsz, cn ); + if( blockSum && depth != CV_16F ) { result.d += isum; isum = 0; - count = 0; } ptrs[0] += bsz*esz; if( ptrs[1] ) @@ -1181,7 +1197,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m return result; } - NormDiffFunc func = getNormDiffFunc(normType >> 1, depth); + NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth); CV_Assert( func != 0 ); const Mat* arrays[] = {&src1, &src2, &mask, 0}; @@ -1196,19 +1212,31 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m result; result.d = 0; NAryMatIterator it(arrays, ptrs); - int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0; - bool blockSum = (normType == NORM_L1 && depth <= CV_16S) || + int j, total = (int)it.size, blockSize = total; + bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) || ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S); unsigned isum = 0; unsigned *ibuf = &result.u; + AutoBuffer fltbuf_; + float* fltbuf = 0; size_t esz = 0; if( blockSum ) { - intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? 
(1 << 23) : (1 << 15); - blockSize = std::min(blockSize, intSumBlockSize); - ibuf = &isum; esz = src1.elemSize(); + + if( depth == CV_16F ) + { + blockSize = std::min(blockSize, 1024); + fltbuf_.allocate(blockSize*2); + fltbuf = fltbuf_.data(); + } + else + { + int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn; + blockSize = std::min(blockSize, intSumBlockSize); + ibuf = &isum; + } } for( size_t i = 0; i < it.nplanes; i++, ++it ) @@ -1216,13 +1244,19 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m for( j = 0; j < total; j += blockSize ) { int bsz = std::min(total - j, blockSize); - func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn ); - count += bsz; - if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) ) + const uchar *data0 = ptrs[0], *data1 = ptrs[1]; + if( depth == CV_16F ) + { + hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz); + hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz, bsz); + data0 = (const uchar*)fltbuf; + data1 = (const uchar*)(fltbuf + bsz); + } + func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn ); + if( blockSum && depth != CV_16F ) { result.d += isum; isum = 0; - count = 0; } ptrs[0] += bsz*esz; ptrs[1] += bsz*esz; diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp index 9f28654358..89770fe33f 100644 --- a/modules/core/src/out.cpp +++ b/modules/core/src/out.cpp @@ -77,6 +77,7 @@ namespace cv void valueToStr32s() { sprintf(buf, "%d", mtx.ptr(row, col)[cn]); } void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr(row, col)[cn]); } void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr(row, col)[cn]); } + void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr(row, col)[cn]); } void valueToStrOther() { buf[0] = 0; } public: @@ -115,7 +116,8 @@ namespace cv case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break; case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break; case 
CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break; - default: valueToStr = &FormattedImpl::valueToStrOther; break; + default: CV_Assert(mtx.depth() == CV_16F); + valueToStr = &FormattedImpl::valueToStr16f; } } @@ -256,7 +258,12 @@ namespace cv class FormatterBase : public Formatter { public: - FormatterBase() : prec32f(8), prec64f(16), multiline(true) {} + FormatterBase() : prec16f(4), prec32f(8), prec64f(16), multiline(true) {} + + void set16fPrecision(int p) CV_OVERRIDE + { + prec16f = p; + } void set32fPrecision(int p) CV_OVERRIDE { @@ -274,6 +281,7 @@ namespace cv } protected: + int prec16f; int prec32f; int prec64f; int multiline; @@ -325,7 +333,7 @@ namespace cv { static const char* numpyTypes[] = { - "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64" + "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "float16" }; char braces[5] = {'[', ']', ',', '[', ']'}; if (mtx.cols == 1) diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp index e791fd131b..ea45ec4ea0 100644 --- a/modules/core/src/rand.cpp +++ b/modules/core/src/rand.cpp @@ -48,18 +48,6 @@ #include "precomp.hpp" -#if defined _WIN32 || defined WINCE - #include - #undef small - #undef min - #undef max - #undef abs -#endif - -#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP) - #include "emmintrin.h" -#endif - namespace cv { @@ -74,12 +62,6 @@ namespace cv #define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32)) -#ifdef __PPC64__ - #define PPC_MUL_ADD(ret, tmp, p0, p1) \ - asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \ - : "f" (tmp), "f" (p0), "f" (p1)) -#endif - /***************************************************************************************\ * Pseudo-Random Number Generators (PRNGs) * \***************************************************************************************/ @@ -154,59 +136,26 @@ template static void randi_( T* arr, int len, uint64* state, const DivStruct* p ) { uint64 temp 
= *state; - int i = 0; - unsigned t0, t1, v0, v1; - - for( i = 0; i <= len - 4; i += 4 ) + for( int i = 0; i < len; i++ ) { temp = RNG_NEXT(temp); - t0 = (unsigned)temp; - temp = RNG_NEXT(temp); - t1 = (unsigned)temp; - v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32); - v1 = (unsigned)(((uint64)t1 * p[i+1].M) >> 32); - v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2; - v1 = (v1 + ((t1 - v1) >> p[i+1].sh1)) >> p[i+1].sh2; - v0 = t0 - v0*p[i].d + p[i].delta; - v1 = t1 - v1*p[i+1].d + p[i+1].delta; - arr[i] = saturate_cast((int)v0); - arr[i+1] = saturate_cast((int)v1); - - temp = RNG_NEXT(temp); - t0 = (unsigned)temp; - temp = RNG_NEXT(temp); - t1 = (unsigned)temp; - v0 = (unsigned)(((uint64)t0 * p[i+2].M) >> 32); - v1 = (unsigned)(((uint64)t1 * p[i+3].M) >> 32); - v0 = (v0 + ((t0 - v0) >> p[i+2].sh1)) >> p[i+2].sh2; - v1 = (v1 + ((t1 - v1) >> p[i+3].sh1)) >> p[i+3].sh2; - v0 = t0 - v0*p[i+2].d + p[i+2].delta; - v1 = t1 - v1*p[i+3].d + p[i+3].delta; - arr[i+2] = saturate_cast((int)v0); - arr[i+3] = saturate_cast((int)v1); + unsigned t = (unsigned)temp; + unsigned v = (unsigned)(((uint64)t * p[i].M) >> 32); + v = (v + ((t - v) >> p[i].sh1)) >> p[i].sh2; + v = t - v*p[i].d + p[i].delta; + arr[i] = saturate_cast((int)v); } - - for( ; i < len; i++ ) - { - temp = RNG_NEXT(temp); - t0 = (unsigned)temp; - v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32); - v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2; - v0 = t0 - v0*p[i].d + p[i].delta; - arr[i] = saturate_cast((int)v0); - } - *state = temp; } #define DEF_RANDI_FUNC(suffix, type) \ static void randBits_##suffix(type* arr, int len, uint64* state, \ - const Vec2i* p, bool small_flag) \ + const Vec2i* p, void*, bool small_flag) \ { randBits_(arr, len, state, p, small_flag); } \ \ static void randi_##suffix(type* arr, int len, uint64* state, \ - const DivStruct* p, bool ) \ + const DivStruct* p, void*, bool ) \ { randi_(arr, len, state, p); } DEF_RANDI_FUNC(8u, uchar) @@ -215,131 +164,62 @@ DEF_RANDI_FUNC(16u, ushort) 
DEF_RANDI_FUNC(16s, short) DEF_RANDI_FUNC(32s, int) -static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool ) +static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, void*, bool ) { uint64 temp = *state; - int i = 0; - - for( ; i <= len - 4; i += 4 ) + for( int i = 0; i < len; i++ ) { - float f[4]; - f[0] = (float)(int)(temp = RNG_NEXT(temp)); - f[1] = (float)(int)(temp = RNG_NEXT(temp)); - f[2] = (float)(int)(temp = RNG_NEXT(temp)); - f[3] = (float)(int)(temp = RNG_NEXT(temp)); - - // handwritten SSE is required not for performance but for numerical stability! - // both 32-bit gcc and MSVC compilers trend to generate double precision SSE - // while 64-bit compilers generate single precision SIMD instructions - // so manual vectorisation forces all compilers to the single precision -#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP) - __m128 q0 = _mm_loadu_ps((const float*)(p + i)); - __m128 q1 = _mm_loadu_ps((const float*)(p + i + 2)); - - __m128 q01l = _mm_unpacklo_ps(q0, q1); - __m128 q01h = _mm_unpackhi_ps(q0, q1); - - __m128 p0 = _mm_unpacklo_ps(q01l, q01h); - __m128 p1 = _mm_unpackhi_ps(q01l, q01h); - - _mm_storeu_ps(arr + i, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(f), p0), p1)); -#elif defined __ARM_NEON && defined __aarch64__ - // handwritten NEON is required not for performance but for numerical stability! - // 64bit gcc tends to use fmadd instead of separate multiply and add - // use volatile to ensure to separate the multiply and add - float32x4x2_t q = vld2q_f32((const float*)(p + i)); - - float32x4_t p0 = q.val[0]; - float32x4_t p1 = q.val[1]; - - volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0); - vst1q_f32(arr+i, vaddq_f32(v0, p1)); -#elif defined __PPC64__ - // inline asm is required for numerical stability! 
- // compilers tends to use floating multiply-add single(fmadds) - // instead of separate multiply and add - PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]); - PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]); - PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]); - PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]); -#else - arr[i+0] = f[0]*p[i+0][0] + p[i+0][1]; - arr[i+1] = f[1]*p[i+1][0] + p[i+1][1]; - arr[i+2] = f[2]*p[i+2][0] + p[i+2][1]; - arr[i+3] = f[3]*p[i+3][0] + p[i+3][1]; -#endif + int t = (int)(temp = RNG_NEXT(temp)); + arr[i] = (float)(t*p[i][0]); } - - for( ; i < len; i++ ) - { - temp = RNG_NEXT(temp); -#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP) - _mm_store_ss(arr + i, _mm_add_ss( - _mm_mul_ss(_mm_set_ss((float)(int)temp), _mm_set_ss(p[i][0])), - _mm_set_ss(p[i][1])) - ); -#elif defined __ARM_NEON && defined __aarch64__ - float32x2_t t = vadd_f32(vmul_f32( - vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])), - vdup_n_f32(p[i][1])); - arr[i] = vget_lane_f32(t, 0); -#elif defined __PPC64__ - PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]); -#else - arr[i] = (int)temp*p[i][0] + p[i][1]; -#endif - } - *state = temp; -} + // add bias separately to make the generated random numbers + // more deterministic, independent of + // architecture details (FMA instruction use etc.) 
+ hal::addRNGBias32f(arr, &p[0][0], len); +} static void -randf_64f( double* arr, int len, uint64* state, const Vec2d* p, bool ) +randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool ) { uint64 temp = *state; - int64 v = 0; - int i; - - for( i = 0; i <= len - 4; i += 4 ) - { - double f0, f1; - - temp = RNG_NEXT(temp); - v = (temp >> 32)|(temp << 32); - f0 = v*p[i][0] + p[i][1]; - temp = RNG_NEXT(temp); - v = (temp >> 32)|(temp << 32); - f1 = v*p[i+1][0] + p[i+1][1]; - arr[i] = f0; arr[i+1] = f1; - - temp = RNG_NEXT(temp); - v = (temp >> 32)|(temp << 32); - f0 = v*p[i+2][0] + p[i+2][1]; - temp = RNG_NEXT(temp); - v = (temp >> 32)|(temp << 32); - f1 = v*p[i+3][0] + p[i+3][1]; - arr[i+2] = f0; arr[i+3] = f1; - } - - for( ; i < len; i++ ) + for( int i = 0; i < len; i++ ) { temp = RNG_NEXT(temp); - v = (temp >> 32)|(temp << 32); - arr[i] = v*p[i][0] + p[i][1]; + int64 v = (temp >> 32)|(temp << 32); + arr[i] = v*p[i][0]; } - *state = temp; + + hal::addRNGBias64f(arr, &p[0][0], len); } -typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, bool small_flag); +static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool ) +{ + uint64 temp = *state; + for( int i = 0; i < len; i++ ) + { + float f = (float)(int)(temp = RNG_NEXT(temp)); + fbuf[i] = f*p[i][0]; + } + *state = temp; + + // add bias separately to make the generated random numbers + // more deterministic, independent of + // architecture details (FMA instruction use etc.) 
+ hal::addRNGBias32f(fbuf, &p[0][0], len); + hal::cvt32f16f(fbuf, arr, len); +} + +typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag); static RandFunc randTab[][8] = { { (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s, - (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, 0 + (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, (RandFunc)randf_16f }, { (RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u, (RandFunc)randBits_16s, @@ -350,7 +230,7 @@ static RandFunc randTab[][8] = /* The code below implements the algorithm described in "The Ziggurat Method for Generating Random Variables" - by Marsaglia and Tsang, Journal of Statistical Software. + by George Marsaglia and Wai Wan Tsang, Journal of Statistical Software, 2007. */ static void randn_0_1_32f( float* arr, int len, uint64* state ) @@ -631,8 +511,8 @@ void RNG::fill( InputOutputArray _mat, int disttype, // for each channel i compute such dparam[0][i] & dparam[1][i], // so that a signed 32/64-bit integer X is transformed to // the range [param1.val[i], param2.val[i]) using - // dparam[1][i]*X + dparam[0][i] - if( depth == CV_32F ) + // dparam[0][i]*X + dparam[1][i] + if( depth != CV_64F ) { fp = (Vec2f*)(parambuf + cn*2); for( j = 0; j < cn; j++ ) @@ -704,6 +584,7 @@ void RNG::fill( InputOutputArray _mat, int disttype, AutoBuffer buf; uchar* param = 0; float* nbuf = 0; + float* tmpbuf = 0; if( disttype == UNIFORM ) { @@ -727,12 +608,14 @@ void RNG::fill( InputOutputArray _mat, int disttype, p[j + k] = ip[k]; } } - else if( depth == CV_32F ) + else if( depth != CV_64F ) { Vec2f* p = (Vec2f*)param; for( j = 0; j < blockSize*cn; j += cn ) for( k = 0; k < cn; k++ ) p[j + k] = fp[k]; + if( depth == CV_16F ) + tmpbuf = (float*)p + blockSize*cn*2; } else { @@ -755,7 +638,7 @@ void RNG::fill( InputOutputArray _mat, int disttype, int len = std::min(total - j, blockSize); if( disttype == 
CV_RAND_UNI ) - func( ptr, len*cn, &state, param, smallFlag ); + func( ptr, len*cn, &state, param, tmpbuf, smallFlag ); else { randn_0_1_32f(nbuf, len*cn, &state); diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp index 3fab6874b7..34d331a800 100644 --- a/modules/core/src/split.cpp +++ b/modules/core/src/split.cpp @@ -224,8 +224,10 @@ static SplitFunc getSplitFunc(int depth) { static SplitFunc splitTab[] = { - (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), - (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 + (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), + (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u) }; return splitTab[depth]; diff --git a/modules/core/test/ocl/test_matrix_expr.cpp b/modules/core/test/ocl/test_matrix_expr.cpp index 11be5a3a36..7a5ff72cb2 100644 --- a/modules/core/test/ocl/test_matrix_expr.cpp +++ b/modules/core/test/ocl/test_matrix_expr.cpp @@ -78,7 +78,7 @@ OCL_TEST_P(UMatExpr, Ones) //////////////////////////////// Instantiation ///////////////////////////////////////////////// -OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS)); +OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS)); } } // namespace opencv_test::ocl diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index c81f8d83e1..ccf68cbf90 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -476,7 +476,7 @@ struct CopyOp : public 
BaseElemWiseOp } int getRandomType(RNG& rng) { - return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS); + return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS); } double getMaxErr(int) { @@ -498,7 +498,7 @@ struct SetOp : public BaseElemWiseOp } int getRandomType(RNG& rng) { - return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS); + return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS); } double getMaxErr(int) { diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp index 6126883091..11572e9f48 100644 --- a/modules/ts/include/opencv2/ts/ocl_test.hpp +++ b/modules/ts/include/opencv2/ts/ocl_test.hpp @@ -372,6 +372,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int) #define OCL_ON(...) cv::ocl::setUseOpenCL(true); __VA_ARGS__ ; #define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F) +#define OCL_ALL_DEPTHS_16F Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F) #define OCL_ALL_CHANNELS Values(1, 2, 3, 4) CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA, INTER_LINEAR_EXACT) diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp index 83988c2b86..586d83fae6 100644 --- a/modules/ts/include/opencv2/ts/ts_perf.hpp +++ b/modules/ts/include/opencv2/ts/ts_perf.hpp @@ -160,7 +160,7 @@ private: }; \ static inline void PrintTo(const class_name& t, std::ostream* os) { t.PrintTo(os); } } -CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_USRTYPE1) +CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F) /*****************************************************************************************\ * Regression control utility for performance testing * diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 60c88a7e65..3b9e0198f2 100644 --- 
a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -72,10 +72,10 @@ int randomType(RNG& rng, int typeMask, int minChannels, int maxChannels) { int channels = rng.uniform(minChannels, maxChannels+1); int depth = 0; - CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0); + CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0); for(;;) { - depth = rng.uniform(CV_8U, CV_64F+1); + depth = rng.uniform(CV_8U, CV_16F+1); if( ((1 << depth) & typeMask) != 0 ) break; } @@ -1260,6 +1260,13 @@ norm_(const _Tp* src1, const _Tp* src2, size_t total, int cn, int normType, doub double norm(InputArray _src, int normType, InputArray _mask) { Mat src = _src.getMat(), mask = _mask.getMat(); + if( src.depth() == CV_16F ) + { + Mat src32f; + src.convertTo(src32f, CV_32F); + return cvtest::norm(src32f, normType, _mask); + } + if( normType == NORM_HAMMING || normType == NORM_HAMMING2 ) { if( !mask.empty() ) @@ -1340,6 +1347,14 @@ double norm(InputArray _src, int normType, InputArray _mask) double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask) { Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat(); + if( src1.depth() == CV_16F ) + { + Mat src1_32f, src2_32f; + src1.convertTo(src1_32f, CV_32F); + src2.convertTo(src2_32f, CV_32F); + return cvtest::norm(src1_32f, src2_32f, normType, _mask); + } + bool isRelative = (normType & NORM_RELATIVE) != 0; normType &= ~NORM_RELATIVE; @@ -1982,11 +1997,20 @@ int check( const Mat& a, double fmin, double fmax, vector* _idx ) // success_err_level is maximum allowed difference, idx is the index of the first // element for which difference is >success_err_level // (or index of element with the maximum difference) -int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff, +int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff, double success_err_level, vector* _idx, bool element_wise_relative_error ) { + Mat arr = arr_, refarr = refarr_; CV_Assert( arr.type() == 
refarr.type() && arr.size == refarr.size ); + if( arr.depth() == CV_16F ) + { + Mat arr32f, refarr32f; + arr.convertTo(arr32f, CV_32F); + refarr.convertTo(refarr32f, CV_32F); + arr = arr32f; + refarr = refarr32f; + } int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0; int result = CMP_EPS_OK; diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp index 7bf60af716..c139a58180 100644 --- a/modules/ts/src/ts_perf.cpp +++ b/modules/ts/src/ts_perf.cpp @@ -594,11 +594,11 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra // exit if current test is already failed if(::testing::UnitTest::GetInstance()->current_test_info()->result()->Failed()) return *this; - if(!array.empty() && array.depth() == CV_USRTYPE1) + /*if(!array.empty() && array.depth() == CV_USRTYPE1) { ADD_FAILURE() << " Can not check regression for CV_USRTYPE1 data type for " << name; return *this; - } + }*/ std::string nodename = getCurrentTestNodeName(); @@ -2207,7 +2207,7 @@ void PrintTo(const MatType& t, ::std::ostream* os) case CV_32S: *os << "32S"; break; case CV_32F: *os << "32F"; break; case CV_64F: *os << "64F"; break; - case CV_USRTYPE1: *os << "USRTYPE1"; break; + case CV_USRTYPE1: *os << "16F"; break; default: *os << "INVALID_TYPE"; break; } *os << 'C' << CV_MAT_CN((int)t);