From 6d7f5871dbf9e1e9082c6a99f5358e413e844b80 Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Mon, 10 Sep 2018 16:56:29 +0300
Subject: [PATCH] added basic support for CV_16F (the new datatype etc.)
 (#12463)

* added basic support for CV_16F (the new datatype etc.). CV_USRTYPE1 is now equal to CV_16F, which may break some [rarely used] functionality. We'll see

* fixed a just-introduced bug in norm; reverted erroneous changes in Torch importer (need to find a better solution)

* addressed some issues found during the PR review

* restored the patch to fix some perf test failures
---
 modules/core/include/opencv2/core.hpp         |   1 +
 modules/core/include/opencv2/core/cvdef.h     |   9 +-
 modules/core/include/opencv2/core/hal/hal.hpp |   6 +
 .../core/include/opencv2/core/hal/interface.h |   7 +
 modules/core/include/opencv2/core/mat.hpp     |   2 +
 .../core/include/opencv2/core/saturate.hpp    |  16 ++
 modules/core/include/opencv2/core/traits.hpp  |  20 ++
 modules/core/src/array.cpp                    |   3 +
 modules/core/src/check.cpp                    |   6 +-
 modules/core/src/convert.cpp                  |  57 +++--
 modules/core/src/convert.hpp                  |  33 ++-
 modules/core/src/convert_scale.cpp            |  37 ++-
 modules/core/src/merge.cpp                    |   6 +-
 modules/core/src/norm.cpp                     |  74 ++++--
 modules/core/src/out.cpp                      |  14 +-
 modules/core/src/rand.cpp                     | 217 ++++--------------
 modules/core/src/split.cpp                    |   6 +-
 modules/core/test/ocl/test_matrix_expr.cpp    |   2 +-
 modules/core/test/test_arithm.cpp             |   4 +-
 modules/ts/include/opencv2/ts/ocl_test.hpp    |   1 +
 modules/ts/include/opencv2/ts/ts_perf.hpp     |   2 +-
 modules/ts/src/ts_func.cpp                    |  30 ++-
 modules/ts/src/ts_perf.cpp                    |   6 +-
 23 files changed, 281 insertions(+), 278 deletions(-)
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index e9f09e235b..1e271a6fd3 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -3009,6 +3009,7 @@ public:
 
     virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
 
+    virtual void set16fPrecision(int p = 4) = 0;
     virtual void set32fPrecision(int p = 8) = 0;
     virtual void set64fPrecision(int p = 16) = 0;
     virtual void setMultiline(bool ml = true) = 0;
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 5c8b9f9b5c..21fce842a8 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -317,13 +317,10 @@ Cv64suf;
 #define CV_IS_SUBMAT(flags)     ((flags) & CV_MAT_SUBMAT_FLAG)
 
 /** Size of each channel item,
-   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
-#define CV_ELEM_SIZE1(type) \
-    ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
 
-/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
-#define CV_ELEM_SIZE(type) \
-    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
 
 #ifndef MIN
 #  define MIN(a,b)  ((a) > (b) ? (b) : (a))
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 68900ec428..0d68078d98 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -195,6 +195,12 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
+CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
+CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+
+CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
+
 struct CV_EXPORTS DFT1D
 {
     static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index 8f640254c3..70de04d762 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -76,6 +76,7 @@ typedef signed char schar;
 #define CV_32F  5
 #define CV_64F  6
 #define CV_USRTYPE1 7
+#define CV_16F  7
 
 #define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
 #define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
@@ -124,6 +125,12 @@ typedef signed char schar;
 #define CV_64FC3 CV_MAKETYPE(CV_64F,3)
 #define CV_64FC4 CV_MAKETYPE(CV_64F,4)
 #define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+
+#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
 //! @}
 
 //! @name Comparison operation
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index d918ee9652..2efcf17b6c 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -296,8 +296,10 @@ public:
         DEPTH_MASK_32S = 1 << CV_32S,
         DEPTH_MASK_32F = 1 << CV_32F,
         DEPTH_MASK_64F = 1 << CV_64F,
+        DEPTH_MASK_16F = 1 << CV_16F,
         DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,
         DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,
+        DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,
         DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F
     };
 
diff --git a/modules/core/include/opencv2/core/saturate.hpp b/modules/core/include/opencv2/core/saturate.hpp
index 118599f8f9..ced5e66f43 100644
--- a/modules/core/include/opencv2/core/saturate.hpp
+++ b/modules/core/include/opencv2/core/saturate.hpp
@@ -158,6 +158,22 @@ template<> inline uint64 saturate_cast<uint64>(int64 v)      { return (uint64)st
 
 template<> inline int64 saturate_cast<int64>(uint64 v)       { return (int64)std::min(v, (uint64)LLONG_MAX); }
 
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }
+
+// in theory, we could use a LUT for 8u/8s->16f conversion,
+// but with hardware support for FP32->FP16 conversion the current approach is preferable
+template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
+template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
+
 //! @}
 
 } // cv
diff --git a/modules/core/include/opencv2/core/traits.hpp b/modules/core/include/opencv2/core/traits.hpp
index 6cb10f44cf..52ab083ca4 100644
--- a/modules/core/include/opencv2/core/traits.hpp
+++ b/modules/core/include/opencv2/core/traits.hpp
@@ -261,6 +261,20 @@ public:
          };
 };
 
+template<> class DataType<float16_t>
+{
+public:
+    typedef float16_t   value_type;
+    typedef float       work_type;
+    typedef value_type  channel_type;
+    typedef value_type  vec_type;
+    enum { generic_type = 0,
+           depth        = CV_16F,
+           channels     = 1,
+           fmt          = (int)'h',
+           type         = CV_MAKETYPE(depth, channels)
+         };
+};
 
 /** @brief A helper class for cv::DataType
 
@@ -330,6 +344,12 @@ template<> class TypeDepth<CV_64F>
     typedef double value_type;
 };
 
+template<> class TypeDepth<CV_16F>
+{
+    enum { depth = CV_16F };
+    typedef float16_t value_type;
+};
+
 #endif
 
 //! @}
diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp
index dde8b2606f..49b533b8e9 100644
--- a/modules/core/src/array.cpp
+++ b/modules/core/src/array.cpp
@@ -3262,6 +3262,9 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
     case CV_64F:
         scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
         break;
+    case CV_16F:
+        scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+        break;
     default:
         CV_Error(CV_StsUnsupportedFormat,"");
     }
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index 92a3b6006e..676f755d1d 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -43,15 +43,15 @@ static const char* getTestOpMath(unsigned testOp)
 
 const char* depthToString_(int depth)
 {
-    static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
-    return (depth <= CV_USRTYPE1 && depth >= 0) ? depthNames[depth] : NULL;
+    static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_16F" };
+    return (depth <= CV_16F && depth >= 0) ? depthNames[depth] : NULL;
 }
 
 const cv::String typeToString_(int type)
 {
     int depth = CV_MAT_DEPTH(type);
     int cn = CV_MAT_CN(type);
-    if (depth >= 0 && depth <= CV_USRTYPE1)
+    if (depth >= 0 && depth <= CV_16F)
         return cv::format("%sC%d", depthToString_(depth), cn);
     return cv::String();
 }
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index a54f4c1bcd..09a7d345d4 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -8,7 +8,7 @@
 
 namespace cv {
 
-/*namespace hal {
+namespace hal {
 
 void cvt16f32f( const float16_t* src, float* dst, int len )
 {
@@ -50,21 +50,21 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
         dst[j] = float16_t(src[j]);
 }
 
-/*void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
+void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len )
 {
     // the loop is simple enough, so we let the compiler to vectorize it
     for( int i = 0; i < len; i++ )
-        arr[i] = scaleBiasPairs[i*2 + 1];
+        arr[i] += scaleBiasPairs[i*2 + 1];
 }
 
 void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len )
 {
     // the loop is simple enough, so we let the compiler to vectorize it
     for( int i = 0; i < len; i++ )
-        arr[i] = scaleBiasPairs[i*2 + 1];
+        arr[i] += scaleBiasPairs[i*2 + 1];
 }
 
-}*/
+}
 
 template<typename _Ts, typename _Td, typename _Twvec> inline void
 cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
@@ -150,7 +150,7 @@ DEF_CVT_FUNC(8u16s, cvt_,  uchar, short,    v_int16)
 DEF_CVT_FUNC(8u32s, cvt_,  uchar, int,      v_int32)
 DEF_CVT_FUNC(8u32f, cvt_,  uchar, float,    v_float32)
 DEF_CVT_FUNC(8u64f, cvt_,  uchar, double,   v_int32)
-//DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
+DEF_CVT_FUNC(8u16f, cvt1_, uchar, float16_t, v_float32)
 
 ////////////////////// 8s -> ... ////////////////////////
 
@@ -160,7 +160,7 @@ DEF_CVT_FUNC(8s16s, cvt_,  schar, short,    v_int16)
 DEF_CVT_FUNC(8s32s, cvt_,  schar, int,      v_int32)
 DEF_CVT_FUNC(8s32f, cvt_,  schar, float,    v_float32)
 DEF_CVT_FUNC(8s64f, cvt_,  schar, double,   v_int32)
-//DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
+DEF_CVT_FUNC(8s16f, cvt1_, schar, float16_t, v_float32)
 
 ////////////////////// 16u -> ... ////////////////////////
 
@@ -170,7 +170,7 @@ DEF_CVT_FUNC(16u16s, cvt_, ushort, short,  v_int32)
 DEF_CVT_FUNC(16u32s, cvt_, ushort, int,    v_int32)
 DEF_CVT_FUNC(16u32f, cvt_, ushort, float,  v_float32)
 DEF_CVT_FUNC(16u64f, cvt_, ushort, double, v_int32)
-//DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
+DEF_CVT_FUNC(16u16f, cvt1_,ushort, float16_t, v_float32)
 
 ////////////////////// 16s -> ... ////////////////////////
 
@@ -180,7 +180,7 @@ DEF_CVT_FUNC(16s16u, cvt_, short, ushort, v_int32)
 DEF_CVT_FUNC(16s32s, cvt_, short, int,    v_int32)
 DEF_CVT_FUNC(16s32f, cvt_, short, float,  v_float32)
 DEF_CVT_FUNC(16s64f, cvt_, short, double, v_int32)
-//DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
+DEF_CVT_FUNC(16s16f, cvt1_,short, float16_t, v_float32)
 
 ////////////////////// 32s -> ... ////////////////////////
 
@@ -190,7 +190,7 @@ DEF_CVT_FUNC(32s16u, cvt_, int, ushort, v_int32)
 DEF_CVT_FUNC(32s16s, cvt_, int, short,  v_int32)
 DEF_CVT_FUNC(32s32f, cvt_, int, float,  v_float32)
 DEF_CVT_FUNC(32s64f, cvt_, int, double, v_int32)
-//DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
+DEF_CVT_FUNC(32s16f, cvt1_,int, float16_t, v_float32)
 
 ////////////////////// 32f -> ... ////////////////////////
 
@@ -210,17 +210,17 @@ DEF_CVT_FUNC(64f16u, cvt_, double, ushort, v_int32)
 DEF_CVT_FUNC(64f16s, cvt_, double, short,  v_int32)
 DEF_CVT_FUNC(64f32s, cvt_, double, int,    v_int32)
 DEF_CVT_FUNC(64f32f, cvt_, double, float,  v_float32)
-//DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
+DEF_CVT_FUNC(64f16f, cvt1_,double, float16_t, v_float32)
 
 ////////////////////// 16f -> ... ////////////////////////
 
-//DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
-//DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
-//DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
-//DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
-//DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
+DEF_CVT_FUNC(16f8u,  cvt_,  float16_t, uchar,  v_float32)
+DEF_CVT_FUNC(16f8s,  cvt_,  float16_t, schar,  v_float32)
+DEF_CVT_FUNC(16f16u, cvt1_, float16_t, ushort, v_float32)
+DEF_CVT_FUNC(16f16s, cvt1_, float16_t, short,  v_float32)
+DEF_CVT_FUNC(16f32s, cvt1_, float16_t, int,    v_float32)
 DEF_CVT_FUNC(16f32f, cvt1_, float16_t, float,  v_float32)
-//DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
+DEF_CVT_FUNC(16f64f, cvt1_, float16_t, double, v_float32)
 
 ///////////// "conversion" w/o conversion ///////////////
 
@@ -339,42 +339,41 @@ BinaryFunc getConvertFunc(int sdepth, int ddepth)
         {
             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0 //(BinaryFunc)(cvt16f8u)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f8u), (BinaryFunc)(cvt16f8u)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0 //(BinaryFunc)(cvt16f8s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f8s), (BinaryFunc)(cvt16f8s)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0 //(BinaryFunc)(cvt16f16u)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f16u), (BinaryFunc)(cvt16f16u)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0 //(BinaryFunc)(cvt16f16s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f16s), (BinaryFunc)(cvt16f16s)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
-            (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0 //(BinaryFunc)(cvt16f32s)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f32s), (BinaryFunc)(cvt16f32s)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
-            (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0 //(BinaryFunc)(cvt16f32f)
+            (BinaryFunc)GET_OPTIMIZED(cvt64f32f), (BinaryFunc)(cvt16f32f)
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
-            (BinaryFunc)(cvt64s), 0 //(BinaryFunc)(cvt16f64f)
+            (BinaryFunc)(cvt64s), (BinaryFunc)(cvt16f64f)
         },
         {
-            0, 0, 0, 0, 0, 0, 0, 0
-            //(BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
-            //(BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
+            (BinaryFunc)(cvt8u16f), (BinaryFunc)(cvt8s16f), (BinaryFunc)(cvt16u16f), (BinaryFunc)(cvt16s16f),
+            (BinaryFunc)(cvt32s16f), (BinaryFunc)(cvt32f16f), (BinaryFunc)(cvt64f16f), (BinaryFunc)(cvt16u)
         }
     };
     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
@@ -481,7 +480,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
         if(_dst.fixedType())
         {
             ddepth = _dst.depth();
-            CV_Assert(ddepth == CV_16S /*|| ddepth == CV_16F*/);
+            CV_Assert(ddepth == CV_16S || ddepth == CV_16F);
             CV_Assert(_dst.channels() == _src.channels());
         }
         else
@@ -489,7 +488,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst )
         func = (BinaryFunc)cvt32f16f;
         break;
     case CV_16S:
-    //case CV_16F:
+    case CV_16F:
         ddepth = CV_32F;
         func = (BinaryFunc)cvt16f32f;
         break;
diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp
index 0d0aa3a770..4b9ddbb413 100644
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@@ -150,12 +150,11 @@ static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
 { a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
 
-//static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
-//{
-//    a = vx_load_expand(ptr);
-//    b = vx_load_expand(ptr + v_float32::nlanes);
-//}
-
+static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
+{
+    a = vx_load_expand(ptr);
+    b = vx_load_expand(ptr + v_float32::nlanes);
+}
 
 static inline void v_store_pair_as(uchar* ptr, const v_uint16& a, const v_uint16& b)
 {
@@ -295,12 +294,12 @@ static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b
     b = vx_load(ptr + v_float64::nlanes);
 }
 
-//static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
-//{
-//    v_float32 v0 = vx_load_expand(ptr);
-//    a = v_cvt_f64(v0);
-//    b = v_cvt_f64_high(v0);
-//}
+static inline void vx_load_pair_as(const float16_t* ptr, v_float64& a, v_float64& b)
+{
+    v_float32 v0 = vx_load_expand(ptr);
+    a = v_cvt_f64(v0);
+    b = v_cvt_f64_high(v0);
+}
 
 static inline void v_store_as(double* ptr, const v_float32& a)
 {
@@ -349,11 +348,11 @@ static inline void v_store_pair_as(float* ptr, const v_float64& a, const v_float
     v_store(ptr, v);
 }
 
-//static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
-//{
-//    v_float32 v = v_cvt_f32(a, b);
-//    v_pack_store(ptr, v);
-//}
+static inline void v_store_pair_as(float16_t* ptr, const v_float64& a, const v_float64& b)
+{
+    v_float32 v = v_cvt_f32(a, b);
+    v_pack_store(ptr, v);
+}
 
 #else
 
diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp
index 0d4b5151a3..751f7fe626 100644
--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
@@ -222,7 +222,7 @@ DEF_CVT_SCALE_FUNC(16s8u,  cvt_32f, short,  uchar, float)
 DEF_CVT_SCALE_FUNC(32s8u,  cvt_32f, int,    uchar, float)
 DEF_CVT_SCALE_FUNC(32f8u,  cvt_32f, float,  uchar, float)
 DEF_CVT_SCALE_FUNC(64f8u,  cvt_32f, double, uchar, float)
-//DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
+DEF_CVT_SCALE_FUNC(16f8u,  cvt_32f, float16_t, uchar, float)
 
 DEF_CVT_SCALE_FUNC(8u8s,   cvt_32f, uchar,  schar, float)
 DEF_CVT_SCALE_FUNC(8s,     cvt_32f, schar,  schar, float)
@@ -231,7 +231,7 @@ DEF_CVT_SCALE_FUNC(16s8s,  cvt_32f, short,  schar, float)
 DEF_CVT_SCALE_FUNC(32s8s,  cvt_32f, int,    schar, float)
 DEF_CVT_SCALE_FUNC(32f8s,  cvt_32f, float,  schar, float)
 DEF_CVT_SCALE_FUNC(64f8s,  cvt_32f, double, schar, float)
-//DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
+DEF_CVT_SCALE_FUNC(16f8s,  cvt_32f, float16_t, schar, float)
 
 DEF_CVT_SCALE_FUNC(8u16u,  cvt_32f, uchar,  ushort, float)
 DEF_CVT_SCALE_FUNC(8s16u,  cvt_32f, schar,  ushort, float)
@@ -240,7 +240,7 @@ DEF_CVT_SCALE_FUNC(16s16u, cvt_32f, short,  ushort, float)
 DEF_CVT_SCALE_FUNC(32s16u, cvt_32f, int,    ushort, float)
 DEF_CVT_SCALE_FUNC(32f16u, cvt_32f, float,  ushort, float)
 DEF_CVT_SCALE_FUNC(64f16u, cvt_32f, double, ushort, float)
-//DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
+DEF_CVT_SCALE_FUNC(16f16u, cvt1_32f, float16_t, ushort, float)
 
 DEF_CVT_SCALE_FUNC(8u16s,  cvt_32f, uchar,  short, float)
 DEF_CVT_SCALE_FUNC(8s16s,  cvt_32f, schar,  short, float)
@@ -249,7 +249,7 @@ DEF_CVT_SCALE_FUNC(16s,    cvt_32f, short,  short, float)
 DEF_CVT_SCALE_FUNC(32s16s, cvt_32f, int,    short, float)
 DEF_CVT_SCALE_FUNC(32f16s, cvt_32f, float,  short, float)
 DEF_CVT_SCALE_FUNC(64f16s, cvt_32f, double, short, float)
-//DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
+DEF_CVT_SCALE_FUNC(16f16s, cvt1_32f, float16_t, short, float)
 
 DEF_CVT_SCALE_FUNC(8u32s,  cvt_32f, uchar,  int, float)
 DEF_CVT_SCALE_FUNC(8s32s,  cvt_32f, schar,  int, float)
@@ -258,7 +258,7 @@ DEF_CVT_SCALE_FUNC(16s32s, cvt_32f, short,  int, float)
 DEF_CVT_SCALE_FUNC(32s,    cvt_64f, int,    int, double)
 DEF_CVT_SCALE_FUNC(32f32s, cvt_32f, float,  int, float)
 DEF_CVT_SCALE_FUNC(64f32s, cvt_64f, double, int, double)
-//DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
+DEF_CVT_SCALE_FUNC(16f32s, cvt1_32f, float16_t, int, float)
 
 DEF_CVT_SCALE_FUNC(8u32f,  cvt_32f, uchar,  float, float)
 DEF_CVT_SCALE_FUNC(8s32f,  cvt_32f, schar,  float, float)
@@ -267,7 +267,7 @@ DEF_CVT_SCALE_FUNC(16s32f, cvt_32f, short,  float, float)
 DEF_CVT_SCALE_FUNC(32s32f, cvt_32f, int,    float, float)
 DEF_CVT_SCALE_FUNC(32f,    cvt_32f, float,  float, float)
 DEF_CVT_SCALE_FUNC(64f32f, cvt_64f, double, float, double)
-//DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
+DEF_CVT_SCALE_FUNC(16f32f, cvt1_32f, float16_t, float, float)
 
 DEF_CVT_SCALE_FUNC(8u64f,  cvt_64f, uchar,  double, double)
 DEF_CVT_SCALE_FUNC(8s64f,  cvt_64f, schar,  double, double)
@@ -276,16 +276,16 @@ DEF_CVT_SCALE_FUNC(16s64f, cvt_64f, short,  double, double)
 DEF_CVT_SCALE_FUNC(32s64f, cvt_64f, int,    double, double)
 DEF_CVT_SCALE_FUNC(32f64f, cvt_64f, float,  double, double)
 DEF_CVT_SCALE_FUNC(64f,    cvt_64f, double, double, double)
-//DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
+DEF_CVT_SCALE_FUNC(16f64f, cvt_64f, float16_t, double, double)
 
-/*DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
+DEF_CVT_SCALE_FUNC(8u16f,  cvt1_32f, uchar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(8s16f,  cvt1_32f, schar,  float16_t, float)
 DEF_CVT_SCALE_FUNC(16u16f, cvt1_32f, ushort, float16_t, float)
 DEF_CVT_SCALE_FUNC(16s16f, cvt1_32f, short,  float16_t, float)
 DEF_CVT_SCALE_FUNC(32s16f, cvt1_32f, int,    float16_t, float)
 DEF_CVT_SCALE_FUNC(32f16f, cvt1_32f, float,  float16_t, float)
 DEF_CVT_SCALE_FUNC(64f16f, cvt_64f,  double, float16_t, double)
-DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)*/
+DEF_CVT_SCALE_FUNC(16f,    cvt1_32f, float16_t, float16_t, float)
 
 static BinaryFunc getCvtScaleAbsFunc(int depth)
 {
@@ -306,43 +306,42 @@ BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
-            (BinaryFunc)cvtScale64f8u, 0 //(BinaryFunc)cvtScale16f8u
+            (BinaryFunc)cvtScale64f8u, (BinaryFunc)cvtScale16f8u
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
-            (BinaryFunc)cvtScale64f8s, 0 //(BinaryFunc)cvtScale16f8s
+            (BinaryFunc)cvtScale64f8s, (BinaryFunc)cvtScale16f8s
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
-            (BinaryFunc)cvtScale64f16u, 0 //(BinaryFunc)cvtScale16f16u
+            (BinaryFunc)cvtScale64f16u, (BinaryFunc)cvtScale16f16u
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
-            (BinaryFunc)cvtScale64f16s, 0 //(BinaryFunc)cvtScale16f16s
+            (BinaryFunc)cvtScale64f16s, (BinaryFunc)cvtScale16f16s
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
-            (BinaryFunc)cvtScale64f32s, 0 //(BinaryFunc)cvtScale16f32s
+            (BinaryFunc)cvtScale64f32s, (BinaryFunc)cvtScale16f32s
         },
         {
             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
-            (BinaryFunc)cvtScale64f32f, 0 //(BinaryFunc)cvtScale16f32f
+            (BinaryFunc)cvtScale64f32f, (BinaryFunc)cvtScale16f32f
         },
         {
             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
-            (BinaryFunc)cvtScale64f, 0 //(BinaryFunc)cvtScale16f64f
+            (BinaryFunc)cvtScale64f, (BinaryFunc)cvtScale16f64f
         },
         {
-            0, 0, 0, 0, 0, 0, 0, 0
-            /*(BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
+            (BinaryFunc)cvtScale8u16f, (BinaryFunc)cvtScale8s16f, (BinaryFunc)cvtScale16u16f,
             (BinaryFunc)cvtScale16s16f, (BinaryFunc)cvtScale32s16f, (BinaryFunc)cvtScale32f16f,
-            (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f*/
+            (BinaryFunc)cvtScale64f16f, (BinaryFunc)cvtScale16f
         },
     };
 
diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp
index 300a718506..5f4eaf8f00 100644
--- a/modules/core/src/merge.cpp
+++ b/modules/core/src/merge.cpp
@@ -216,8 +216,10 @@ static MergeFunc getMergeFunc(int depth)
 {
     static MergeFunc mergeTab[] =
     {
-        (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
-        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s),
+        (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u)
     };
 
     return mergeTab[depth];
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index f2171a907a..dc49937412 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -723,7 +723,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
         return result;
     }
 
-    NormFunc func = getNormFunc(normType >> 1, depth);
+    NormFunc func = getNormFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
     CV_Assert( func != 0 );
 
     const Mat* arrays[] = {&src, &mask, 0};
@@ -737,19 +737,31 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
     result;
     result.d = 0;
     NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+    int j, total = (int)it.size, blockSize = total;
+    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
     int isum = 0;
     int *ibuf = &result.i;
+    AutoBuffer<float> fltbuf_;
+    float* fltbuf = 0;
     size_t esz = 0;
 
     if( blockSum )
     {
-        intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-        blockSize = std::min(blockSize, intSumBlockSize);
-        ibuf = &isum;
         esz = src.elemSize();
+
+        if( depth == CV_16F )
+        {
+            blockSize = std::min(blockSize, 1024);
+            fltbuf_.allocate(blockSize*cn); // blockSize counts elements; each element holds cn values
+            fltbuf = fltbuf_.data();
+        }
+        else
+        {
+            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+            blockSize = std::min(blockSize, intSumBlockSize);
+            ibuf = &isum;
+        }
     }
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -757,13 +769,17 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
         for( j = 0; j < total; j += blockSize )
         {
             int bsz = std::min(total - j, blockSize);
-            func( ptrs[0], ptrs[1], (uchar*)ibuf, bsz, cn );
-            count += bsz;
-            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+            const uchar* data = ptrs[0];
+            if( depth == CV_16F )
+            {
+                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz*cn); // func reads bsz*cn floats, so convert all channels
+                data = (const uchar*)fltbuf;
+            }
+            func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
+            if( blockSum && depth != CV_16F )
             {
                 result.d += isum;
                 isum = 0;
-                count = 0;
             }
             ptrs[0] += bsz*esz;
             if( ptrs[1] )
@@ -1181,7 +1197,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
         return result;
     }
 
-    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
+    NormDiffFunc func = getNormDiffFunc(normType >> 1, depth == CV_16F ? CV_32F : depth);
     CV_Assert( func != 0 );
 
     const Mat* arrays[] = {&src1, &src2, &mask, 0};
@@ -1196,19 +1212,31 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
     result;
     result.d = 0;
     NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total, intSumBlockSize = 0, count = 0;
-    bool blockSum = (normType == NORM_L1 && depth <= CV_16S) ||
+    int j, total = (int)it.size, blockSize = total;
+    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
     unsigned isum = 0;
     unsigned *ibuf = &result.u;
+    AutoBuffer<float> fltbuf_;
+    float* fltbuf = 0;
     size_t esz = 0;
 
     if( blockSum )
     {
-        intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
-        blockSize = std::min(blockSize, intSumBlockSize);
-        ibuf = &isum;
         esz = src1.elemSize();
+
+        if( depth == CV_16F )
+        {
+            blockSize = std::min(blockSize, 1024);
+            fltbuf_.allocate(blockSize*cn*2); // two operands, blockSize elements each, cn values per element
+            fltbuf = fltbuf_.data();
+        }
+        else
+        {
+            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+            blockSize = std::min(blockSize, intSumBlockSize);
+            ibuf = &isum;
+        }
     }
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -1216,13 +1244,19 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
         for( j = 0; j < total; j += blockSize )
         {
             int bsz = std::min(total - j, blockSize);
-            func( ptrs[0], ptrs[1], ptrs[2], (uchar*)ibuf, bsz, cn );
-            count += bsz;
-            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
+            const uchar *data0 = ptrs[0], *data1 = ptrs[1];
+            if( depth == CV_16F )
+            {
+                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz*cn); // func reads bsz*cn floats per operand
+                hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz*cn, bsz*cn); // second operand follows the first
+                data0 = (const uchar*)fltbuf;
+                data1 = (const uchar*)(fltbuf + bsz*cn);
+            }
+            func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
+            if( blockSum && depth != CV_16F )
             {
                 result.d += isum;
                 isum = 0;
-                count = 0;
             }
             ptrs[0] += bsz*esz;
             ptrs[1] += bsz*esz;
diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp
index 9f28654358..89770fe33f 100644
--- a/modules/core/src/out.cpp
+++ b/modules/core/src/out.cpp
@@ -77,6 +77,7 @@ namespace cv
         void valueToStr32s() { sprintf(buf, "%d", mtx.ptr<int>(row, col)[cn]); }
         void valueToStr32f() { sprintf(buf, floatFormat, mtx.ptr<float>(row, col)[cn]); }
         void valueToStr64f() { sprintf(buf, floatFormat, mtx.ptr<double>(row, col)[cn]); }
+        void valueToStr16f() { sprintf(buf, floatFormat, (float)mtx.ptr<float16_t>(row, col)[cn]); }
         void valueToStrOther() { buf[0] = 0; }
 
     public:
@@ -115,7 +116,8 @@ namespace cv
                 case CV_32S: valueToStr = &FormattedImpl::valueToStr32s; break;
                 case CV_32F: valueToStr = &FormattedImpl::valueToStr32f; break;
                 case CV_64F: valueToStr = &FormattedImpl::valueToStr64f; break;
-                default:     valueToStr = &FormattedImpl::valueToStrOther; break;
+                default:     CV_Assert(mtx.depth() == CV_16F);
+                             valueToStr = &FormattedImpl::valueToStr16f;
             }
         }
 
@@ -256,7 +258,12 @@ namespace cv
     class FormatterBase : public Formatter
     {
     public:
-        FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
+        FormatterBase() : prec16f(4), prec32f(8), prec64f(16), multiline(true) {}
+
+        void set16fPrecision(int p) CV_OVERRIDE
+        {
+            prec16f = p;
+        }
 
         void set32fPrecision(int p) CV_OVERRIDE
         {
@@ -274,6 +281,7 @@ namespace cv
         }
 
     protected:
+        int prec16f;
         int prec32f;
         int prec64f;
         int multiline;
@@ -325,7 +333,7 @@ namespace cv
         {
             static const char* numpyTypes[] =
             {
-                "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
+                "uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "float16"
             };
             char braces[5] = {'[', ']', ',', '[', ']'};
             if (mtx.cols == 1)
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp
index e791fd131b..ea45ec4ea0 100644
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -48,18 +48,6 @@
 
 #include "precomp.hpp"
 
-#if defined _WIN32 || defined WINCE
-    #include <windows.h>
-    #undef small
-    #undef min
-    #undef max
-    #undef abs
-#endif
-
-#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-    #include "emmintrin.h"
-#endif
-
 namespace cv
 {
 
@@ -74,12 +62,6 @@ namespace cv
 
 #define  RNG_NEXT(x)    ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))
 
-#ifdef __PPC64__
-    #define PPC_MUL_ADD(ret, tmp, p0, p1)                           \
-    asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret)  \
-                : "f" (tmp), "f" (p0), "f" (p1))
-#endif
-
 /***************************************************************************************\
 *                           Pseudo-Random Number Generators (PRNGs)                     *
 \***************************************************************************************/
@@ -154,59 +136,26 @@ template<typename T> static void
 randi_( T* arr, int len, uint64* state, const DivStruct* p )
 {
     uint64 temp = *state;
-    int i = 0;
-    unsigned t0, t1, v0, v1;
-
-    for( i = 0; i <= len - 4; i += 4 )
+    for( int i = 0; i < len; i++ )
     {
         temp = RNG_NEXT(temp);
-        t0 = (unsigned)temp;
-        temp = RNG_NEXT(temp);
-        t1 = (unsigned)temp;
-        v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-        v1 = (unsigned)(((uint64)t1 * p[i+1].M) >> 32);
-        v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-        v1 = (v1 + ((t1 - v1) >> p[i+1].sh1)) >> p[i+1].sh2;
-        v0 = t0 - v0*p[i].d + p[i].delta;
-        v1 = t1 - v1*p[i+1].d + p[i+1].delta;
-        arr[i] = saturate_cast<T>((int)v0);
-        arr[i+1] = saturate_cast<T>((int)v1);
-
-        temp = RNG_NEXT(temp);
-        t0 = (unsigned)temp;
-        temp = RNG_NEXT(temp);
-        t1 = (unsigned)temp;
-        v0 = (unsigned)(((uint64)t0 * p[i+2].M) >> 32);
-        v1 = (unsigned)(((uint64)t1 * p[i+3].M) >> 32);
-        v0 = (v0 + ((t0 - v0) >> p[i+2].sh1)) >> p[i+2].sh2;
-        v1 = (v1 + ((t1 - v1) >> p[i+3].sh1)) >> p[i+3].sh2;
-        v0 = t0 - v0*p[i+2].d + p[i+2].delta;
-        v1 = t1 - v1*p[i+3].d + p[i+3].delta;
-        arr[i+2] = saturate_cast<T>((int)v0);
-        arr[i+3] = saturate_cast<T>((int)v1);
+        unsigned t = (unsigned)temp;
+        unsigned v = (unsigned)(((uint64)t * p[i].M) >> 32);
+        v = (v + ((t - v) >> p[i].sh1)) >> p[i].sh2;
+        v = t - v*p[i].d + p[i].delta;
+        arr[i] = saturate_cast<T>((int)v);
     }
-
-    for( ; i < len; i++ )
-    {
-        temp = RNG_NEXT(temp);
-        t0 = (unsigned)temp;
-        v0 = (unsigned)(((uint64)t0 * p[i].M) >> 32);
-        v0 = (v0 + ((t0 - v0) >> p[i].sh1)) >> p[i].sh2;
-        v0 = t0 - v0*p[i].d + p[i].delta;
-        arr[i] = saturate_cast<T>((int)v0);
-    }
-
     *state = temp;
 }
 
 
 #define DEF_RANDI_FUNC(suffix, type) \
 static void randBits_##suffix(type* arr, int len, uint64* state, \
-                              const Vec2i* p, bool small_flag) \
+                              const Vec2i* p, void*, bool small_flag) \
 { randBits_(arr, len, state, p, small_flag); } \
 \
 static void randi_##suffix(type* arr, int len, uint64* state, \
-                           const DivStruct* p, bool ) \
+                           const DivStruct* p, void*, bool ) \
 { randi_(arr, len, state, p); }
 
 DEF_RANDI_FUNC(8u, uchar)
@@ -215,131 +164,62 @@ DEF_RANDI_FUNC(16u, ushort)
 DEF_RANDI_FUNC(16s, short)
 DEF_RANDI_FUNC(32s, int)
 
-static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool )
+static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, void*, bool )
 {
     uint64 temp = *state;
-    int i = 0;
-
-    for( ; i <= len - 4; i += 4 )
+    for( int i = 0; i < len; i++ )
     {
-        float f[4];
-        f[0] = (float)(int)(temp = RNG_NEXT(temp));
-        f[1] = (float)(int)(temp = RNG_NEXT(temp));
-        f[2] = (float)(int)(temp = RNG_NEXT(temp));
-        f[3] = (float)(int)(temp = RNG_NEXT(temp));
-
-        // handwritten SSE is required not for performance but for numerical stability!
-        // both 32-bit gcc and MSVC compilers trend to generate double precision SSE
-        // while 64-bit compilers generate single precision SIMD instructions
-        // so manual vectorisation forces all compilers to the single precision
-#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-        __m128 q0 = _mm_loadu_ps((const float*)(p + i));
-        __m128 q1 = _mm_loadu_ps((const float*)(p + i + 2));
-
-        __m128 q01l = _mm_unpacklo_ps(q0, q1);
-        __m128 q01h = _mm_unpackhi_ps(q0, q1);
-
-        __m128 p0 = _mm_unpacklo_ps(q01l, q01h);
-        __m128 p1 = _mm_unpackhi_ps(q01l, q01h);
-
-        _mm_storeu_ps(arr + i, _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(f), p0), p1));
-#elif defined __ARM_NEON && defined __aarch64__
-        // handwritten NEON is required not for performance but for numerical stability!
-        // 64bit gcc tends to use fmadd instead of separate multiply and add
-        // use volatile to ensure to separate the multiply and add
-        float32x4x2_t q = vld2q_f32((const float*)(p + i));
-
-        float32x4_t p0 = q.val[0];
-        float32x4_t p1 = q.val[1];
-
-        volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
-        vst1q_f32(arr+i, vaddq_f32(v0, p1));
-#elif defined __PPC64__
-        // inline asm is required for numerical stability!
-        // compilers tends to use floating multiply-add single(fmadds)
-        // instead of separate multiply and add
-        PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
-        PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
-        PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
-        PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
-#else
-        arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
-        arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
-        arr[i+2] = f[2]*p[i+2][0] + p[i+2][1];
-        arr[i+3] = f[3]*p[i+3][0] + p[i+3][1];
-#endif
+        int t = (int)(temp = RNG_NEXT(temp));
+        arr[i] = (float)(t*p[i][0]);
     }
-
-    for( ; i < len; i++ )
-    {
-        temp = RNG_NEXT(temp);
-#if defined __SSE2__ || (defined _M_IX86_FP && 2 == _M_IX86_FP)
-        _mm_store_ss(arr + i, _mm_add_ss(
-                _mm_mul_ss(_mm_set_ss((float)(int)temp), _mm_set_ss(p[i][0])),
-                _mm_set_ss(p[i][1]))
-                );
-#elif defined __ARM_NEON && defined __aarch64__
-        float32x2_t t = vadd_f32(vmul_f32(
-                vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
-                vdup_n_f32(p[i][1]));
-        arr[i] = vget_lane_f32(t, 0);
-#elif defined __PPC64__
-        PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
-#else
-        arr[i] = (int)temp*p[i][0] + p[i][1];
-#endif
-    }
-
     *state = temp;
-}
 
+    // add bias separately to make the generated random numbers
+    // more deterministic, independent of
+    // architecture details (FMA instruction use etc.)
+    hal::addRNGBias32f(arr, &p[0][0], len);
+}
 
 static void
-randf_64f( double* arr, int len, uint64* state, const Vec2d* p, bool )
+randf_64f( double* arr, int len, uint64* state, const Vec2d* p, void*, bool )
 {
     uint64 temp = *state;
-    int64 v = 0;
-    int i;
-
-    for( i = 0; i <= len - 4; i += 4 )
-    {
-        double f0, f1;
-
-        temp = RNG_NEXT(temp);
-        v = (temp >> 32)|(temp << 32);
-        f0 = v*p[i][0] + p[i][1];
-        temp = RNG_NEXT(temp);
-        v = (temp >> 32)|(temp << 32);
-        f1 = v*p[i+1][0] + p[i+1][1];
-        arr[i] = f0; arr[i+1] = f1;
-
-        temp = RNG_NEXT(temp);
-        v = (temp >> 32)|(temp << 32);
-        f0 = v*p[i+2][0] + p[i+2][1];
-        temp = RNG_NEXT(temp);
-        v = (temp >> 32)|(temp << 32);
-        f1 = v*p[i+3][0] + p[i+3][1];
-        arr[i+2] = f0; arr[i+3] = f1;
-    }
-
-    for( ; i < len; i++ )
+    for( int i = 0; i < len; i++ )
     {
         temp = RNG_NEXT(temp);
-        v = (temp >> 32)|(temp << 32);
-        arr[i] = v*p[i][0] + p[i][1];
+        int64 v = (temp >> 32)|(temp << 32);
+        arr[i] = v*p[i][0];
     }
-
     *state = temp;
+
+    hal::addRNGBias64f(arr, &p[0][0], len);
 }
 
-typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, bool small_flag);
+static void randf_16f( float16_t* arr, int len, uint64* state, const Vec2f* p, float* fbuf, bool ) // fills arr[0..len) via the float scratch buffer fbuf (needs >= len floats)
+{
+    uint64 temp = *state; // work on a local copy of the generator state
+    for( int i = 0; i < len; i++ )
+    {
+        float f = (float)(int)(temp = RNG_NEXT(temp)); // advance the state; its low 32 bits, read as signed, are the raw sample
+        fbuf[i] = f*p[i][0]; // scale only; p[i][1] is presumably the bias applied by the pass below
+    }
+    *state = temp; // hand the advanced state back to the caller
+
+    // add bias separately to make the generated random numbers
+    // more deterministic, independent of
+    // architecture details (FMA instruction use etc.)
+    hal::addRNGBias32f(fbuf, &p[0][0], len); // receives p as a flat float array
+    hal::cvt32f16f(fbuf, arr, len); // narrow the float results into the half-precision output
+}
+
+typedef void (*RandFunc)(uchar* arr, int len, uint64* state, const void* p, void* tempbuf, bool small_flag);
 
 
 static RandFunc randTab[][8] =
 {
     {
         (RandFunc)randi_8u, (RandFunc)randi_8s, (RandFunc)randi_16u, (RandFunc)randi_16s,
-        (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, 0
+        (RandFunc)randi_32s, (RandFunc)randf_32f, (RandFunc)randf_64f, (RandFunc)randf_16f
     },
     {
         (RandFunc)randBits_8u, (RandFunc)randBits_8s, (RandFunc)randBits_16u, (RandFunc)randBits_16s,
@@ -350,7 +230,7 @@ static RandFunc randTab[][8] =
 /*
    The code below implements the algorithm described in
    "The Ziggurat Method for Generating Random Variables"
-   by Marsaglia and Tsang, Journal of Statistical Software.
+   by George Marsaglia and Wai Wan Tsang, Journal of Statistical Software, 2007.
 */
 static void
 randn_0_1_32f( float* arr, int len, uint64* state )
@@ -631,8 +511,8 @@ void RNG::fill( InputOutputArray _mat, int disttype,
             // for each channel i compute such dparam[0][i] & dparam[1][i],
             // so that a signed 32/64-bit integer X is transformed to
             // the range [param1.val[i], param2.val[i]) using
-            // dparam[1][i]*X + dparam[0][i]
-            if( depth == CV_32F )
+            // dparam[0][i]*X + dparam[1][i]
+            if( depth != CV_64F )
             {
                 fp = (Vec2f*)(parambuf + cn*2);
                 for( j = 0; j < cn; j++ )
@@ -704,6 +584,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
     AutoBuffer<double> buf;
     uchar* param = 0;
     float* nbuf = 0;
+    float* tmpbuf = 0;
 
     if( disttype == UNIFORM )
     {
@@ -727,12 +608,14 @@ void RNG::fill( InputOutputArray _mat, int disttype,
                         p[j + k] = ip[k];
             }
         }
-        else if( depth == CV_32F )
+        else if( depth != CV_64F )
         {
             Vec2f* p = (Vec2f*)param;
             for( j = 0; j < blockSize*cn; j += cn )
                 for( k = 0; k < cn; k++ )
                     p[j + k] = fp[k];
+            if( depth == CV_16F )
+                tmpbuf = (float*)p + blockSize*cn*2;
         }
         else
         {
@@ -755,7 +638,7 @@ void RNG::fill( InputOutputArray _mat, int disttype,
             int len = std::min(total - j, blockSize);
 
             if( disttype == CV_RAND_UNI )
-                func( ptr, len*cn, &state, param, smallFlag );
+                func( ptr, len*cn, &state, param, tmpbuf, smallFlag );
             else
             {
                 randn_0_1_32f(nbuf, len*cn, &state);
diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp
index 3fab6874b7..34d331a800 100644
--- a/modules/core/src/split.cpp
+++ b/modules/core/src/split.cpp
@@ -224,8 +224,10 @@ static SplitFunc getSplitFunc(int depth)
 {
     static SplitFunc splitTab[] =
     {
-        (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
-        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s),
+        (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u)
     };
 
     return splitTab[depth];
diff --git a/modules/core/test/ocl/test_matrix_expr.cpp b/modules/core/test/ocl/test_matrix_expr.cpp
index 11be5a3a36..7a5ff72cb2 100644
--- a/modules/core/test/ocl/test_matrix_expr.cpp
+++ b/modules/core/test/ocl/test_matrix_expr.cpp
@@ -78,7 +78,7 @@ OCL_TEST_P(UMatExpr, Ones)
 
 //////////////////////////////// Instantiation /////////////////////////////////////////////////
 
-OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS));
+OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS));
 
 } } // namespace opencv_test::ocl
 
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index c81f8d83e1..ccf68cbf90 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -476,7 +476,7 @@ struct CopyOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
     }
     double getMaxErr(int)
     {
@@ -498,7 +498,7 @@ struct SetOp : public BaseElemWiseOp
     }
     int getRandomType(RNG& rng)
     {
-        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL, 1, ARITHM_MAX_CHANNELS);
+        return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_ALL_16F, 1, ARITHM_MAX_CHANNELS);
     }
     double getMaxErr(int)
     {
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index 6126883091..11572e9f48 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -372,6 +372,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 #define OCL_ON(...) cv::ocl::setUseOpenCL(true); __VA_ARGS__ ;
 
 #define OCL_ALL_DEPTHS Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F)
+#define OCL_ALL_DEPTHS_16F Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
 #define OCL_ALL_CHANNELS Values(1, 2, 3, 4)
 
 CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA, INTER_LINEAR_EXACT)
diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp
index 83988c2b86..586d83fae6 100644
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -160,7 +160,7 @@ private:
     };                                                                                  \
     static inline void PrintTo(const class_name& t, std::ostream* os) { t.PrintTo(os); } }
 
-CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_USRTYPE1)
+CV_ENUM(MatDepth, CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F, CV_16F)
 
 /*****************************************************************************************\
 *                 Regression control utility for performance testing                      *
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 60c88a7e65..3b9e0198f2 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -72,10 +72,10 @@ int randomType(RNG& rng, int typeMask, int minChannels, int maxChannels)
 {
     int channels = rng.uniform(minChannels, maxChannels+1);
     int depth = 0;
-    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL) != 0);
+    CV_Assert((typeMask & _OutputArray::DEPTH_MASK_ALL_16F) != 0);
     for(;;)
     {
-        depth = rng.uniform(CV_8U, CV_64F+1);
+        depth = rng.uniform(CV_8U, CV_16F+1);
         if( ((1 << depth) & typeMask) != 0 )
             break;
     }
@@ -1260,6 +1260,13 @@ norm_(const _Tp* src1, const _Tp* src2, size_t total, int cn, int normType, doub
 double norm(InputArray _src, int normType, InputArray _mask)
 {
     Mat src = _src.getMat(), mask = _mask.getMat();
+    if( src.depth() == CV_16F )
+    {
+        Mat src32f;
+        src.convertTo(src32f, CV_32F);
+        return cvtest::norm(src32f, normType, _mask);
+    }
+
     if( normType == NORM_HAMMING || normType == NORM_HAMMING2 )
     {
         if( !mask.empty() )
@@ -1340,6 +1347,14 @@ double norm(InputArray _src, int normType, InputArray _mask)
 double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
 {
     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
+    if( src1.depth() == CV_16F )
+    {
+        Mat src1_32f, src2_32f;
+        src1.convertTo(src1_32f, CV_32F);
+        src2.convertTo(src2_32f, CV_32F);
+        return cvtest::norm(src1_32f, src2_32f, normType, _mask);
+    }
+
     bool isRelative = (normType & NORM_RELATIVE) != 0;
     normType &= ~NORM_RELATIVE;
 
@@ -1982,11 +1997,20 @@ int check( const Mat& a, double fmin, double fmax, vector<int>* _idx )
 // success_err_level is maximum allowed difference, idx is the index of the first
 // element for which difference is >success_err_level
 // (or index of element with the maximum difference)
-int cmpEps( const Mat& arr, const Mat& refarr, double* _realmaxdiff,
+int cmpEps( const Mat& arr_, const Mat& refarr_, double* _realmaxdiff,
             double success_err_level, vector<int>* _idx,
             bool element_wise_relative_error )
 {
+    Mat arr = arr_, refarr = refarr_;
     CV_Assert( arr.type() == refarr.type() && arr.size == refarr.size );
+    if( arr.depth() == CV_16F )
+    {
+        Mat arr32f, refarr32f;
+        arr.convertTo(arr32f, CV_32F);
+        refarr.convertTo(refarr32f, CV_32F);
+        arr = arr32f;
+        refarr = refarr32f;
+    }
 
     int ilevel = refarr.depth() <= CV_32S ? cvFloor(success_err_level) : 0;
     int result = CMP_EPS_OK;
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index 7bf60af716..c139a58180 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -594,11 +594,11 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
     // exit if current test is already failed
     if(::testing::UnitTest::GetInstance()->current_test_info()->result()->Failed()) return *this;
 
-    if(!array.empty() && array.depth() == CV_USRTYPE1)
+    /*if(!array.empty() && array.depth() == CV_USRTYPE1)
     {
         ADD_FAILURE() << "  Can not check regression for CV_USRTYPE1 data type for " << name;
         return *this;
-    }
+    }*/
 
     std::string nodename = getCurrentTestNodeName();
 
@@ -2207,7 +2207,7 @@ void PrintTo(const MatType& t, ::std::ostream* os)
         case CV_32S: *os << "32S"; break;
         case CV_32F: *os << "32F"; break;
         case CV_64F: *os << "64F"; break;
-        case CV_USRTYPE1: *os << "USRTYPE1"; break;
+        case CV_16F: *os << "16F"; break; // same depth value formerly named CV_USRTYPE1
         default: *os << "INVALID_TYPE"; break;
     }
     *os << 'C' << CV_MAT_CN((int)t);