From cc22a73d0fb88925617245bd03ed71b9bf358e9c Mon Sep 17 00:00:00 2001
From: Rachel A <aldridge.r.a@gmail.com>
Date: Mon, 1 Mar 2021 10:57:22 -0800
Subject: [PATCH 01/10] EXR alpha support for 4 channel reading and writing. 
 Issue https://github.com/opencv/opencv/issues/16115.

---
 modules/imgcodecs/src/grfmt_exr.cpp      | 180 ++++++++++++++---------
 modules/imgcodecs/src/grfmt_exr.hpp      |   4 +-
 modules/imgcodecs/test/test_exr.impl.hpp | 158 +++++++++++++++++++-
 3 files changed, 268 insertions(+), 74 deletions(-)
diff --git a/modules/imgcodecs/src/grfmt_exr.cpp b/modules/imgcodecs/src/grfmt_exr.cpp
index 1eceb4f5cd..9667b8ca03 100644
--- a/modules/imgcodecs/src/grfmt_exr.cpp
+++ b/modules/imgcodecs/src/grfmt_exr.cpp
@@ -84,12 +84,13 @@ ExrDecoder::ExrDecoder()
 {
     m_signature = "\x76\x2f\x31\x01";
     m_file = 0;
-    m_red = m_green = m_blue = 0;
+    m_red = m_green = m_blue = m_alpha = 0;
     m_type = ((Imf::PixelType)0);
     m_iscolor = false;
     m_bit_depth = 0;
     m_isfloat = false;
     m_ischroma = false;
+    m_hasalpha = false;
     m_native_depth = false;
 
 }
@@ -113,7 +114,7 @@ void  ExrDecoder::close()
 
 int  ExrDecoder::type() const
 {
-    return CV_MAKETYPE((m_isfloat ? CV_32F : CV_32S), m_iscolor ? 3 : 1);
+    return CV_MAKETYPE((m_isfloat ? CV_32F : CV_32S), ((m_iscolor && m_hasalpha) ? 4 : m_iscolor ? 3 : m_hasalpha ? 2 : 1));
 }
 
 
@@ -141,6 +142,11 @@ bool  ExrDecoder::readHeader()
     m_red = channels.findChannel( "R" );
     m_green = channels.findChannel( "G" );
     m_blue = channels.findChannel( "B" );
+    m_alpha = channels.findChannel( "A" );
+
+    if( m_alpha ) // alpha channel supported in RGB, Y, and YC scenarios
+        m_hasalpha = true;
+
     if( m_red || m_green || m_blue )
     {
         m_iscolor = true;
@@ -178,7 +184,8 @@ bool  ExrDecoder::readHeader()
 bool  ExrDecoder::readData( Mat& img )
 {
     m_native_depth = CV_MAT_DEPTH(type()) == img.depth();
-    bool color = img.channels() > 1;
+    bool color = img.channels() > 2; // output mat has 3+ channels; Y or YA are the 1 and 2 channel scenario
+    bool alphasupported = ( img.channels() % 2 == 0 );  // even number of channels indicates alpha
     int channels = 0;
     uchar* data = img.ptr();
     size_t step = img.step;
@@ -187,18 +194,22 @@ bool  ExrDecoder::readData( Mat& img )
     bool rgbtogray = ( !m_ischroma && m_iscolor && !color );
     bool result = true;
     FrameBuffer frame;
-    int xsample[3] = {1, 1, 1};
+    const int defaultchannels = 3;
+    int xsample[defaultchannels] = {1, 1, 1};
     char *buffer;
-    size_t xstep = 0;
+    CV_Assert(m_type == FLOAT);
+    const size_t floatsize = sizeof(float);
+    size_t xstep = m_native_depth ? floatsize : 1; // 4 bytes if native depth (FLOAT), otherwise converting to 1 byte U8 depth
     size_t ystep = 0;
-
-    xstep = m_native_depth ? 4 : 1;
+    const int channelstoread = ( (m_iscolor && alphasupported) ? 4 :
+                                ( (m_iscolor && !m_ischroma) || color) ? 3 : alphasupported ? 2 : 1 ); // number of channels to read may exceed channels in output img
+    size_t xStride = floatsize * channelstoread;
 
     AutoBuffer<char> copy_buffer;
 
     if( !justcopy )
     {
-        copy_buffer.allocate(sizeof(float) * m_width * 3);
+        copy_buffer.allocate(floatsize * m_width * defaultchannels);
         buffer = copy_buffer.data();
         ystep = 0;
     }
@@ -215,49 +226,49 @@ bool  ExrDecoder::readData( Mat& img )
             if( m_blue )
             {
                 frame.insert( "BY", Slice( m_type,
-                                           buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep,
-                                           12, ystep, m_blue->xSampling, m_blue->ySampling, 0.0 ));
-                xsample[0] = m_blue->ySampling;
+                                           buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep,
+                                           xStride, ystep, m_blue->xSampling, m_blue->ySampling, 0.0 ));
+                xsample[0] = m_blue->xSampling;
             }
             else
             {
                 frame.insert( "BY", Slice( m_type,
-                                           buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep,
-                                           12, ystep, 1, 1, 0.0 ));
+                                           buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep,
+                                           xStride, ystep, 1, 1, 0.0 ));
             }
             if( m_green )
             {
                 frame.insert( "Y", Slice( m_type,
-                                          buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 4,
-                                          12, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
-                xsample[1] = m_green->ySampling;
+                                          buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + floatsize,
+                                          xStride, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
+                xsample[1] = m_green->xSampling;
             }
             else
             {
                 frame.insert( "Y", Slice( m_type,
-                                          buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 4,
-                                          12, ystep, 1, 1, 0.0 ));
+                                          buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + floatsize,
+                                          xStride, ystep, 1, 1, 0.0 ));
             }
             if( m_red )
             {
                 frame.insert( "RY", Slice( m_type,
-                                           buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 8,
-                                           12, ystep, m_red->xSampling, m_red->ySampling, 0.0 ));
-                xsample[2] = m_red->ySampling;
+                                           buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + (floatsize * 2),
+                                           xStride, ystep, m_red->xSampling, m_red->ySampling, 0.0 ));
+                xsample[2] = m_red->xSampling;
             }
             else
             {
                 frame.insert( "RY", Slice( m_type,
-                                           buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 8,
-                                           12, ystep, 1, 1, 0.0 ));
+                                           buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + (floatsize * 2),
+                                           xStride, ystep, 1, 1, 0.0 ));
             }
         }
         else
         {
             frame.insert( "Y", Slice( m_type,
-                            buffer - m_datawindow.min.x * 4 - m_datawindow.min.y * ystep,
-                            4, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
-            xsample[0] = m_green->ySampling;
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep,
+                            xStride, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
+            xsample[0] = m_green->xSampling;
         }
     }
     else
@@ -265,67 +276,85 @@ bool  ExrDecoder::readData( Mat& img )
         if( m_blue )
         {
             frame.insert( "B", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep,
-                            12, ystep, m_blue->xSampling, m_blue->ySampling, 0.0 ));
-            xsample[0] = m_blue->ySampling;
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep,
+                            xStride, ystep, m_blue->xSampling, m_blue->ySampling, 0.0 ));
+            xsample[0] = m_blue->xSampling;
         }
         else
         {
             frame.insert( "B", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep,
-                            12, ystep, 1, 1, 0.0 ));
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep,
+                            xStride, ystep, 1, 1, 0.0 ));
         }
         if( m_green )
         {
             frame.insert( "G", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 4,
-                            12, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
-            xsample[1] = m_green->ySampling;
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + floatsize,
+                            xStride, ystep, m_green->xSampling, m_green->ySampling, 0.0 ));
+            xsample[1] = m_green->xSampling;
         }
         else
         {
             frame.insert( "G", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 4,
-                            12, ystep, 1, 1, 0.0 ));
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + floatsize,
+                            xStride, ystep, 1, 1, 0.0 ));
         }
         if( m_red )
         {
             frame.insert( "R", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 8,
-                            12, ystep, m_red->xSampling, m_red->ySampling, 0.0 ));
-            xsample[2] = m_red->ySampling;
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + (floatsize * 2),
+                            xStride, ystep, m_red->xSampling, m_red->ySampling, 0.0 ));
+            xsample[2] = m_red->xSampling;
         }
         else
         {
             frame.insert( "R", Slice( m_type,
-                            buffer - m_datawindow.min.x * 12 - m_datawindow.min.y * ystep + 8,
-                            12, ystep, 1, 1, 0.0 ));
+                            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + (floatsize * 2),
+                            xStride, ystep, 1, 1, 0.0 ));
         }
     }
 
+    if( justcopy && m_hasalpha && alphasupported )
+    { // alpha preserved only in justcopy scenario where alpha is desired (alphasupported)
+      // and present in original file (m_hasalpha)
+        CV_Assert(channelstoread == img.channels());
+        int offset = (channelstoread - 1) * floatsize;
+        frame.insert( "A", Slice( m_type,
+            buffer - m_datawindow.min.x * xStride - m_datawindow.min.y * ystep + offset,
+            xStride, ystep, m_alpha->xSampling, m_alpha->ySampling, 0.0 ));
+    }
+
     for (FrameBuffer::Iterator it = frame.begin(); it != frame.end(); it++) {
         channels++;
     }
 
+    CV_Assert(channels == channelstoread);
+
+    if( (channels != channelstoread) || (!justcopy && channels > defaultchannels) )
+    { // safety checking what ought to be true here
+        close();
+        return false;
+    }
+
     m_file->setFrameBuffer( frame );
     if( justcopy )
     {
         m_file->readPixels( m_datawindow.min.y, m_datawindow.max.y );
 
-        if( color )
+        if( m_iscolor )
         {
             if( m_blue && (m_blue->xSampling != 1 || m_blue->ySampling != 1) )
-                UpSample( data, 3, step / xstep, xsample[0], m_blue->ySampling );
+                UpSample( data, channelstoread, step / xstep, m_blue->xSampling, m_blue->ySampling );
             if( m_green && (m_green->xSampling != 1 || m_green->ySampling != 1) )
-                UpSample( data + xstep, 3, step / xstep, xsample[1], m_green->ySampling );
+                UpSample( data + xstep, channelstoread, step / xstep, m_green->xSampling, m_green->ySampling );
             if( m_red && (m_red->xSampling != 1 || m_red->ySampling != 1) )
-                UpSample( data + 2 * xstep, 3, step / xstep, xsample[2], m_red->ySampling );
+                UpSample( data + 2 * xstep, channelstoread, step / xstep, m_red->xSampling, m_red->ySampling );
         }
         else if( m_green && (m_green->xSampling != 1 || m_green->ySampling != 1) )
-            UpSample( data, 1, step / xstep, xsample[0], m_green->ySampling );
+            UpSample( data, channelstoread, step / xstep, m_green->xSampling, m_green->ySampling );
 
         if( chromatorgb )
-            ChromaToBGR( (float *)data, m_height, step / xstep );
+            ChromaToBGR( (float *)data, m_height, channelstoread, step / xstep );
     }
     else
     {
@@ -347,7 +376,7 @@ bool  ExrDecoder::readData( Mat& img )
             else
             {
                 if( chromatorgb )
-                    ChromaToBGR( (float *)buffer, 1, step );
+                    ChromaToBGR( (float *)buffer, 1, defaultchannels, step );
 
                 if( m_type == FLOAT )
                 {
@@ -372,11 +401,11 @@ bool  ExrDecoder::readData( Mat& img )
         if( color )
         {
             if( m_blue && (m_blue->xSampling != 1 || m_blue->ySampling != 1) )
-                UpSampleY( data, 3, step / xstep, m_blue->ySampling );
+                UpSampleY( data, defaultchannels, step / xstep, m_blue->ySampling );
             if( m_green && (m_green->xSampling != 1 || m_green->ySampling != 1) )
-                UpSampleY( data + xstep, 3, step / xstep, m_green->ySampling );
+                UpSampleY( data + xstep, defaultchannels, step / xstep, m_green->ySampling );
             if( m_red && (m_red->xSampling != 1 || m_red->ySampling != 1) )
-                UpSampleY( data + 2 * xstep, 3, step / xstep, m_red->ySampling );
+                UpSampleY( data + 2 * xstep, defaultchannels, step / xstep, m_red->ySampling );
         }
         else if( m_green && (m_green->xSampling != 1 || m_green->ySampling != 1) )
             UpSampleY( data, 1, step / xstep, m_green->ySampling );
@@ -457,7 +486,7 @@ void  ExrDecoder::UpSampleY( uchar *data, int xstep, int ystep, int ysample )
 /**
 // algorithm from ImfRgbaYca.cpp
  */
-void  ExrDecoder::ChromaToBGR( float *data, int numlines, int step )
+void  ExrDecoder::ChromaToBGR( float *data, int numlines, int xstep, int ystep )
 {
     for( int y = 0; y < numlines; y++ )
     {
@@ -466,15 +495,15 @@ void  ExrDecoder::ChromaToBGR( float *data, int numlines, int step )
             double b, Y, r;
             if( m_type == FLOAT )
             {
-                b = data[y * step + x * 3];
-                Y = data[y * step + x * 3 + 1];
-                r = data[y * step + x * 3 + 2];
+                b = data[y * ystep + x * xstep];
+                Y = data[y * ystep + x * xstep + 1];
+                r = data[y * ystep + x * xstep + 2];
             }
             else
             {
-                b = ((unsigned *)data)[y * step + x * 3];
-                Y = ((unsigned *)data)[y * step + x * 3 + 1];
-                r = ((unsigned *)data)[y * step + x * 3 + 2];
+                b = ((unsigned *)data)[y * ystep + x * xstep];
+                Y = ((unsigned *)data)[y * ystep + x * xstep + 1];
+                r = ((unsigned *)data)[y * ystep + x * xstep + 2];
             }
             r = (r + 1) * Y;
             b = (b + 1) * Y;
@@ -482,18 +511,18 @@ void  ExrDecoder::ChromaToBGR( float *data, int numlines, int step )
 
             if( m_type == FLOAT )
             {
-                data[y * step + x * 3] = (float)b;
-                data[y * step + x * 3 + 1] = (float)Y;
-                data[y * step + x * 3 + 2] = (float)r;
+                data[y * ystep + x * xstep] = (float)b;
+                data[y * ystep + x * xstep + 1] = (float)Y;
+                data[y * ystep + x * xstep + 2] = (float)r;
             }
             else
             {
                 int t = cvRound(b);
-                ((unsigned *)data)[y * step + x * 3 + 0] = (unsigned)MAX(t, 0);
+                ((unsigned *)data)[y * ystep + x * xstep + 0] = (unsigned)MAX(t, 0);
                 t = cvRound(Y);
-                ((unsigned *)data)[y * step + x * 3 + 1] = (unsigned)MAX(t, 0);
+                ((unsigned *)data)[y * ystep + x * xstep + 1] = (unsigned)MAX(t, 0);
                 t = cvRound(r);
-                ((unsigned *)data)[y * step + x * 3 + 2] = (unsigned)MAX(t, 0);
+                ((unsigned *)data)[y * ystep + x * xstep + 2] = (unsigned)MAX(t, 0);
             }
         }
     }
@@ -571,7 +600,6 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
     int depth = img.depth();
     CV_Assert( depth == CV_32F );
     int channels = img.channels();
-    CV_Assert( channels == 3 || channels == 1 );
     bool result = false;
     Header header( width, height );
     Imf::PixelType type = FLOAT;
@@ -594,7 +622,7 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
         }
     }
 
-    if( channels == 3 )
+    if( channels == 3 || channels == 4 )
     {
         header.channels().insert( "R", Channel( type ) );
         header.channels().insert( "G", Channel( type ) );
@@ -607,6 +635,11 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
         //printf("gray\n");
     }
 
+    if( channels % 2 == 0 )
+    { // even number of channels indicates Alpha
+        header.channels().insert( "A", Channel( type ) );
+    }
+
     OutputFile file( m_filename.c_str(), header );
 
     FrameBuffer frame;
@@ -629,14 +662,19 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
         size = 4;
     }
 
-    if( channels == 3 )
+    if( channels == 3 || channels == 4 )
     {
-        frame.insert( "B", Slice( type, buffer, size * 3, bufferstep ));
-        frame.insert( "G", Slice( type, buffer + size, size * 3, bufferstep ));
-        frame.insert( "R", Slice( type, buffer + size * 2, size * 3, bufferstep ));
+        frame.insert( "B", Slice( type, buffer, size * channels, bufferstep ));
+        frame.insert( "G", Slice( type, buffer + size, size * channels, bufferstep ));
+        frame.insert( "R", Slice( type, buffer + size * 2, size * channels, bufferstep ));
     }
     else
-        frame.insert( "Y", Slice( type, buffer, size, bufferstep ));
+        frame.insert( "Y", Slice( type, buffer, size * channels, bufferstep ));
+
+    if( channels % 2 == 0 )
+    { // even channel count indicates Alpha channel
+        frame.insert( "A", Slice( type, buffer + size * (channels - 1), size * channels, bufferstep ));
+    }
 
     file.setFrameBuffer( frame );
 
diff --git a/modules/imgcodecs/src/grfmt_exr.hpp b/modules/imgcodecs/src/grfmt_exr.hpp
index ec08028e22..99acd775c2 100644
--- a/modules/imgcodecs/src/grfmt_exr.hpp
+++ b/modules/imgcodecs/src/grfmt_exr.hpp
@@ -81,7 +81,7 @@ protected:
     void  UpSample( uchar *data, int xstep, int ystep, int xsample, int ysample );
     void  UpSampleX( float *data, int xstep, int xsample );
     void  UpSampleY( uchar *data, int xstep, int ystep, int ysample );
-    void  ChromaToBGR( float *data, int numlines, int step );
+    void  ChromaToBGR( float *data, int numlines, int xstep, int ystep );
     void  RGBToGray( float *in, float *out );
 
     InputFile      *m_file;
@@ -91,11 +91,13 @@ protected:
     const Channel  *m_red;
     const Channel  *m_green;
     const Channel  *m_blue;
+    const Channel  *m_alpha;
     Chromaticities  m_chroma;
     int             m_bit_depth;
     bool            m_native_depth;
     bool            m_iscolor;
     bool            m_isfloat;
+    bool            m_hasalpha;
 
 private:
     ExrDecoder(const ExrDecoder &); // copy disabled
diff --git a/modules/imgcodecs/test/test_exr.impl.hpp b/modules/imgcodecs/test/test_exr.impl.hpp
index 1f78a8f38f..ae5af53c78 100644
--- a/modules/imgcodecs/test/test_exr.impl.hpp
+++ b/modules/imgcodecs/test/test_exr.impl.hpp
@@ -7,7 +7,7 @@
 namespace opencv_test { namespace {
 
 TEST(Imgcodecs_EXR, readWrite_32FC1)
-{
+{ // Y channels
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test32FC1.exr";
     const string filenameOutput = cv::tempfile(".exr");
@@ -31,7 +31,7 @@ TEST(Imgcodecs_EXR, readWrite_32FC1)
 }
 
 TEST(Imgcodecs_EXR, readWrite_32FC3)
-{
+{ // RGB channels
     const string root = cvtest::TS::ptr()->get_data_path();
     const string filenameInput = root + "readwrite/test32FC3.exr";
     const string filenameOutput = cv::tempfile(".exr");
@@ -113,5 +113,159 @@ TEST(Imgcodecs_EXR, readWrite_32FC3_half)
     EXPECT_EQ(0, remove(filenameOutput.c_str()));
 }
 
+// Note: YC to GRAYSCALE (IMREAD_GRAYSCALE | IMREAD_ANYDEPTH)
+// outputs a black image,
+// as does Y to RGB (IMREAD_COLOR | IMREAD_ANYDEPTH).
+// This behavior predates the EXR alpha support added for issue
+// 16115.
+
+TEST(Imgcodecs_EXR, read_YA_ignore_alpha)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YA.exr";
+
+    const Mat img = cv::imread(filenameInput, IMREAD_GRAYSCALE | IMREAD_ANYDEPTH);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC1, img.type());
+
+    // Writing Y covered by test 32FC1
+}
+
+TEST(Imgcodecs_EXR, read_YA_unchanged)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YA.exr";
+
+    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC2, img.type());
+
+    // Cannot test writing, 2 channel writing not supported by loadsave
+}
+
+TEST(Imgcodecs_EXR, read_YC_changeDepth)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YRYBY.exr";
+
+    const Mat img = cv::imread(filenameInput, IMREAD_COLOR);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_8UC3, img.type());
+
+    // Cannot test writing, EXR encoder doesn't support 8U depth
+}
+
+TEST(Imgcodecs_EXR, readwrite_YCA_ignore_alpha)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YRYBYA.exr";
+    const string filenameOutput = cv::tempfile(".exr");
+
+    const Mat img = cv::imread(filenameInput, IMREAD_COLOR | IMREAD_ANYDEPTH);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC3, img.type());
+
+    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
+    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    ASSERT_EQ(img2.type(), img.type());
+    ASSERT_EQ(img2.size(), img.size());
+    EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
+    EXPECT_EQ(0, remove(filenameOutput.c_str()));
+}
+
+TEST(Imgcodecs_EXR, read_YC_unchanged)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YRYBY.exr";
+
+    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC3, img.type());
+
+    // Writing YC covered by test readwrite_YCA_ignore_alpha
+}
+
+TEST(Imgcodecs_EXR, readwrite_YCA_unchanged)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_YRYBYA.exr";
+    const string filenameOutput = cv::tempfile(".exr");
+
+    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC4, img.type());
+
+    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
+    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    ASSERT_EQ(img2.type(), img.type());
+    ASSERT_EQ(img2.size(), img.size());
+    EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
+    EXPECT_EQ(0, remove(filenameOutput.c_str()));
+}
+
+TEST(Imgcodecs_EXR, readwrite_RGBA_togreyscale)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_GeneratedRGBA.exr";
+    const string filenameOutput = cv::tempfile(".exr");
+
+    const Mat img = cv::imread(filenameInput, IMREAD_GRAYSCALE | IMREAD_ANYDEPTH);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC1, img.type());
+
+    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
+    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    ASSERT_EQ(img2.type(), img.type());
+    ASSERT_EQ(img2.size(), img.size());
+    EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
+    EXPECT_EQ(0, remove(filenameOutput.c_str()));
+}
+
+TEST(Imgcodecs_EXR, read_RGBA_ignore_alpha)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_GeneratedRGBA.exr";
+
+    const Mat img = cv::imread(filenameInput, IMREAD_COLOR | IMREAD_ANYDEPTH);
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC3, img.type());
+
+    // Writing RGB covered by test 32FC3
+}
+
+TEST(Imgcodecs_EXR, read_RGBA_unchanged)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filenameInput = root + "readwrite/test_GeneratedRGBA.exr";
+    const string filenameOutput = cv::tempfile(".exr");
+
+#ifndef GENERATE_DATA
+    const Mat img = cv::imread(filenameInput, IMREAD_UNCHANGED);
+#else
+    const Size sz(64, 32);
+    Mat img(sz, CV_32FC4, Scalar(0.5, 0.1, 1, 1));
+    img(Rect(10, 5, sz.width - 30, sz.height - 20)).setTo(Scalar(1, 0, 0, 1));
+    img(Rect(10, 20, sz.width - 30, sz.height - 20)).setTo(Scalar(1, 1, 0, 0));
+    ASSERT_TRUE(cv::imwrite(filenameInput, img));
+#endif
+
+    ASSERT_FALSE(img.empty());
+    ASSERT_EQ(CV_32FC4, img.type());
+
+    ASSERT_TRUE(cv::imwrite(filenameOutput, img));
+    const Mat img2 = cv::imread(filenameOutput, IMREAD_UNCHANGED);
+    ASSERT_EQ(img2.type(), img.type());
+    ASSERT_EQ(img2.size(), img.size());
+    EXPECT_LE(cvtest::norm(img, img2, NORM_INF | NORM_RELATIVE), 1e-3);
+    EXPECT_EQ(0, remove(filenameOutput.c_str()));
+}
 
 }} // namespace

From cbfd38bd41e91433b7a23348ae65d3adff2bc20b Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 23 Feb 2021 00:22:06 +0000
Subject: [PATCH 02/10] core: rework code locality

- to reduce binaries size of FFmpeg Windows wrapper
- MinGW linker doesn't support -ffunction-sections (used for FFmpeg Windows wrapper)
- move code to improve locality with its used dependencies
- move UMat::dot() to matmul.dispatch.cpp (Mat::dot() is already there)
- move UMat::inv() to lapack.cpp
- move UMat::mul() to arithm.cpp
- move UMat::eye() to matrix_operations.cpp (near setIdentity() implementation)
- move normalize(): convert_scale.cpp => norm.cpp
- move convertAndUnrollScalar(): arithm.cpp => copy.cpp
- move scalarToRawData(): array.cpp => copy.cpp
- move transpose(): matrix_operations.cpp => matrix_transform.cpp
- move flip(), rotate(): copy.cpp => matrix_transform.cpp (rotate90 uses flip and transpose)
- add 'OPENCV_CORE_EXCLUDE_C_API' CMake variable to exclude compilation of C-API functions from the core module
- matrix_wrap.cpp: add compile-time checks for CUDA/OpenGL calls
- the steps above reduce the FFmpeg wrapper size by ~1.5Mb (the initial size of the OpenCV part is about 3Mb)

backport is done to improve the merge experience (fewer conflicts)
backport of commit: 65eb9467567598c08049bb190a4f3d3cbfabdcd0
---
 modules/core/CMakeLists.txt                 |   4 +
 modules/core/src/arithm.cpp                 |  40 +-
 modules/core/src/array.cpp                  |  89 +--
 modules/core/src/convert_c.cpp              |   3 +
 modules/core/src/convert_scale.dispatch.cpp | 140 ----
 modules/core/src/copy.cpp                   | 557 ++------------
 modules/core/src/datastructs.cpp            |   3 +
 modules/core/src/dxt.cpp                    |   4 +
 modules/core/src/lapack.cpp                 |  25 +-
 modules/core/src/mathfuncs.cpp              |   7 +
 modules/core/src/matmul.dispatch.cpp        |  73 ++
 modules/core/src/matrix_c.cpp               |   4 +-
 modules/core/src/matrix_operations.cpp      | 296 +-------
 modules/core/src/matrix_transform.cpp       | 770 ++++++++++++++++++++
 modules/core/src/matrix_wrap.cpp            |  61 ++
 modules/core/src/norm.cpp                   | 174 ++++-
 modules/core/src/persistence_c.cpp          |  42 --
 modules/core/src/rand.cpp                   |   6 +
 modules/core/src/stat_c.cpp                 |   4 +
 modules/core/src/umatrix.cpp                |  94 ---
 20 files changed, 1251 insertions(+), 1145 deletions(-)
 create mode 100644 modules/core/src/matrix_transform.cpp

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 8da28d275f..a84d7fc3ad 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -112,6 +112,10 @@ ocv_target_link_libraries(${the_module} PRIVATE
     "${OPENCV_HAL_LINKER_LIBS}"
 )
 
+if(OPENCV_CORE_EXCLUDE_C_API)
+  ocv_target_compile_definitions(${the_module} PRIVATE "OPENCV_EXCLUDE_C_API=1")
+endif()
+
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
 
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 760bbcb088..41b281c8de 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -57,24 +57,6 @@ namespace cv
 *                                   logical operations                                   *
 \****************************************************************************************/
 
-void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
-{
-    int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
-    size_t esz = CV_ELEM_SIZE(buftype);
-    getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
-    // unroll the scalar
-    if( scn < cn )
-    {
-        CV_Assert( scn == 1 );
-        size_t esz1 = CV_ELEM_SIZE1(buftype);
-        for( size_t i = esz1; i < esz; i++ )
-            scbuf[i] = scbuf[i - esz1];
-    }
-    for( size_t i = esz; i < blocksize*esz; i++ )
-        scbuf[i] = scbuf[i - esz];
-}
-
-
 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
        OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
        OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
@@ -1041,9 +1023,7 @@ static BinaryFuncC* getRecipTab()
     return recipTab;
 }
 
-}
-
-void cv::multiply(InputArray src1, InputArray src2,
+void multiply(InputArray src1, InputArray src2,
                   OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
@@ -1052,7 +1032,7 @@ void cv::multiply(InputArray src1, InputArray src2,
               true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
 }
 
-void cv::divide(InputArray src1, InputArray src2,
+void divide(InputArray src1, InputArray src2,
                 OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
@@ -1060,7 +1040,7 @@ void cv::divide(InputArray src1, InputArray src2,
     arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
 }
 
-void cv::divide(double scale, InputArray src2,
+void divide(double scale, InputArray src2,
                 OutputArray dst, int dtype)
 {
     CV_INSTRUMENT_REGION();
@@ -1068,13 +1048,17 @@ void cv::divide(double scale, InputArray src2,
     arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
 }
 
+UMat UMat::mul(InputArray m, double scale) const
+{
+    UMat dst;
+    multiply(*this, m, dst, scale);
+    return dst;
+}
+
 /****************************************************************************************\
 *                                      addWeighted                                       *
 \****************************************************************************************/
 
-namespace cv
-{
-
 static BinaryFuncC* getAddWeightedTab()
 {
     static BinaryFuncC addWeightedTab[] =
@@ -1879,6 +1863,9 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
     }
 }
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 /****************************************************************************************\
 *                                Earlier API: cvAdd etc.                                 *
 \****************************************************************************************/
@@ -2141,4 +2128,5 @@ cvMaxS( const void* srcarr1, double value, void* dstarr )
     cv::max( src1, value, dst );
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 /* End of file. */
diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp
index f2a79b5a69..1a5ea0100f 100644
--- a/modules/core/src/array.cpp
+++ b/modules/core/src/array.cpp
@@ -48,6 +48,8 @@
 
 #include "precomp.hpp"
 
+#ifndef OPENCV_EXCLUDE_C_API
+
 #define  CV_ORIGIN_TL  0
 #define  CV_ORIGIN_BL  1
 
@@ -3223,51 +3225,50 @@ template<> void DefaultDeleter<CvMemStorage>::operator ()(CvMemStorage* obj) con
 template<> void DefaultDeleter<CvFileStorage>::operator ()(CvFileStorage* obj) const
 { cvReleaseFileStorage(&obj); }
 
-template <typename T> static inline
-void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to)
-{
-    int i = 0;
-    for(; i < cn; i++)
-        buf[i] = saturate_cast<T>(s.val[i]);
-    for(; i < unroll_to; i++)
-        buf[i] = buf[i-cn];
-}
-
-void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
-{
-    CV_INSTRUMENT_REGION();
-
-    const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    CV_Assert(cn <= 4);
-    switch(depth)
-    {
-    case CV_8U:
-        scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to);
-        break;
-    case CV_8S:
-        scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to);
-        break;
-    case CV_16U:
-        scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to);
-        break;
-    case CV_16S:
-        scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to);
-        break;
-    case CV_32S:
-        scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to);
-        break;
-    case CV_32F:
-        scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to);
-        break;
-    case CV_64F:
-        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
-        break;
-    default:
-        CV_Error(CV_StsUnsupportedFormat,"");
-    }
-}
-
 } // cv::
 
 
+/* universal functions */
+CV_IMPL void
+cvRelease( void** struct_ptr )
+{
+    CvTypeInfo* info;
+
+    if( !struct_ptr )
+        CV_Error( CV_StsNullPtr, "NULL double pointer" );
+
+    if( *struct_ptr )
+    {
+        info = cvTypeOf( *struct_ptr );
+        if( !info )
+            CV_Error( CV_StsError, "Unknown object type" );
+        if( !info->release )
+            CV_Error( CV_StsError, "release function pointer is NULL" );
+
+        info->release( struct_ptr );
+        *struct_ptr = 0;
+    }
+}
+
+
+void* cvClone( const void* struct_ptr )
+{
+    void* struct_copy = 0;
+    CvTypeInfo* info;
+
+    if( !struct_ptr )
+        CV_Error( CV_StsNullPtr, "NULL structure pointer" );
+
+    info = cvTypeOf( struct_ptr );
+    if( !info )
+        CV_Error( CV_StsError, "Unknown object type" );
+    if( !info->clone )
+        CV_Error( CV_StsError, "clone function pointer is NULL" );
+
+    struct_copy = info->clone( struct_ptr );
+    return struct_copy;
+}
+
+
+#endif  // OPENCV_EXCLUDE_C_API
 /* End of file. */
diff --git a/modules/core/src/convert_c.cpp b/modules/core/src/convert_c.cpp
index efe4de740a..96beffccc6 100644
--- a/modules/core/src/convert_c.cpp
+++ b/modules/core/src/convert_c.cpp
@@ -5,6 +5,7 @@
 
 #include "precomp.hpp"
 
+#ifndef OPENCV_EXCLUDE_C_API
 
 CV_IMPL void
 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
@@ -132,3 +133,5 @@ CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
 }
+
+#endif  // OPENCV_EXCLUDE_C_API
diff --git a/modules/core/src/convert_scale.dispatch.cpp b/modules/core/src/convert_scale.dispatch.cpp
index 83376aa61d..6902ecc24b 100644
--- a/modules/core/src/convert_scale.dispatch.cpp
+++ b/modules/core/src/convert_scale.dispatch.cpp
@@ -9,7 +9,6 @@
 #include "convert_scale.simd.hpp"
 #include "convert_scale.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
-
 namespace cv
 {
 
@@ -117,143 +116,4 @@ void convertScaleAbs(InputArray _src, OutputArray _dst, double alpha, double bet
     }
 }
 
-//==================================================================================================
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
-                           double scale, double delta )
-{
-    UMat src = _src.getUMat();
-
-    if( _mask.empty() )
-        src.convertTo( _dst, dtype, scale, delta );
-    else if (src.channels() <= 4)
-    {
-        const ocl::Device & dev = ocl::Device::getDefault();
-
-        int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
-                ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
-                rowsPerWI = dev.isIntel() ? 4 : 1;
-
-        float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
-        bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
-                haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
-                haveDelta = std::fabs(delta) > DBL_EPSILON,
-                doubleSupport = dev.doubleFPConfig() > 0;
-
-        if (!haveScale && !haveDelta && stype == dtype)
-        {
-            _src.copyTo(_dst, _mask);
-            return true;
-        }
-        if (haveZeroScale)
-        {
-            _dst.setTo(Scalar(delta), _mask);
-            return true;
-        }
-
-        if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
-            return false;
-
-        char cvt[2][40];
-        String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
-                             " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
-                             ocl::typeToStr(stype), ocl::typeToStr(dtype),
-                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
-                             rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
-                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
-                             doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                             haveScale ? " -D HAVE_SCALE" : "",
-                             haveDelta ? " -D HAVE_DELTA" : "",
-                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
-
-        ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
-        if (k.empty())
-            return false;
-
-        UMat mask = _mask.getUMat(), dst = _dst.getUMat();
-
-        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
-                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
-                dstarg = ocl::KernelArg::ReadWrite(dst);
-
-        if (haveScale)
-        {
-            if (haveDelta)
-                k.args(srcarg, maskarg, dstarg, fscale, fdelta);
-            else
-                k.args(srcarg, maskarg, dstarg, fscale);
-        }
-        else
-        {
-            if (haveDelta)
-                k.args(srcarg, maskarg, dstarg, fdelta);
-            else
-                k.args(srcarg, maskarg, dstarg);
-        }
-
-        size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
-        return k.run(2, globalsize, NULL, false);
-    }
-    else
-    {
-        UMat temp;
-        src.convertTo( temp, dtype, scale, delta );
-        temp.copyTo( _dst, _mask );
-    }
-
-    return true;
-}
-
-#endif
-
-void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
-               int norm_type, int rtype, InputArray _mask)
-{
-    CV_INSTRUMENT_REGION();
-
-    double scale = 1, shift = 0;
-    int type = _src.type(), depth = CV_MAT_DEPTH(type);
-
-    if( rtype < 0 )
-        rtype = _dst.fixedType() ? _dst.depth() : depth;
-
-    if( norm_type == CV_MINMAX )
-    {
-        double smin = 0, smax = 0;
-        double dmin = MIN( a, b ), dmax = MAX( a, b );
-        minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
-        scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
-        if( rtype == CV_32F )
-        {
-            scale = (float)scale;
-            shift = (float)dmin - (float)(smin*scale);
-        }
-        else
-            shift = dmin - smin*scale;
-    }
-    else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
-    {
-        scale = norm( _src, norm_type, _mask );
-        scale = scale > DBL_EPSILON ? a/scale : 0.;
-        shift = 0;
-    }
-    else
-        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
-
-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
-
-    Mat src = _src.getMat();
-    if( _mask.empty() )
-        src.convertTo( _dst, rtype, scale, shift );
-    else
-    {
-        Mat temp;
-        src.convertTo( temp, rtype, scale, shift );
-        temp.copyTo( _dst, _mask );
-    }
-}
-
 } // namespace
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 798fde74d4..5262eb1b9c 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -53,6 +53,75 @@
 namespace cv
 {
 
+template <typename T> static inline
+void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to)
+{
+    int i = 0;
+    for(; i < cn; i++)
+        buf[i] = saturate_cast<T>(s.val[i]);
+    for(; i < unroll_to; i++)
+        buf[i] = buf[i-cn];
+}
+
+void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
+{
+    CV_INSTRUMENT_REGION();
+
+    const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(cn <= 4);
+    switch(depth)
+    {
+    case CV_8U:
+        scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to);
+        break;
+    case CV_8S:
+        scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to);
+        break;
+    case CV_16U:
+        scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to);
+        break;
+    case CV_16S:
+        scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to);
+        break;
+    case CV_32S:
+        scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to);
+        break;
+    case CV_32F:
+        scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to);
+        break;
+    case CV_64F:
+        scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
+        break;
+#if CV_VERSION_MAJOR >= 4
+    case CV_16F:
+        scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
+        break;
+#endif
+    default:
+        CV_Error(CV_StsUnsupportedFormat,"");
+    }
+}
+
+void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
+{
+    int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
+    size_t esz = CV_ELEM_SIZE(buftype);
+    BinaryFunc cvtFn = getConvertFunc(sc.depth(), buftype);
+    CV_Assert(cvtFn);
+    cvtFn(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
+    // unroll the scalar
+    if( scn < cn )
+    {
+        CV_Assert( scn == 1 );
+        size_t esz1 = CV_ELEM_SIZE1(buftype);
+        for( size_t i = esz1; i < esz; i++ )
+            scbuf[i] = scbuf[i - esz1];
+    }
+    for( size_t i = esz; i < blocksize*esz; i++ )
+        scbuf[i] = scbuf[i - esz];
+}
+
+
 template<typename T> static void
 copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
@@ -594,490 +663,6 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
 
-#if CV_SIMD128
-template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
-{
-    typedef typename V::lane_type T;
-    int end = (int)(size.width*esz);
-    int width = (end + 1)/2;
-    int width_1 = width & -v_uint8x16::nlanes;
-    int i, j;
-
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(T)>(src, dst));
-#endif
-
-    for( ; size.height--; src += sstep, dst += dstep )
-    {
-        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
-        {
-            V t0, t1;
-
-            t0 = v_load((T*)((uchar*)src + i));
-            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
-            t0 = v_reverse(t0);
-            t1 = v_reverse(t1);
-            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
-            v_store((T*)(dst + i), t1);
-        }
-        if (isAligned<sizeof(T)>(src, dst))
-        {
-            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
-            {
-                T t0, t1;
-
-                t0 = *((T*)((uchar*)src + i));
-                t1 = *((T*)((uchar*)src + j - sizeof(T)));
-                *((T*)(dst + j - sizeof(T))) = t0;
-                *((T*)(dst + i)) = t1;
-            }
-        }
-        else
-        {
-            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
-            {
-                for (int k = 0; k < (int)sizeof(T); k++)
-                {
-                    uchar t0, t1;
-
-                    t0 = *((uchar*)src + i + k);
-                    t1 = *((uchar*)src + j + k - sizeof(T));
-                    *(dst + j + k - sizeof(T)) = t0;
-                    *(dst + i + k) = t1;
-                }
-            }
-        }
-    }
-}
-
-template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
-{
-    int end = (int)(size.width*esz);
-    int width = (end + 1)/2;
-
-#if CV_STRONG_ALIGNMENT
-    CV_Assert(isAligned<sizeof(T1)>(src, dst));
-    CV_Assert(isAligned<sizeof(T2)>(src, dst));
-#endif
-
-    for( ; size.height--; src += sstep, dst += dstep )
-    {
-        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
-        {
-            T1 t0, t1;
-            T2 t2, t3;
-
-            t0 = *((T1*)((uchar*)src + i));
-            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
-            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
-            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
-            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
-            *((T2*)(dst + j - sizeof(T2))) = t2;
-            *((T1*)(dst + i)) = t1;
-            *((T2*)(dst + i + sizeof(T1))) = t3;
-        }
-    }
-}
-#endif
-
-static void
-flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
-{
-#if CV_SIMD
-#if CV_STRONG_ALIGNMENT
-    size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
-#endif
-    if (esz == 2 * v_uint8x16::nlanes)
-    {
-        int end = (int)(size.width*esz);
-        int width = end/2;
-
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
-            {
-#if CV_SIMD256
-                v_uint8x32 t0, t1;
-
-                t0 = v256_load((uchar*)src + i);
-                t1 = v256_load((uchar*)src + j);
-                v_store(dst + j, t0);
-                v_store(dst + i, t1);
-#else
-                v_uint8x16 t0, t1, t2, t3;
-
-                t0 = v_load((uchar*)src + i);
-                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
-                t2 = v_load((uchar*)src + j);
-                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
-                v_store(dst + j, t0);
-                v_store(dst + j + v_uint8x16::nlanes, t1);
-                v_store(dst + i, t2);
-                v_store(dst + i + v_uint8x16::nlanes, t3);
-#endif
-            }
-        }
-    }
-    else if (esz == v_uint8x16::nlanes)
-    {
-        int end = (int)(size.width*esz);
-        int width = end/2;
-
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
-            {
-                v_uint8x16 t0, t1;
-
-                t0 = v_load((uchar*)src + i);
-                t1 = v_load((uchar*)src + j);
-                v_store(dst + j, t0);
-                v_store(dst + i, t1);
-            }
-        }
-    }
-    else if (esz == 8
-#if CV_STRONG_ALIGNMENT
-            && isAligned<sizeof(uint64)>(alignmentMark)
-#endif
-    )
-    {
-        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 4
-#if CV_STRONG_ALIGNMENT
-            && isAligned<sizeof(unsigned)>(alignmentMark)
-#endif
-    )
-    {
-        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 2
-#if CV_STRONG_ALIGNMENT
-            && isAligned<sizeof(ushort)>(alignmentMark)
-#endif
-    )
-    {
-        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 1)
-    {
-        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 24
-#if CV_STRONG_ALIGNMENT
-            && isAligned<sizeof(uint64_t)>(alignmentMark)
-#endif
-    )
-    {
-        int end = (int)(size.width*esz);
-        int width = (end + 1)/2;
-
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
-            {
-                v_uint8x16 t0, t1;
-                uint64_t t2, t3;
-
-                t0 = v_load((uchar*)src + i);
-                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
-                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
-                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
-                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
-                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
-                v_store(dst + i, t1);
-                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
-            }
-        }
-    }
-#if !CV_STRONG_ALIGNMENT
-    else if (esz == 12)
-    {
-        flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 6)
-    {
-        flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
-    }
-    else if (esz == 3)
-    {
-        flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
-    }
-#endif
-    else
-#endif // CV_SIMD
-    {
-        int i, j, limit = (int)(((size.width + 1)/2)*esz);
-        AutoBuffer<int> _tab(size.width*esz);
-        int* tab = _tab.data();
-
-        for( i = 0; i < size.width; i++ )
-            for( size_t k = 0; k < esz; k++ )
-                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
-
-        for( ; size.height--; src += sstep, dst += dstep )
-        {
-            for( i = 0; i < limit; i++ )
-            {
-                j = tab[i];
-                uchar t0 = src[i], t1 = src[j];
-                dst[i] = t1; dst[j] = t0;
-            }
-        }
-    }
-}
-
-static void
-flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz )
-{
-    const uchar* src1 = src0 + (size.height - 1)*sstep;
-    uchar* dst1 = dst0 + (size.height - 1)*dstep;
-    size.width *= (int)esz;
-
-    for( int y = 0; y < (size.height + 1)/2; y++, src0 += sstep, src1 -= sstep,
-                                                  dst0 += dstep, dst1 -= dstep )
-    {
-        int i = 0;
-#if CV_SIMD
-#if CV_STRONG_ALIGNMENT
-        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
-#endif
-        {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
-            {
-                v_int32 t0 = vx_load((int*)(src0 + i));
-                v_int32 t1 = vx_load((int*)(src1 + i));
-                vx_store((int*)(dst0 + i), t1);
-                vx_store((int*)(dst1 + i), t0);
-            }
-        }
-#if CV_STRONG_ALIGNMENT
-        else
-        {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
-            {
-                v_uint8 t0 = vx_load(src0 + i);
-                v_uint8 t1 = vx_load(src1 + i);
-                vx_store(dst0 + i, t1);
-                vx_store(dst1 + i, t0);
-            }
-        }
-#endif
-#endif
-
-        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
-        {
-            for( ; i <= size.width - 16; i += 16 )
-            {
-                int t0 = ((int*)(src0 + i))[0];
-                int t1 = ((int*)(src1 + i))[0];
-
-                ((int*)(dst0 + i))[0] = t1;
-                ((int*)(dst1 + i))[0] = t0;
-
-                t0 = ((int*)(src0 + i))[1];
-                t1 = ((int*)(src1 + i))[1];
-
-                ((int*)(dst0 + i))[1] = t1;
-                ((int*)(dst1 + i))[1] = t0;
-
-                t0 = ((int*)(src0 + i))[2];
-                t1 = ((int*)(src1 + i))[2];
-
-                ((int*)(dst0 + i))[2] = t1;
-                ((int*)(dst1 + i))[2] = t0;
-
-                t0 = ((int*)(src0 + i))[3];
-                t1 = ((int*)(src1 + i))[3];
-
-                ((int*)(dst0 + i))[3] = t1;
-                ((int*)(dst1 + i))[3] = t0;
-            }
-
-            for( ; i <= size.width - 4; i += 4 )
-            {
-                int t0 = ((int*)(src0 + i))[0];
-                int t1 = ((int*)(src1 + i))[0];
-
-                ((int*)(dst0 + i))[0] = t1;
-                ((int*)(dst1 + i))[0] = t0;
-            }
-        }
-
-        for( ; i < size.width; i++ )
-        {
-            uchar t0 = src0[i];
-            uchar t1 = src1[i];
-
-            dst0[i] = t1;
-            dst1[i] = t0;
-        }
-    }
-}
-
-#ifdef HAVE_OPENCL
-
-enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
-
-static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
-{
-    CV_Assert(flipCode >= -1 && flipCode <= 1);
-
-    const ocl::Device & dev = ocl::Device::getDefault();
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
-
-    bool doubleSupport = dev.doubleFPConfig() > 0;
-    if (!doubleSupport && depth == CV_64F)
-        kercn = cn;
-
-    if (cn > 4)
-        return false;
-
-    const char * kernelName;
-    if (flipCode == 0)
-        kernelName = "arithm_flip_rows", flipType = FLIP_ROWS;
-    else if (flipCode > 0)
-        kernelName = "arithm_flip_cols", flipType = FLIP_COLS;
-    else
-        kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
-
-    int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
-    kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
-
-    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
-                kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
-                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
-    if (k.empty())
-        return false;
-
-    Size size = _src.size();
-    _dst.create(size, type);
-    UMat src = _src.getUMat(), dst = _dst.getUMat();
-
-    int cols = size.width * cn / kercn, rows = size.height;
-    cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols;
-    rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows;
-
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src),
-           ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols);
-
-    size_t maxWorkGroupSize = dev.maxWorkGroupSize();
-    CV_Assert(maxWorkGroupSize % 4 == 0);
-
-    size_t globalsize[2] = { (size_t)cols, ((size_t)rows + pxPerWIy - 1) / pxPerWIy },
-            localsize[2] = { maxWorkGroupSize / 4, 4 };
-    return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false);
-}
-
-#endif
-
-#if defined HAVE_IPP
-static bool ipp_flip(Mat &src, Mat &dst, int flip_mode)
-{
-#ifdef HAVE_IPP_IW
-    CV_INSTRUMENT_REGION_IPP();
-
-    // Details: https://github.com/opencv/opencv/issues/12943
-    if (flip_mode <= 0 /* swap rows */
-        && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42
-        && (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/
-    )
-        return false;
-
-    IppiAxis ippMode;
-    if(flip_mode < 0)
-        ippMode = ippAxsBoth;
-    else if(flip_mode == 0)
-        ippMode = ippAxsHorizontal;
-    else
-        ippMode = ippAxsVertical;
-
-    try
-    {
-        ::ipp::IwiImage iwSrc = ippiGetImage(src);
-        ::ipp::IwiImage iwDst = ippiGetImage(dst);
-
-        CV_INSTRUMENT_FUN_IPP(::ipp::iwiMirror, iwSrc, iwDst, ippMode);
-    }
-    catch(const ::ipp::IwException &)
-    {
-        return false;
-    }
-
-    return true;
-#else
-    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(flip_mode);
-    return false;
-#endif
-}
-#endif
-
-
-void flip( InputArray _src, OutputArray _dst, int flip_mode )
-{
-    CV_INSTRUMENT_REGION();
-
-    CV_Assert( _src.dims() <= 2 );
-    Size size = _src.size();
-
-    if (flip_mode < 0)
-    {
-        if (size.width == 1)
-            flip_mode = 0;
-        if (size.height == 1)
-            flip_mode = 1;
-    }
-
-    if ((size.width == 1 && flip_mode > 0) ||
-        (size.height == 1 && flip_mode == 0))
-    {
-        return _src.copyTo(_dst);
-    }
-
-    CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src, _dst, flip_mode))
-
-    Mat src = _src.getMat();
-    int type = src.type();
-    _dst.create( size, type );
-    Mat dst = _dst.getMat();
-
-    CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));
-
-    size_t esz = CV_ELEM_SIZE(type);
-
-    if( flip_mode <= 0 )
-        flipVert( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
-    else
-        flipHoriz( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
-
-    if( flip_mode < 0 )
-        flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz );
-}
-
-void rotate(InputArray _src, OutputArray _dst, int rotateMode)
-{
-    CV_Assert(_src.dims() <= 2);
-
-    switch (rotateMode)
-    {
-    case ROTATE_90_CLOCKWISE:
-        transpose(_src, _dst);
-        flip(_dst, _dst, 1);
-        break;
-    case ROTATE_180:
-        flip(_src, _dst, -1);
-        break;
-    case ROTATE_90_COUNTERCLOCKWISE:
-        transpose(_src, _dst);
-        flip(_dst, _dst, 0);
-        break;
-    default:
-        break;
-    }
-}
 
 #if defined HAVE_OPENCL && !defined __APPLE__
 
@@ -1499,6 +1084,9 @@ void cv::copyMakeBorder( InputArray _src, OutputArray _dst, int top, int bottom,
     }
 }
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 /* dst = src */
 CV_IMPL void
 cvCopy( const void* srcarr, void* dstarr, const void* maskarr )
@@ -1614,4 +1202,5 @@ cvRepeat( const CvArr* srcarr, CvArr* dstarr )
     cv::repeat(src, dst.rows/src.rows, dst.cols/src.cols, dst);
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 /* End of file. */
diff --git a/modules/core/src/datastructs.cpp b/modules/core/src/datastructs.cpp
index 61adf3493e..cd9196a130 100644
--- a/modules/core/src/datastructs.cpp
+++ b/modules/core/src/datastructs.cpp
@@ -40,6 +40,8 @@
 //M*/
 #include "precomp.hpp"
 
+#ifndef OPENCV_EXCLUDE_C_API
+
 /* default alignment for dynamic data strucutures, resided in storages. */
 #define  CV_STRUCT_ALIGN    ((int)sizeof(double))
 
@@ -3585,4 +3587,5 @@ void  seqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr )
 
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 /* End of file. */
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index b307703a32..e378f31e66 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -4640,6 +4640,9 @@ int cv::getOptimalDFTSize( int size0 )
     return optimalDFTSizeTab[b];
 }
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL void
 cvDFT( const CvArr* srcarr, CvArr* dstarr, int flags, int nonzero_rows )
 {
@@ -4695,4 +4698,5 @@ cvGetOptimalDFTSize( int size0 )
     return cv::getOptimalDFTSize(size0);
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 /* End of file. */
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 486b7a5aba..9bca6a8211 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -753,8 +753,6 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
                 (double*)alignPtr(buffer, sizeof(double)), DBL_EPSILON*2 );
 }
 
-}
-
 /****************************************************************************************\
 *                                 Determinant of the matrix                              *
 \****************************************************************************************/
@@ -764,7 +762,7 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
                    m(0,1)*((double)m(1,0)*m(2,2) - (double)m(1,2)*m(2,0)) +  \
                    m(0,2)*((double)m(1,0)*m(2,1) - (double)m(1,1)*m(2,0)))
 
-double cv::determinant( InputArray _mat )
+double determinant( InputArray _mat )
 {
     CV_INSTRUMENT_REGION();
 
@@ -842,7 +840,7 @@ double cv::determinant( InputArray _mat )
 #define Df( y, x ) ((float*)(dstdata + y*dststep))[x]
 #define Dd( y, x ) ((double*)(dstdata + y*dststep))[x]
 
-double cv::invert( InputArray _src, OutputArray _dst, int method )
+double invert( InputArray _src, OutputArray _dst, int method )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1069,13 +1067,19 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
     return result;
 }
 
+UMat UMat::inv(int method) const
+{
+    UMat m;
+    invert(*this, m, method);
+    return m;
+}
 
 
 /****************************************************************************************\
 *                              Solving a linear system                                   *
 \****************************************************************************************/
 
-bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
+bool solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1374,7 +1378,7 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
 
 /////////////////// finding eigenvalues and eigenvectors of a symmetric matrix ///////////////
 
-bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
+bool eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1396,7 +1400,7 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
     const bool evecNeeded = _evects.needed();
     const int esOptions = evecNeeded ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly;
     _evals.create(n, 1, type);
-    cv::Mat evals = _evals.getMat();
+    Mat evals = _evals.getMat();
     if ( type == CV_64F )
     {
         Eigen::MatrixXd src_eig, zeros_eig;
@@ -1448,9 +1452,6 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
 #endif
 }
 
-namespace cv
-{
-
 static void _SVDcompute( InputArray _aarr, OutputArray _w,
                          OutputArray _u, OutputArray _vt, int flags )
 {
@@ -1598,6 +1599,9 @@ void cv::SVBackSubst(InputArray w, InputArray u, InputArray vt, InputArray rhs,
 }
 
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL double
 cvDet( const CvArr* arr )
 {
@@ -1789,3 +1793,4 @@ cvSVBkSb( const CvArr* warr, const CvArr* uarr,
     cv::SVD::backSubst(w, u, v, rhs, dst);
     CV_Assert( dst.data == dst0.data );
 }
+#endif  // OPENCV_EXCLUDE_C_API
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index a4e5263aa8..9fdf7d7702 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -1637,6 +1637,9 @@ void patchNaNs( InputOutputArray _a, double _val )
 
 }
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL float cvCbrt(float value) { return cv::cubeRoot(value); }
 CV_IMPL float cvFastArctan(float y, float x) { return cv::fastAtan2(y, x); }
 
@@ -1720,6 +1723,7 @@ CV_IMPL int cvCheckArr( const CvArr* arr, int flags,
     return cv::checkRange(cv::cvarrToMat(arr), (flags & CV_CHECK_QUIET) != 0, 0, minVal, maxVal );
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 
 /*
   Finds real roots of cubic, quadratic or linear equation.
@@ -2015,6 +2019,8 @@ double cv::solvePoly( InputArray _coeffs0, OutputArray _roots0, int maxIters )
 }
 
 
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL int
 cvSolveCubic( const CvMat* coeffs, CvMat* roots )
 {
@@ -2034,6 +2040,7 @@ void cvSolvePoly(const CvMat* a, CvMat *r, int maxiter, int)
     CV_Assert( _r.data == _r0.data ); // check that the array of roots was not reallocated
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
 
 
 // Common constants for dispatched code
diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp
index a9b82aee88..e81064ec16 100644
--- a/modules/core/src/matmul.dispatch.cpp
+++ b/modules/core/src/matmul.dispatch.cpp
@@ -999,8 +999,79 @@ double Mat::dot(InputArray _mat) const
     return r;
 }
 
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )  // OpenCL dot product; writes res and returns true only when k.run() succeeds
+{
+    UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);  // view both operands as single-channel
+
+    int type = src1.type(), depth = CV_MAT_DEPTH(type),
+            kercn = ocl::predictOptimalVectorWidth(src1, src2);  // forwarded to the kernel build options as -D kercn
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( !doubleSupport && depth == CV_64F )  // CV_64F input is rejected when the device reports no double support
+        return false;
+
+    int dbsize = ocl::Device::getDefault().maxComputeUnits();  // length of the result buffer db; globalsize below is dbsize * wgs
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();  // passed to k.run() as the local size
+    int ddepth = std::max(CV_32F, depth);  // depth of db: CV_32F unless depth has a larger depth code
+
+    int wgs2_aligned = 1;
+    while (wgs2_aligned < (int)wgs)  // grow to the smallest power of two >= wgs ...
+        wgs2_aligned <<= 1;
+    wgs2_aligned >>= 1;  // ... then halve it
+
+    char cvt[40];  // scratch buffer handed to ocl::convertTypeStr
+    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
+                  format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
+                         "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
+                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
+                         ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
+                         (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
+                         _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
+    if (k.empty())  // no usable kernel object: give up on the OpenCL path
+        return false;
+
+    UMat db(1, dbsize, ddepth);  // 1 x dbsize buffer the kernel writes into (bound as PtrWriteOnly)
+
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+            dbarg = ocl::KernelArg::PtrWriteOnly(db);
+
+    k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);
+
+    size_t globalsize = dbsize * wgs;  // 1-D launch: dbsize groups of wgs work-items
+    if (k.run(1, &globalsize, &wgs, false))
+    {
+        res = sum(db.getMat(ACCESS_READ))[0];  // fold the entries of db into the scalar result
+        return true;
+    }
+    return false;
+}
+
+#endif
+
+double UMat::dot(InputArray m) const  // dot product of *this with m
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_Assert(m.sameSize(*this) && m.type() == type());  // operands must agree in size and type
+
+#ifdef HAVE_OPENCL
+    double r = 0;
+    CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)  // NOTE(review): presumably returns r when ocl_dot succeeds -- confirm against the CV_OCL_RUN_ definition
+#endif
+
+    return getMat(ACCESS_READ).dot(m);  // otherwise delegate to Mat::dot through a read-access Mat
+}
+
 }  // namespace cv::
 
+
+#ifndef OPENCV_EXCLUDE_C_API
 /****************************************************************************************\
 *                                    Earlier API                                         *
 \****************************************************************************************/
@@ -1225,4 +1296,6 @@ cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
     CV_Assert(dst0.data == dst.data);
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
+
 /* End of file. */
diff --git a/modules/core/src/matrix_c.cpp b/modules/core/src/matrix_c.cpp
index 2fead4100c..baa61bb66f 100644
--- a/modules/core/src/matrix_c.cpp
+++ b/modules/core/src/matrix_c.cpp
@@ -6,6 +6,7 @@
 #include "opencv2/core/mat.hpp"
 #include "opencv2/core/types_c.h"
 
+#ifndef OPENCV_EXCLUDE_C_API
 // glue
 
 CvMatND cvMatND(const cv::Mat& m)
@@ -360,7 +361,6 @@ cvSort( const CvArr* _src, CvArr* _dst, CvArr* _idx, int flags )
     }
 }
 
-
 CV_IMPL int
 cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
            CvTermCriteria termcrit, int attempts, CvRNG*,
@@ -389,3 +389,5 @@ cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
         *_compactness = compactness;
     return 1;
 }
+
+#endif  // OPENCV_EXCLUDE_C_API
diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp
index 6f863b8871..ca8edc4771 100644
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -226,6 +226,23 @@ void cv::setIdentity( InputOutputArray _m, const Scalar& s )
     }
 }
 
+
+namespace cv {
+
+UMat UMat::eye(int rows, int cols, int type)
+{
+    return eye(Size(cols, rows), type);  // Size is built as (cols, rows); forwards to the Size overload
+}
+
+UMat UMat::eye(Size size, int type)
+{
+    UMat identity(size, type);
+    setIdentity(identity);  // called with its default scalar argument
+    return identity;
+}
+
+}  // namespace
+
 //////////////////////////////////////////// trace ///////////////////////////////////////////
 
 cv::Scalar cv::trace( InputArray _m )
@@ -260,285 +277,6 @@ cv::Scalar cv::trace( InputArray _m )
     return cv::sum(m.diag());
 }
 
-////////////////////////////////////// transpose /////////////////////////////////////////
-
-namespace cv
-{
-
-template<typename T> static void
-transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
-{
-    int i=0, j, m = sz.width, n = sz.height;
-
-    #if CV_ENABLE_UNROLLED
-    for(; i <= m - 4; i += 4 )
-    {
-        T* d0 = (T*)(dst + dstep*i);
-        T* d1 = (T*)(dst + dstep*(i+1));
-        T* d2 = (T*)(dst + dstep*(i+2));
-        T* d3 = (T*)(dst + dstep*(i+3));
-
-        for( j = 0; j <= n - 4; j += 4 )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
-            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
-            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
-            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
-
-            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
-            d1[j] = s0[1]; d1[j+1] = s1[1]; d1[j+2] = s2[1]; d1[j+3] = s3[1];
-            d2[j] = s0[2]; d2[j+1] = s1[2]; d2[j+2] = s2[2]; d2[j+3] = s3[2];
-            d3[j] = s0[3]; d3[j+1] = s1[3]; d3[j+2] = s2[3]; d3[j+3] = s3[3];
-        }
-
-        for( ; j < n; j++ )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
-            d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
-        }
-    }
-    #endif
-    for( ; i < m; i++ )
-    {
-        T* d0 = (T*)(dst + dstep*i);
-        j = 0;
-        #if CV_ENABLE_UNROLLED
-        for(; j <= n - 4; j += 4 )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
-            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
-            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
-            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
-
-            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
-        }
-        #endif
-        for( ; j < n; j++ )
-        {
-            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
-            d0[j] = s0[0];
-        }
-    }
-}
-
-template<typename T> static void
-transposeI_( uchar* data, size_t step, int n )
-{
-    for( int i = 0; i < n; i++ )
-    {
-        T* row = (T*)(data + step*i);
-        uchar* data1 = data + i*sizeof(T);
-        for( int j = i+1; j < n; j++ )
-            std::swap( row[j], *(T*)(data1 + step*j) );
-    }
-}
-
-typedef void (*TransposeFunc)( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz );
-typedef void (*TransposeInplaceFunc)( uchar* data, size_t step, int n );
-
-#define DEF_TRANSPOSE_FUNC(suffix, type) \
-static void transpose_##suffix( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz ) \
-{ transpose_<type>(src, sstep, dst, dstep, sz); } \
-\
-static void transposeI_##suffix( uchar* data, size_t step, int n ) \
-{ transposeI_<type>(data, step, n); }
-
-DEF_TRANSPOSE_FUNC(8u, uchar)
-DEF_TRANSPOSE_FUNC(16u, ushort)
-DEF_TRANSPOSE_FUNC(8uC3, Vec3b)
-DEF_TRANSPOSE_FUNC(32s, int)
-DEF_TRANSPOSE_FUNC(16uC3, Vec3s)
-DEF_TRANSPOSE_FUNC(32sC2, Vec2i)
-DEF_TRANSPOSE_FUNC(32sC3, Vec3i)
-DEF_TRANSPOSE_FUNC(32sC4, Vec4i)
-DEF_TRANSPOSE_FUNC(32sC6, Vec6i)
-DEF_TRANSPOSE_FUNC(32sC8, Vec8i)
-
-static TransposeFunc transposeTab[] =
-{
-    0, transpose_8u, transpose_16u, transpose_8uC3, transpose_32s, 0, transpose_16uC3, 0,
-    transpose_32sC2, 0, 0, 0, transpose_32sC3, 0, 0, 0, transpose_32sC4,
-    0, 0, 0, 0, 0, 0, 0, transpose_32sC6, 0, 0, 0, 0, 0, 0, 0, transpose_32sC8
-};
-
-static TransposeInplaceFunc transposeInplaceTab[] =
-{
-    0, transposeI_8u, transposeI_16u, transposeI_8uC3, transposeI_32s, 0, transposeI_16uC3, 0,
-    transposeI_32sC2, 0, 0, 0, transposeI_32sC3, 0, 0, 0, transposeI_32sC4,
-    0, 0, 0, 0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8
-};
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_transpose( InputArray _src, OutputArray _dst )
-{
-    const ocl::Device & dev = ocl::Device::getDefault();
-    const int TILE_DIM = 32, BLOCK_ROWS = 8;
-    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
-        rowsPerWI = dev.isIntel() ? 4 : 1;
-
-    UMat src = _src.getUMat();
-    _dst.create(src.cols, src.rows, type);
-    UMat dst = _dst.getUMat();
-
-    String kernelName("transpose");
-    bool inplace = dst.u == src.u;
-
-    if (inplace)
-    {
-        CV_Assert(dst.cols == dst.rows);
-        kernelName += "_inplace";
-    }
-    else
-    {
-        // check required local memory size
-        size_t required_local_memory = (size_t) TILE_DIM*(TILE_DIM+1)*CV_ELEM_SIZE(type);
-        if (required_local_memory > ocl::Device::getDefault().localMemSize())
-            return false;
-    }
-
-    ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
-                  format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d%s",
-                         ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
-                         cn, TILE_DIM, BLOCK_ROWS, rowsPerWI, inplace ? " -D INPLACE" : ""));
-    if (k.empty())
-        return false;
-
-    if (inplace)
-        k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows);
-    else
-        k.args(ocl::KernelArg::ReadOnly(src),
-               ocl::KernelArg::WriteOnlyNoSize(dst));
-
-    size_t localsize[2]  = { TILE_DIM, BLOCK_ROWS };
-    size_t globalsize[2] = { (size_t)src.cols, inplace ? ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI : (divUp((size_t)src.rows, TILE_DIM) * BLOCK_ROWS) };
-
-    if (inplace && dev.isIntel())
-    {
-        localsize[0] = 16;
-        localsize[1] = dev.maxWorkGroupSize() / localsize[0];
-    }
-
-    return k.run(2, globalsize, localsize, false);
-}
-
-#endif
-
-#ifdef HAVE_IPP
-static bool ipp_transpose( Mat &src, Mat &dst )
-{
-    CV_INSTRUMENT_REGION_IPP();
-
-    int type = src.type();
-    typedef IppStatus (CV_STDCALL * IppiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize);
-    typedef IppStatus (CV_STDCALL * IppiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize);
-    IppiTranspose ippiTranspose = 0;
-    IppiTransposeI ippiTranspose_I = 0;
-
-    if (dst.data == src.data && dst.cols == dst.rows)
-    {
-        CV_SUPPRESS_DEPRECATED_START
-        ippiTranspose_I =
-            type == CV_8UC1 ? (IppiTransposeI)ippiTranspose_8u_C1IR :
-            type == CV_8UC3 ? (IppiTransposeI)ippiTranspose_8u_C3IR :
-            type == CV_8UC4 ? (IppiTransposeI)ippiTranspose_8u_C4IR :
-            type == CV_16UC1 ? (IppiTransposeI)ippiTranspose_16u_C1IR :
-            type == CV_16UC3 ? (IppiTransposeI)ippiTranspose_16u_C3IR :
-            type == CV_16UC4 ? (IppiTransposeI)ippiTranspose_16u_C4IR :
-            type == CV_16SC1 ? (IppiTransposeI)ippiTranspose_16s_C1IR :
-            type == CV_16SC3 ? (IppiTransposeI)ippiTranspose_16s_C3IR :
-            type == CV_16SC4 ? (IppiTransposeI)ippiTranspose_16s_C4IR :
-            type == CV_32SC1 ? (IppiTransposeI)ippiTranspose_32s_C1IR :
-            type == CV_32SC3 ? (IppiTransposeI)ippiTranspose_32s_C3IR :
-            type == CV_32SC4 ? (IppiTransposeI)ippiTranspose_32s_C4IR :
-            type == CV_32FC1 ? (IppiTransposeI)ippiTranspose_32f_C1IR :
-            type == CV_32FC3 ? (IppiTransposeI)ippiTranspose_32f_C3IR :
-            type == CV_32FC4 ? (IppiTransposeI)ippiTranspose_32f_C4IR : 0;
-        CV_SUPPRESS_DEPRECATED_END
-    }
-    else
-    {
-        ippiTranspose =
-            type == CV_8UC1 ? (IppiTranspose)ippiTranspose_8u_C1R :
-            type == CV_8UC3 ? (IppiTranspose)ippiTranspose_8u_C3R :
-            type == CV_8UC4 ? (IppiTranspose)ippiTranspose_8u_C4R :
-            type == CV_16UC1 ? (IppiTranspose)ippiTranspose_16u_C1R :
-            type == CV_16UC3 ? (IppiTranspose)ippiTranspose_16u_C3R :
-            type == CV_16UC4 ? (IppiTranspose)ippiTranspose_16u_C4R :
-            type == CV_16SC1 ? (IppiTranspose)ippiTranspose_16s_C1R :
-            type == CV_16SC3 ? (IppiTranspose)ippiTranspose_16s_C3R :
-            type == CV_16SC4 ? (IppiTranspose)ippiTranspose_16s_C4R :
-            type == CV_32SC1 ? (IppiTranspose)ippiTranspose_32s_C1R :
-            type == CV_32SC3 ? (IppiTranspose)ippiTranspose_32s_C3R :
-            type == CV_32SC4 ? (IppiTranspose)ippiTranspose_32s_C4R :
-            type == CV_32FC1 ? (IppiTranspose)ippiTranspose_32f_C1R :
-            type == CV_32FC3 ? (IppiTranspose)ippiTranspose_32f_C3R :
-            type == CV_32FC4 ? (IppiTranspose)ippiTranspose_32f_C4R : 0;
-    }
-
-    IppiSize roiSize = { src.cols, src.rows };
-    if (ippiTranspose != 0)
-    {
-        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, roiSize) >= 0)
-            return true;
-    }
-    else if (ippiTranspose_I != 0)
-    {
-        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose_I, dst.ptr(), (int)dst.step, roiSize) >= 0)
-            return true;
-    }
-    return false;
-}
-#endif
-
-}
-
-
-void cv::transpose( InputArray _src, OutputArray _dst )
-{
-    CV_INSTRUMENT_REGION();
-
-    int type = _src.type(), esz = CV_ELEM_SIZE(type);
-    CV_Assert( _src.dims() <= 2 && esz <= 32 );
-
-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_transpose(_src, _dst))
-
-    Mat src = _src.getMat();
-    if( src.empty() )
-    {
-        _dst.release();
-        return;
-    }
-
-    _dst.create(src.cols, src.rows, src.type());
-    Mat dst = _dst.getMat();
-
-    // handle the case of single-column/single-row matrices, stored in STL vectors.
-    if( src.rows != dst.cols || src.cols != dst.rows )
-    {
-        CV_Assert( src.size() == dst.size() && (src.cols == 1 || src.rows == 1) );
-        src.copyTo(dst);
-        return;
-    }
-
-    CV_IPP_RUN_FAST(ipp_transpose(src, dst))
-
-    if( dst.data == src.data )
-    {
-        TransposeInplaceFunc func = transposeInplaceTab[esz];
-        CV_Assert( func != 0 );
-        CV_Assert( dst.cols == dst.rows );
-        func( dst.ptr(), dst.step, dst.rows );
-    }
-    else
-    {
-        TransposeFunc func = transposeTab[esz];
-        CV_Assert( func != 0 );
-        func( src.ptr(), src.step, dst.ptr(), dst.step, src.size() );
-    }
-}
-
 
 ////////////////////////////////////// completeSymm /////////////////////////////////////////
 
diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp
new file mode 100644
index 0000000000..37bc273b4d
--- /dev/null
+++ b/modules/core/src/matrix_transform.cpp
@@ -0,0 +1,770 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "opencl_kernels_core.hpp"
+
+namespace cv {
+
+////////////////////////////////////// transpose /////////////////////////////////////////
+
+template<typename T> static void  // out-of-place transpose of T-sized elements; sstep/dstep are byte strides
+transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
+{
+    int i=0, j, m = sz.width, n = sz.height;  // i indexes source columns (= dst rows), j indexes source rows (= dst columns)
+
+    #if CV_ENABLE_UNROLLED
+    for(; i <= m - 4; i += 4 )  // four destination rows per outer iteration
+    {
+        T* d0 = (T*)(dst + dstep*i);
+        T* d1 = (T*)(dst + dstep*(i+1));
+        T* d2 = (T*)(dst + dstep*(i+2));
+        T* d3 = (T*)(dst + dstep*(i+3));
+
+        for( j = 0; j <= n - 4; j += 4 )  // 4x4 tile: four source rows feed four destination rows
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
+            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
+            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
+            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
+
+            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
+            d1[j] = s0[1]; d1[j+1] = s1[1]; d1[j+2] = s2[1]; d1[j+3] = s3[1];
+            d2[j] = s0[2]; d2[j+1] = s1[2]; d2[j+2] = s2[2]; d2[j+3] = s3[2];
+            d3[j] = s0[3]; d3[j+1] = s1[3]; d3[j+2] = s2[3]; d3[j+3] = s3[3];
+        }
+
+        for( ; j < n; j++ )  // leftover source rows (n not a multiple of 4)
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
+            d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
+        }
+    }
+    #endif
+    for( ; i < m; i++ )  // remaining destination rows, one at a time (all of them when unrolling is disabled)
+    {
+        T* d0 = (T*)(dst + dstep*i);
+        j = 0;
+        #if CV_ENABLE_UNROLLED
+        for(; j <= n - 4; j += 4 )  // four source rows per iteration into a single destination row
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
+            const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
+            const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
+            const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
+
+            d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
+        }
+        #endif
+        for( ; j < n; j++ )  // scalar tail
+        {
+            const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
+            d0[j] = s0[0];
+        }
+    }
+}
+
+template<typename T> static void  // in-place transpose of an n x n matrix of T; step is the row stride in bytes
+transposeI_( uchar* data, size_t step, int n )
+{
+    for( int r = 0; r < n; r++ )
+    {
+        T* rowPtr = (T*)(data + step*r);            // start of row r
+        uchar* colBase = data + r*sizeof(T);        // byte address of element (0, r)
+        for( int c = r+1; c < n; c++ )              // only the part above the diagonal
+            std::swap( rowPtr[c], *(T*)(colBase + step*c) );  // (r, c) <-> (c, r)
+    }
+}
+
+typedef void (*TransposeFunc)( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz );  // signature of the out-of-place workers
+typedef void (*TransposeInplaceFunc)( uchar* data, size_t step, int n );  // signature of the in-place workers
+
+#define DEF_TRANSPOSE_FUNC(suffix, type) \
+static void transpose_##suffix( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz ) \
+{ transpose_<type>(src, sstep, dst, dstep, sz); } \
+\
+static void transposeI_##suffix( uchar* data, size_t step, int n ) \
+{ transposeI_<type>(data, step, n); }
+
+DEF_TRANSPOSE_FUNC(8u, uchar)  // each use defines transpose_<suffix> and transposeI_<suffix> for the given element type
+DEF_TRANSPOSE_FUNC(16u, ushort)
+DEF_TRANSPOSE_FUNC(8uC3, Vec3b)
+DEF_TRANSPOSE_FUNC(32s, int)
+DEF_TRANSPOSE_FUNC(16uC3, Vec3s)
+DEF_TRANSPOSE_FUNC(32sC2, Vec2i)
+DEF_TRANSPOSE_FUNC(32sC3, Vec3i)
+DEF_TRANSPOSE_FUNC(32sC4, Vec4i)
+DEF_TRANSPOSE_FUNC(32sC6, Vec6i)
+DEF_TRANSPOSE_FUNC(32sC8, Vec8i)
+
+static TransposeFunc transposeTab[] =  // looked up as transposeTab[esz] in transpose(); 0 marks a size with no worker
+{
+    0, transpose_8u, transpose_16u, transpose_8uC3, transpose_32s, 0, transpose_16uC3, 0,  // entries 0..7
+    transpose_32sC2, 0, 0, 0, transpose_32sC3, 0, 0, 0, transpose_32sC4,  // entries 8..16
+    0, 0, 0, 0, 0, 0, 0, transpose_32sC6, 0, 0, 0, 0, 0, 0, 0, transpose_32sC8  // entries 17..32
+};
+
+static TransposeInplaceFunc transposeInplaceTab[] =  // same layout as transposeTab, for the in-place workers
+{
+    0, transposeI_8u, transposeI_16u, transposeI_8uC3, transposeI_32s, 0, transposeI_16uC3, 0,  // entries 0..7
+    transposeI_32sC2, 0, 0, 0, transposeI_32sC3, 0, 0, 0, transposeI_32sC4,  // entries 8..16
+    0, 0, 0, 0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8  // entries 17..32
+};
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_transpose( InputArray _src, OutputArray _dst )  // OpenCL transpose; false means the kernel could not be used or did not run
+{
+    const ocl::Device & dev = ocl::Device::getDefault();
+    const int TILE_DIM = 32, BLOCK_ROWS = 8;  // also used as the default local work size { TILE_DIM, BLOCK_ROWS }
+    int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
+        rowsPerWI = dev.isIntel() ? 4 : 1;  // 4 on Intel devices, 1 elsewhere; passed to the kernel as -D rowsPerWI
+
+    UMat src = _src.getUMat();
+    _dst.create(src.cols, src.rows, type);  // destination gets the swapped dimensions
+    UMat dst = _dst.getUMat();
+
+    String kernelName("transpose");
+    bool inplace = dst.u == src.u;  // source and destination share the same u handle
+
+    if (inplace)
+    {
+        CV_Assert(dst.cols == dst.rows);  // the in-place kernel is only used for square matrices
+        kernelName += "_inplace";
+    }
+    else
+    {
+        // check required local memory size
+        size_t required_local_memory = (size_t) TILE_DIM*(TILE_DIM+1)*CV_ELEM_SIZE(type);  // TILE_DIM x (TILE_DIM+1) elements
+        if (required_local_memory > ocl::Device::getDefault().localMemSize())
+            return false;
+    }
+
+    ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
+                  format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d%s",
+                         ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
+                         cn, TILE_DIM, BLOCK_ROWS, rowsPerWI, inplace ? " -D INPLACE" : ""));
+    if (k.empty())  // no usable kernel object
+        return false;
+
+    if (inplace)
+        k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows);
+    else
+        k.args(ocl::KernelArg::ReadOnly(src),
+               ocl::KernelArg::WriteOnlyNoSize(dst));
+
+    size_t localsize[2]  = { TILE_DIM, BLOCK_ROWS };
+    size_t globalsize[2] = { (size_t)src.cols, inplace ? ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI : (divUp((size_t)src.rows, TILE_DIM) * BLOCK_ROWS) };
+
+    if (inplace && dev.isIntel())  // Intel in-place case overrides the local size: 16 x (maxWorkGroupSize / 16)
+    {
+        localsize[0] = 16;
+        localsize[1] = dev.maxWorkGroupSize() / localsize[0];
+    }
+
+    return k.run(2, globalsize, localsize, false);
+}
+
+#endif
+
+#ifdef HAVE_IPP
+static bool ipp_transpose( Mat &src, Mat &dst )  // IPP transpose; true only when an IPP routine was selected and returned a status >= 0
+{
+    CV_INSTRUMENT_REGION_IPP();
+
+    int type = src.type();
+    typedef IppStatus (CV_STDCALL * IppiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize);
+    typedef IppStatus (CV_STDCALL * IppiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize);
+    IppiTranspose ippiTranspose = 0;  // out-of-place routine, if one is selected below
+    IppiTransposeI ippiTranspose_I = 0;  // in-place routine, if one is selected below
+
+    if (dst.data == src.data && dst.cols == dst.rows)  // same buffer and square: choose among the ...IR routines
+    {
+        CV_SUPPRESS_DEPRECATED_START
+        ippiTranspose_I =
+            type == CV_8UC1 ? (IppiTransposeI)ippiTranspose_8u_C1IR :
+            type == CV_8UC3 ? (IppiTransposeI)ippiTranspose_8u_C3IR :
+            type == CV_8UC4 ? (IppiTransposeI)ippiTranspose_8u_C4IR :
+            type == CV_16UC1 ? (IppiTransposeI)ippiTranspose_16u_C1IR :
+            type == CV_16UC3 ? (IppiTransposeI)ippiTranspose_16u_C3IR :
+            type == CV_16UC4 ? (IppiTransposeI)ippiTranspose_16u_C4IR :
+            type == CV_16SC1 ? (IppiTransposeI)ippiTranspose_16s_C1IR :
+            type == CV_16SC3 ? (IppiTransposeI)ippiTranspose_16s_C3IR :
+            type == CV_16SC4 ? (IppiTransposeI)ippiTranspose_16s_C4IR :
+            type == CV_32SC1 ? (IppiTransposeI)ippiTranspose_32s_C1IR :
+            type == CV_32SC3 ? (IppiTransposeI)ippiTranspose_32s_C3IR :
+            type == CV_32SC4 ? (IppiTransposeI)ippiTranspose_32s_C4IR :
+            type == CV_32FC1 ? (IppiTransposeI)ippiTranspose_32f_C1IR :
+            type == CV_32FC3 ? (IppiTransposeI)ippiTranspose_32f_C3IR :
+            type == CV_32FC4 ? (IppiTransposeI)ippiTranspose_32f_C4IR : 0;  // 0: no in-place routine for this type
+        CV_SUPPRESS_DEPRECATED_END
+    }
+    else  // otherwise choose among the ...R routines
+    {
+        ippiTranspose =
+            type == CV_8UC1 ? (IppiTranspose)ippiTranspose_8u_C1R :
+            type == CV_8UC3 ? (IppiTranspose)ippiTranspose_8u_C3R :
+            type == CV_8UC4 ? (IppiTranspose)ippiTranspose_8u_C4R :
+            type == CV_16UC1 ? (IppiTranspose)ippiTranspose_16u_C1R :
+            type == CV_16UC3 ? (IppiTranspose)ippiTranspose_16u_C3R :
+            type == CV_16UC4 ? (IppiTranspose)ippiTranspose_16u_C4R :
+            type == CV_16SC1 ? (IppiTranspose)ippiTranspose_16s_C1R :
+            type == CV_16SC3 ? (IppiTranspose)ippiTranspose_16s_C3R :
+            type == CV_16SC4 ? (IppiTranspose)ippiTranspose_16s_C4R :
+            type == CV_32SC1 ? (IppiTranspose)ippiTranspose_32s_C1R :
+            type == CV_32SC3 ? (IppiTranspose)ippiTranspose_32s_C3R :
+            type == CV_32SC4 ? (IppiTranspose)ippiTranspose_32s_C4R :
+            type == CV_32FC1 ? (IppiTranspose)ippiTranspose_32f_C1R :
+            type == CV_32FC3 ? (IppiTranspose)ippiTranspose_32f_C3R :
+            type == CV_32FC4 ? (IppiTranspose)ippiTranspose_32f_C4R : 0;  // 0: no out-of-place routine for this type
+    }
+
+    IppiSize roiSize = { src.cols, src.rows };  // region of interest given in source dimensions
+    if (ippiTranspose != 0)
+    {
+        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, roiSize) >= 0)
+            return true;
+    }
+    else if (ippiTranspose_I != 0)
+    {
+        if (CV_INSTRUMENT_FUN_IPP(ippiTranspose_I, dst.ptr(), (int)dst.step, roiSize) >= 0)
+            return true;
+    }
+    return false;  // no routine selected, or the selected routine returned a negative status
+}
+#endif
+
+
+void transpose( InputArray _src, OutputArray _dst )  // writes the transpose of _src into _dst
+{
+    CV_INSTRUMENT_REGION();
+
+    int type = _src.type(), esz = CV_ELEM_SIZE(type);
+    CV_Assert( _src.dims() <= 2 && esz <= 32 );  // esz later indexes transposeTab / transposeInplaceTab (entries 0..32)
+
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_transpose(_src, _dst))
+
+    Mat src = _src.getMat();
+    if( src.empty() )  // empty input: release the output and stop
+    {
+        _dst.release();
+        return;
+    }
+
+    _dst.create(src.cols, src.rows, src.type());  // request an output with swapped dimensions
+    Mat dst = _dst.getMat();
+
+    // handle the case of single-column/single-row matrices, stored in STL vectors.
+    if( src.rows != dst.cols || src.cols != dst.rows )  // create() did not yield the swapped shape
+    {
+        CV_Assert( src.size() == dst.size() && (src.cols == 1 || src.rows == 1) );
+        src.copyTo(dst);  // same shape and a single row or column: a plain copy is the result
+        return;
+    }
+
+    CV_IPP_RUN_FAST(ipp_transpose(src, dst))  // NOTE(review): presumably returns early when ipp_transpose succeeds -- confirm against the macro
+
+    if( dst.data == src.data )  // source and destination share a buffer: in-place worker
+    {
+        TransposeInplaceFunc func = transposeInplaceTab[esz];
+        CV_Assert( func != 0 );  // a 0 table entry means this element size has no worker
+        CV_Assert( dst.cols == dst.rows );  // in-place requires a square matrix
+        func( dst.ptr(), dst.step, dst.rows );
+    }
+    else
+    {
+        TransposeFunc func = transposeTab[esz];
+        CV_Assert( func != 0 );  // a 0 table entry means this element size has no worker
+        func( src.ptr(), src.step, dst.ptr(), dst.step, src.size() );
+    }
+}
+
+
+#if CV_SIMD128
+template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    typedef typename V::lane_type T;  // scalar lane type of the vector type V
+    int end = (int)(size.width*esz);  // size.width * esz, used as the row length in bytes
+    int width = (end + 1)/2;  // byte offset where the left-hand cursor stops (half the row, rounded up)
+    int width_1 = width & -v_uint8x16::nlanes;  // width rounded down to a multiple of nlanes (assumes nlanes is a power of two)
+    int i, j;  // i: byte offset from the row start; j: byte offset just past the right-hand chunk
+
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T)>(src, dst));
+#endif
+
+    for( ; size.height--; src += sstep, dst += dstep )  // one row per iteration
+    {
+        for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )  // vector part
+        {
+            V t0, t1;
+
+            t0 = v_load((T*)((uchar*)src + i));  // chunk starting at offset i
+            t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));  // chunk ending at offset j
+            t0 = v_reverse(t0);  // NOTE(review): presumably reverses lane order within the vector -- confirm
+            t1 = v_reverse(t1);
+            v_store((T*)(dst + j - v_uint8x16::nlanes), t0);  // left chunk goes to the right side ...
+            v_store((T*)(dst + i), t1);  // ... and the right chunk to the left side
+        }
+        if (isAligned<sizeof(T)>(src, dst))  // remaining elements: whole-T loads and stores
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                T t0, t1;
+
+                t0 = *((T*)((uchar*)src + i));
+                t1 = *((T*)((uchar*)src + j - sizeof(T)));
+                *((T*)(dst + j - sizeof(T))) = t0;
+                *((T*)(dst + i)) = t1;
+            }
+        }
+        else  // same swap performed one byte at a time
+        {
+            for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
+            {
+                for (int k = 0; k < (int)sizeof(T); k++)  // k: byte index within one T
+                {
+                    uchar t0, t1;
+
+                    t0 = *((uchar*)src + i + k);
+                    t1 = *((uchar*)src + j + k - sizeof(T));
+                    *(dst + j + k - sizeof(T)) = t0;
+                    *(dst + i + k) = t1;
+                }
+            }
+        }
+    }
+}
+
+template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+    int end = (int)(size.width*esz);  // size.width * esz, used as the row length in bytes
+    int width = (end + 1)/2;  // byte offset where the left-hand cursor stops (half the row, rounded up)
+
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T1)>(src, dst));
+    CV_Assert(isAligned<sizeof(T2)>(src, dst));
+#endif
+
+    for( ; size.height--; src += sstep, dst += dstep )  // one row per iteration
+    {
+        for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )  // each step moves one T1+T2 unit from each side
+        {
+            T1 t0, t1;
+            T2 t2, t3;
+
+            t0 = *((T1*)((uchar*)src + i));  // left unit: T1 part
+            t2 = *((T2*)((uchar*)src + i + sizeof(T1)));  // left unit: T2 part
+            t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));  // right unit: T1 part
+            t3 = *((T2*)((uchar*)src + j - sizeof(T2)));  // right unit: T2 part
+            *((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;  // left unit written to the right side, internal T1/T2 order kept
+            *((T2*)(dst + j - sizeof(T2))) = t2;
+            *((T1*)(dst + i)) = t1;  // right unit written to the left side
+            *((T2*)(dst + i + sizeof(T1))) = t3;
+        }
+    }
+}
+#endif
+
+static void
+flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
+{
+#if CV_SIMD
+#if CV_STRONG_ALIGNMENT
+    size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
+#endif
+    if (esz == 2 * v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
+            {
+#if CV_SIMD256
+                v_uint8x32 t0, t1;
+
+                t0 = v256_load((uchar*)src + i);
+                t1 = v256_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+#else
+                v_uint8x16 t0, t1, t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
+                t2 = v_load((uchar*)src + j);
+                t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
+                v_store(dst + j, t0);
+                v_store(dst + j + v_uint8x16::nlanes, t1);
+                v_store(dst + i, t2);
+                v_store(dst + i + v_uint8x16::nlanes, t3);
+#endif
+            }
+        }
+    }
+    else if (esz == v_uint8x16::nlanes)
+    {
+        int end = (int)(size.width*esz);
+        int width = end/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
+            {
+                v_uint8x16 t0, t1;
+
+                t0 = v_load((uchar*)src + i);
+                t1 = v_load((uchar*)src + j);
+                v_store(dst + j, t0);
+                v_store(dst + i, t1);
+            }
+        }
+    }
+    else if (esz == 8
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(uint64)>(alignmentMark)
+#endif
+    )
+    {
+        flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 4
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(unsigned)>(alignmentMark)
+#endif
+    )
+    {
+        flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 2
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(ushort)>(alignmentMark)
+#endif
+    )
+    {
+        flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 1)
+    {
+        flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 24
+#if CV_STRONG_ALIGNMENT
+            && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+    )
+    {
+        int end = (int)(size.width*esz);
+        int width = (end + 1)/2;
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
+            {
+                v_uint8x16 t0, t1;
+                uint64_t t2, t3;
+
+                t0 = v_load((uchar*)src + i);
+                t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
+                t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
+                t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
+                v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
+                *((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
+                v_store(dst + i, t1);
+                *((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
+            }
+        }
+    }
+#if !CV_STRONG_ALIGNMENT
+    else if (esz == 12)
+    {
+        flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 6)
+    {
+        flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
+    }
+    else if (esz == 3)
+    {
+        flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
+    }
+#endif
+    else
+#endif // CV_SIMD
+    {
+        int i, j, limit = (int)(((size.width + 1)/2)*esz);
+        AutoBuffer<int> _tab(size.width*esz);
+        int* tab = _tab.data();
+
+        for( i = 0; i < size.width; i++ )
+            for( size_t k = 0; k < esz; k++ )
+                tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
+
+        for( ; size.height--; src += sstep, dst += dstep )
+        {
+            for( i = 0; i < limit; i++ )
+            {
+                j = tab[i];
+                uchar t0 = src[i], t1 = src[j];
+                dst[i] = t1; dst[j] = t0;
+            }
+        }
+    }
+}
+
+static void  // flipVert: writes source row y to destination row (height-1-y) and vice versa
+flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz )
+{
+    const uchar* src1 = src0 + (size.height - 1)*sstep;  // last source row
+    uchar* dst1 = dst0 + (size.height - 1)*dstep;  // last destination row
+    size.width *= (int)esz;  // from here on, width is the row length in bytes
+
+    for( int y = 0; y < (size.height + 1)/2; y++, src0 += sstep, src1 -= sstep,
+                                                  dst0 += dstep, dst1 -= dstep )
+    {  // top pointers walk down, bottom pointers walk up; for odd heights the middle row is paired with itself
+        int i = 0;  // byte offset within the row, carried across all the loops below
+#if CV_SIMD
+#if CV_STRONG_ALIGNMENT
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))  // int-typed vector access only when all four row pointers are int-aligned
+#endif
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_int32 t0 = vx_load((int*)(src0 + i));  // both rows are read before either is written, so src == dst still works
+                v_int32 t1 = vx_load((int*)(src1 + i));
+                vx_store((int*)(dst0 + i), t1);
+                vx_store((int*)(dst1 + i), t0);
+            }
+        }
+#if CV_STRONG_ALIGNMENT
+        else  // unaligned rows: same swap using byte-typed vector loads/stores
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_uint8 t0 = vx_load(src0 + i);
+                v_uint8 t1 = vx_load(src1 + i);
+                vx_store(dst0 + i, t1);
+                vx_store(dst1 + i, t0);
+            }
+        }
+#endif
+#endif
+
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))  // scalar int path: continues from wherever i was left above
+        {
+            for( ; i <= size.width - 16; i += 16 )  // 16 bytes per iteration, swapped as four ints
+            {
+                int t0 = ((int*)(src0 + i))[0];
+                int t1 = ((int*)(src1 + i))[0];
+
+                ((int*)(dst0 + i))[0] = t1;
+                ((int*)(dst1 + i))[0] = t0;
+
+                t0 = ((int*)(src0 + i))[1];
+                t1 = ((int*)(src1 + i))[1];
+
+                ((int*)(dst0 + i))[1] = t1;
+                ((int*)(dst1 + i))[1] = t0;
+
+                t0 = ((int*)(src0 + i))[2];
+                t1 = ((int*)(src1 + i))[2];
+
+                ((int*)(dst0 + i))[2] = t1;
+                ((int*)(dst1 + i))[2] = t0;
+
+                t0 = ((int*)(src0 + i))[3];
+                t1 = ((int*)(src1 + i))[3];
+
+                ((int*)(dst0 + i))[3] = t1;
+                ((int*)(dst1 + i))[3] = t0;
+            }
+
+            for( ; i <= size.width - 4; i += 4 )  // then one int (4 bytes) at a time
+            {
+                int t0 = ((int*)(src0 + i))[0];
+                int t1 = ((int*)(src1 + i))[0];
+
+                ((int*)(dst0 + i))[0] = t1;
+                ((int*)(dst1 + i))[0] = t0;
+            }
+        }
+
+        for( ; i < size.width; i++ )  // byte-wise tail; also the whole row when nothing above ran
+        {
+            uchar t0 = src0[i];
+            uchar t1 = src1[i];
+
+            dst0[i] = t1;
+            dst1[i] = t0;
+        }
+    }
+}
+
+#ifdef HAVE_OPENCL
+
+enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
+
+static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )  // OpenCL flip; returns false when no kernel can be used for this input
+{
+    CV_Assert(flipCode >= -1 && flipCode <= 1);  // only -1 (both axes), 0 (rows) and 1 (columns) are accepted here
+
+    const ocl::Device & dev = ocl::Device::getDefault();
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
+
+    bool doubleSupport = dev.doubleFPConfig() > 0;
+    if (!doubleSupport && depth == CV_64F)  // 64F data on a device without double support: vector width falls back to cn
+        kercn = cn;
+
+    if (cn > 4)  // more than 4 channels is not handled by this path
+        return false;
+
+    const char * kernelName;
+    if (flipCode == 0)
+        kernelName = "arithm_flip_rows", flipType = FLIP_ROWS;
+    else if (flipCode > 0)
+        kernelName = "arithm_flip_cols", flipType = FLIP_COLS;
+    else
+        kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
+
+    int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;  // passed to the kernel as PIX_PER_WI_Y: 4 on Intel GPUs, 1 elsewhere
+    kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;  // 3-channel data with a column-involving flip uses kercn == cn; otherwise at least cn
+
+    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
+        format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
+                kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
+                kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
+    if (k.empty())  // kernel could not be created
+        return false;
+
+    Size size = _src.size();
+    _dst.create(size, type);
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+
+    int cols = size.width * cn / kercn, rows = size.height;  // cols counts kercn-wide vectors per row
+    cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols;  // halved (rounded up) only for the pure column flip
+    rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows;  // halved (rounded up) for row flip and for both-axes flip
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src),
+           ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols);
+
+    size_t maxWorkGroupSize = dev.maxWorkGroupSize();
+    CV_Assert(maxWorkGroupSize % 4 == 0);  // localsize below divides the work-group size by 4
+
+    size_t globalsize[2] = { (size_t)cols, ((size_t)rows + pxPerWIy - 1) / pxPerWIy },
+            localsize[2] = { maxWorkGroupSize / 4, 4 };
+    return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false);  // explicit local size only for column flip on non-Intel devices
+}
+
+#endif
+
+#if defined HAVE_IPP
+static bool ipp_flip(Mat &src, Mat &dst, int flip_mode)  // IPP-backed flip via iwiMirror; returns false when the IPP path is skipped or fails
+{
+#ifdef HAVE_IPP_IW
+    CV_INSTRUMENT_REGION_IPP();
+
+    // Details: https://github.com/opencv/opencv/issues/12943
+    if (flip_mode <= 0 /* swap rows */
+        && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42
+        && (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/
+    )
+        return false;  // row-swapping flips of >= 0x80000000 bytes are not sent to IPP unless top features == SSE42
+
+    IppiAxis ippMode;  // flip_mode < 0 -> both axes, == 0 -> ippAxsHorizontal, > 0 -> ippAxsVertical
+    if(flip_mode < 0)
+        ippMode = ippAxsBoth;
+    else if(flip_mode == 0)
+        ippMode = ippAxsHorizontal;
+    else
+        ippMode = ippAxsVertical;
+
+    try
+    {
+        ::ipp::IwiImage iwSrc = ippiGetImage(src);
+        ::ipp::IwiImage iwDst = ippiGetImage(dst);
+
+        CV_INSTRUMENT_FUN_IPP(::ipp::iwiMirror, iwSrc, iwDst, ippMode);
+    }
+    catch(const ::ipp::IwException &)  // an IPP wrapper exception is reported to the caller as plain failure
+    {
+        return false;
+    }
+
+    return true;
+#else
+    CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(flip_mode);  // without HAVE_IPP_IW this function always reports failure
+    return false;
+#endif
+}
+#endif
+
+
+void flip( InputArray _src, OutputArray _dst, int flip_mode )  // flip_mode: 0 swaps rows, > 0 swaps columns, < 0 does both
+{
+    CV_INSTRUMENT_REGION();
+
+    CV_Assert( _src.dims() <= 2 );  // arrays with more than 2 dimensions are rejected
+    Size size = _src.size();
+
+    if (flip_mode < 0)  // a both-axes flip of a single column/row reduces to a one-axis flip
+    {
+        if (size.width == 1)
+            flip_mode = 0;
+        if (size.height == 1)
+            flip_mode = 1;
+    }
+
+    if ((size.width == 1 && flip_mode > 0) ||
+        (size.height == 1 && flip_mode == 0))
+    {
+        return _src.copyTo(_dst);  // flipping along an axis of length 1 changes nothing, so just copy
+    }
+
+    CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src, _dst, flip_mode))  // NOTE(review): macro defined elsewhere; presumably returns here when ocl_flip succeeds - confirm
+
+    Mat src = _src.getMat();
+    int type = src.type();
+    _dst.create( size, type );
+    Mat dst = _dst.getMat();
+
+    CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));  // NOTE(review): macro defined elsewhere; presumably returns here when ipp_flip succeeds - confirm
+
+    size_t esz = CV_ELEM_SIZE(type);  // bytes per element, all channels included
+
+    if( flip_mode <= 0 )  // row swap first for both the rows-only and the both-axes cases
+        flipVert( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
+    else
+        flipHoriz( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
+
+    if( flip_mode < 0 )  // both axes: finish with an in-place column swap on dst
+        flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz );
+}
+
+void rotate(InputArray _src, OutputArray _dst, int rotateMode)  // rotation by a multiple of 90 degrees, built from transpose() and flip()
+{
+    CV_Assert(_src.dims() <= 2);  // arrays with more than 2 dimensions are rejected
+
+    switch (rotateMode)
+    {
+    case ROTATE_90_CLOCKWISE:  // transpose, then flip with code 1 (column swap)
+        transpose(_src, _dst);
+        flip(_dst, _dst, 1);
+        break;
+    case ROTATE_180:  // single flip with code -1 (both axes)
+        flip(_src, _dst, -1);
+        break;
+    case ROTATE_90_COUNTERCLOCKWISE:  // transpose, then flip with code 0 (row swap)
+        transpose(_src, _dst);
+        flip(_dst, _dst, 0);
+        break;
+    default:  // any other rotateMode: nothing is done and _dst is left untouched
+        break;
+    }
+}
+
+}  // namespace
diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp
index 0d439759cc..53e0d24470 100644
--- a/modules/core/src/matrix_wrap.cpp
+++ b/modules/core/src/matrix_wrap.cpp
@@ -316,6 +316,7 @@ void _InputArray::getUMatVector(std::vector<UMat>& umv) const
 
 cuda::GpuMat _InputArray::getGpuMat() const
 {
+#ifdef HAVE_CUDA
     int k = kind();
 
     if (k == CUDA_GPU_MAT)
@@ -339,14 +340,22 @@ cuda::GpuMat _InputArray::getGpuMat() const
         return cuda::GpuMat();
 
     CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::HostMem");
+#else
+    CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
 }
 void _InputArray::getGpuMatVector(std::vector<cuda::GpuMat>& gpumv) const
 {
+#ifdef HAVE_CUDA
     int k = kind();
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
         gpumv = *(std::vector<cuda::GpuMat>*)obj;
     }
+#else
+    CV_UNUSED(gpumv);
+    CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
 }
 ogl::Buffer _InputArray::getOGlBuffer() const
 {
@@ -457,11 +466,15 @@ Size _InputArray::size(int i) const
 
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
         if (i < 0)
             return vv.empty() ? Size() : Size((int)vv.size(), 1);
         CV_Assert(i < (int)vv.size());
         return vv[i].size();
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == STD_VECTOR_UMAT )
@@ -795,6 +808,7 @@ int _InputArray::type(int i) const
 
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
         if (vv.empty())
         {
@@ -803,6 +817,9 @@ int _InputArray::type(int i) const
         }
         CV_Assert(i < (int)vv.size());
         return vv[i >= 0 ? i : 0].type();
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == OPENGL_BUFFER )
@@ -1164,22 +1181,34 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
     {
         CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz);
         CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
+#ifdef HAVE_CUDA
         ((cuda::GpuMat*)obj)->create(_sz, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == _sz);
         CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
+#ifdef HAVE_OPENGL
         ((ogl::Buffer*)obj)->create(_sz, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
+#endif
     }
     if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == _sz);
         CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
+#ifdef HAVE_CUDA
         ((cuda::HostMem*)obj)->create(_sz, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     int sizes[] = {_sz.height, _sz.width};
     create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
@@ -1206,22 +1235,34 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
     {
         CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
+#ifdef HAVE_CUDA
         ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
+#ifdef HAVE_OPENGL
         ((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
+#endif
     }
     if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
         CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
+#ifdef HAVE_CUDA
         ((cuda::HostMem*)obj)->create(_rows, _cols, mtype);
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     int sizes[] = {_rows, _cols};
     create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
@@ -1644,20 +1685,32 @@ void _OutputArray::release() const
 
     if( k == CUDA_GPU_MAT )
     {
+#ifdef HAVE_CUDA
         ((cuda::GpuMat*)obj)->release();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == CUDA_HOST_MEM )
     {
+#ifdef HAVE_CUDA
         ((cuda::HostMem*)obj)->release();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
 
     if( k == OPENGL_BUFFER )
     {
+#ifdef HAVE_OPENGL
         ((ogl::Buffer*)obj)->release();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
+#endif
     }
 
     if( k == NONE )
@@ -1688,8 +1741,12 @@ void _OutputArray::release() const
     }
     if (k == STD_VECTOR_CUDA_GPU_MAT)
     {
+#ifdef HAVE_CUDA
         ((std::vector<cuda::GpuMat>*)obj)->clear();
         return;
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
@@ -1797,9 +1854,13 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
         ((UMat*)obj)->setTo(arr, mask);
     else if( k == CUDA_GPU_MAT )
     {
+#ifdef HAVE_CUDA
         Mat value = arr.getMat();
         CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::CUDA_GPU_MAT) );
         ((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>(value.ptr<double>())), mask);
+#else
+        CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
+#endif
     }
     else
         CV_Error(Error::StsNotImplemented, "");
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index b95cd99bd8..601082783e 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -205,13 +205,10 @@ int normL1_(const uchar* a, const uchar* b, int n)
     return d;
 }
 
-}} //cv::hal
+} //cv::hal
 
 //==================================================================================================
 
-namespace cv
-{
-
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
@@ -591,12 +588,10 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
     CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
 #endif
     return false;
-}
-#endif
+}  // ipp_norm()
+#endif  // HAVE_IPP
 
-} // cv::
-
-double cv::norm( InputArray _src, int normType, InputArray _mask )
+double norm( InputArray _src, int normType, InputArray _mask )
 {
     CV_INSTRUMENT_REGION();
 
@@ -769,9 +764,6 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
 //==================================================================================================
 
 #ifdef HAVE_OPENCL
-
-namespace cv {
-
 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
 {
 #ifdef __ANDROID__
@@ -826,15 +818,10 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
         result /= (s2 + DBL_EPSILON);
 
     return true;
-}
-
-}
-
-#endif
+}  // ocl_norm()
+#endif  // HAVE_OPENCL
 
 #ifdef HAVE_IPP
-namespace cv
-{
 static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
 {
     CV_INSTRUMENT_REGION_IPP();
@@ -1060,12 +1047,11 @@ static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArra
     CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
 #endif
     return false;
-}
-}
-#endif
+}  // ipp_norm
+#endif  // HAVE_IPP
 
 
-double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
+double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
 {
     CV_INSTRUMENT_REGION();
 
@@ -1234,12 +1220,12 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
     return result.d;
 }
 
-cv::Hamming::ResultType cv::Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
+cv::Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
 {
     return cv::hal::normHamming(a, b, size);
 }
 
-double cv::PSNR(InputArray _src1, InputArray _src2)
+double PSNR(InputArray _src1, InputArray _src2)
 {
     CV_INSTRUMENT_REGION();
 
@@ -1249,3 +1235,141 @@ double cv::PSNR(InputArray _src1, InputArray _src2)
     double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
     return 20*log10(255./(diff+DBL_EPSILON));
 }
+
+
+#ifdef HAVE_OPENCL
+static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,  // OpenCL path: dst = src*scale + delta, optionally restricted by _mask
+                           double scale, double delta )
+{
+    UMat src = _src.getUMat();
+
+    if( _mask.empty() )  // no mask: a plain convertTo does the whole job
+        src.convertTo( _dst, dtype, scale, delta );
+    else if (src.channels() <= 4)  // masked, up to 4 channels: dedicated "normalizek" kernel (with shortcuts below)
+    {
+        const ocl::Device & dev = ocl::Device::getDefault();
+
+        int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
+                ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
+                rowsPerWI = dev.isIntel() ? 4 : 1;
+
+        float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);  // the kernel receives scale/delta as float
+        bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
+                haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
+                haveDelta = std::fabs(delta) > DBL_EPSILON,
+                doubleSupport = dev.doubleFPConfig() > 0;
+
+        if (!haveScale && !haveDelta && stype == dtype)  // scale ~ 1, delta ~ 0, same type: just a masked copy
+        {
+            _src.copyTo(_dst, _mask);
+            return true;
+        }
+        if (haveZeroScale)  // scale ~ 0: every masked element becomes delta
+        {
+            _dst.setTo(Scalar(delta), _mask);
+            return true;
+        }
+
+        if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)  // 64F involved but the device has no double support
+            return false;
+
+        char cvt[2][40];  // scratch buffers filled by ocl::convertTypeStr
+        String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
+                             " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
+                             ocl::typeToStr(stype), ocl::typeToStr(dtype),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
+                             rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+                             doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                             haveScale ? " -D HAVE_SCALE" : "",
+                             haveDelta ? " -D HAVE_DELTA" : "",
+                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
+
+        ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
+        if (k.empty())  // kernel could not be created
+            return false;
+
+        UMat mask = _mask.getUMat(), dst = _dst.getUMat();
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
+                dstarg = ocl::KernelArg::ReadWrite(dst);
+
+        if (haveScale)  // the argument list mirrors the HAVE_SCALE / HAVE_DELTA defines passed in opts
+        {
+            if (haveDelta)
+                k.args(srcarg, maskarg, dstarg, fscale, fdelta);
+            else
+                k.args(srcarg, maskarg, dstarg, fscale);
+        }
+        else
+        {
+            if (haveDelta)
+                k.args(srcarg, maskarg, dstarg, fdelta);
+            else
+                k.args(srcarg, maskarg, dstarg);
+        }
+
+        size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };  // rows are divided by rowsPerWI, rounded up
+        return k.run(2, globalsize, NULL, false);
+    }
+    else  // masked, more than 4 channels: convert into a temporary, then masked copy
+    {
+        UMat temp;
+        src.convertTo( temp, dtype, scale, delta );
+        temp.copyTo( _dst, _mask );
+    }
+
+    return true;
+}  // ocl_normalize
+#endif  // HAVE_OPENCL
+
+void normalize(InputArray _src, InputOutputArray _dst, double a, double b,  // range mode (CV_MINMAX) or norm mode (CV_L1 / CV_L2 / CV_C)
+               int norm_type, int rtype, InputArray _mask)
+{
+    CV_INSTRUMENT_REGION();
+
+    double scale = 1, shift = 0;  // the output is produced below as convertTo(..., scale, shift)
+    int type = _src.type(), depth = CV_MAT_DEPTH(type);
+
+    if( rtype < 0 )  // negative rtype: use dst's depth when its type is fixed, otherwise the source depth
+        rtype = _dst.fixedType() ? _dst.depth() : depth;
+
+    if( norm_type == CV_MINMAX )  // map the (masked) source range [smin, smax] onto [min(a,b), max(a,b)]
+    {
+        double smin = 0, smax = 0;
+        double dmin = MIN( a, b ), dmax = MAX( a, b );
+        minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
+        scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);  // scale becomes 0 when the source range is <= DBL_EPSILON
+        if( rtype == CV_32F )  // for float output, scale and shift are rounded through float
+        {
+            scale = (float)scale;
+            shift = (float)dmin - (float)(smin*scale);
+        }
+        else
+            shift = dmin - smin*scale;
+    }
+    else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )  // scale so the chosen norm becomes a; b is not used in this branch
+    {
+        scale = norm( _src, norm_type, _mask );
+        scale = scale > DBL_EPSILON ? a/scale : 0.;  // a norm <= DBL_EPSILON yields scale 0 instead of dividing by it
+        shift = 0;
+    }
+    else
+        CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
+
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
+
+    Mat src = _src.getMat();
+    if( _mask.empty() )  // no mask: convert straight into _dst
+        src.convertTo( _dst, rtype, scale, shift );
+    else  // mask given: convert into a temporary, then copy only the masked elements into _dst
+    {
+        Mat temp;
+        src.convertTo( temp, rtype, scale, shift );
+        temp.copyTo( _dst, _mask );
+    }
+}
+
+}  // namespace
diff --git a/modules/core/src/persistence_c.cpp b/modules/core/src/persistence_c.cpp
index 9ec70190df..904164c783 100644
--- a/modules/core/src/persistence_c.cpp
+++ b/modules/core/src/persistence_c.cpp
@@ -1378,48 +1378,6 @@ cvTypeOf( const void* struct_ptr )
 }
 
 
-/* universal functions */
-CV_IMPL void
-cvRelease( void** struct_ptr )
-{
-    CvTypeInfo* info;
-
-    if( !struct_ptr )
-        CV_Error( CV_StsNullPtr, "NULL double pointer" );
-
-    if( *struct_ptr )
-    {
-        info = cvTypeOf( *struct_ptr );
-        if( !info )
-            CV_Error( CV_StsError, "Unknown object type" );
-        if( !info->release )
-            CV_Error( CV_StsError, "release function pointer is NULL" );
-
-        info->release( struct_ptr );
-        *struct_ptr = 0;
-    }
-}
-
-
-void* cvClone( const void* struct_ptr )
-{
-    void* struct_copy = 0;
-    CvTypeInfo* info;
-
-    if( !struct_ptr )
-        CV_Error( CV_StsNullPtr, "NULL structure pointer" );
-
-    info = cvTypeOf( struct_ptr );
-    if( !info )
-        CV_Error( CV_StsError, "Unknown object type" );
-    if( !info->clone )
-        CV_Error( CV_StsError, "clone function pointer is NULL" );
-
-    struct_copy = info->clone( struct_ptr );
-    return struct_copy;
-}
-
-
 /* reads matrix, image, sequence, graph etc. */
 CV_IMPL void*
 cvRead( CvFileStorage* fs, CvFileNode* node, CvAttrList* list )
diff --git a/modules/core/src/rand.cpp b/modules/core/src/rand.cpp
index 8c66cdcc07..2ae5664245 100644
--- a/modules/core/src/rand.cpp
+++ b/modules/core/src/rand.cpp
@@ -867,6 +867,9 @@ void cv::randShuffle( InputOutputArray _dst, double iterFactor, RNG* _rng )
     func( dst, rng, iterFactor );
 }
 
+
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL void
 cvRandArr( CvRNG* _rng, CvArr* arr, int disttype, CvScalar param1, CvScalar param2 )
 {
@@ -884,6 +887,9 @@ CV_IMPL void cvRandShuffle( CvArr* arr, CvRNG* _rng, double iter_factor )
     cv::randShuffle( dst, iter_factor, &rng );
 }
 
+#endif  // OPENCV_EXCLUDE_C_API
+
+
 // Mersenne Twister random number generator.
 // Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c
 
diff --git a/modules/core/src/stat_c.cpp b/modules/core/src/stat_c.cpp
index d7355b9f94..8b6f0f09e4 100644
--- a/modules/core/src/stat_c.cpp
+++ b/modules/core/src/stat_c.cpp
@@ -5,6 +5,8 @@
 
 #include "precomp.hpp"
 
+#ifndef OPENCV_EXCLUDE_C_API
+
 CV_IMPL CvScalar cvSum( const CvArr* srcarr )
 {
     cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
@@ -117,3 +119,5 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
 
     return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
 }
+
+#endif  // OPENCV_EXCLUDE_C_API
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index f21cf7b7e2..936348f779 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -1259,88 +1259,6 @@ UMat UMat::t() const
     return m;
 }
 
-UMat UMat::inv(int method) const
-{
-    UMat m;
-    invert(*this, m, method);
-    return m;
-}
-
-UMat UMat::mul(InputArray m, double scale) const
-{
-    UMat dst;
-    multiply(*this, m, dst, scale);
-    return dst;
-}
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
-{
-    UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);
-
-    int type = src1.type(), depth = CV_MAT_DEPTH(type),
-            kercn = ocl::predictOptimalVectorWidth(src1, src2);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-
-    if ( !doubleSupport && depth == CV_64F )
-        return false;
-
-    int dbsize = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
-    int ddepth = std::max(CV_32F, depth);
-
-    int wgs2_aligned = 1;
-    while (wgs2_aligned < (int)wgs)
-        wgs2_aligned <<= 1;
-    wgs2_aligned >>= 1;
-
-    char cvt[40];
-    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
-                         "-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
-                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
-                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
-                         ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
-                         (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                         _src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
-                         _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
-    if (k.empty())
-        return false;
-
-    UMat db(1, dbsize, ddepth);
-
-    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
-            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
-            dbarg = ocl::KernelArg::PtrWriteOnly(db);
-
-    k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);
-
-    size_t globalsize = dbsize * wgs;
-    if (k.run(1, &globalsize, &wgs, false))
-    {
-        res = sum(db.getMat(ACCESS_READ))[0];
-        return true;
-    }
-    return false;
-}
-
-#endif
-
-double UMat::dot(InputArray m) const
-{
-    CV_INSTRUMENT_REGION();
-
-    CV_Assert(m.sameSize(*this) && m.type() == type());
-
-#ifdef HAVE_OPENCL
-    double r = 0;
-    CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)
-#endif
-
-    return getMat(ACCESS_READ).dot(m);
-}
-
 UMat UMat::zeros(int rows, int cols, int type)
 {
     return UMat(rows, cols, type, Scalar::all(0));
@@ -1371,18 +1289,6 @@ UMat UMat::ones(int ndims, const int* sz, int type)
     return UMat(ndims, sz, type, Scalar(1));
 }
 
-UMat UMat::eye(int rows, int cols, int type)
-{
-    return UMat::eye(Size(cols, rows), type);
-}
-
-UMat UMat::eye(Size size, int type)
-{
-    UMat m(size, type);
-    setIdentity(m);
-    return m;
-}
-
 }
 
 /* End of file. */

From 75ad74c893a02728821d8432fe73e89e35f49ec0 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 2 Mar 2021 23:56:27 +0000
Subject: [PATCH 03/10] ffmpeg/3.4: update FFmpeg wrapper 2021.03

- FFmpeg 3.4.8
---
 3rdparty/ffmpeg/ffmpeg.cmake | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/3rdparty/ffmpeg/ffmpeg.cmake b/3rdparty/ffmpeg/ffmpeg.cmake
index 531d301fa8..a1ada4eeaa 100644
--- a/3rdparty/ffmpeg/ffmpeg.cmake
+++ b/3rdparty/ffmpeg/ffmpeg.cmake
@@ -1,8 +1,8 @@
-# Binaries branch name: ffmpeg/3.4_20200907
-# Binaries were created for OpenCV: 03bee14372f5537daa56c62e771ec16181ca1f98
-ocv_update(FFMPEG_BINARIES_COMMIT "2a96257b743695a47f8012aab1ffb995a1dee8b4")
-ocv_update(FFMPEG_FILE_HASH_BIN32 "5e68a3ff82f43ac6524e50e448a34c9c")
-ocv_update(FFMPEG_FILE_HASH_BIN64 "205db629d893e7d4865fd1459807ff47")
+# Binaries branch name: ffmpeg/3.4_20210302
+# Binaries were created for OpenCV: 2ab1f3f166fccc3a01497209cc01c5cea44ff201
+ocv_update(FFMPEG_BINARIES_COMMIT "e99214251d9f3cde7c48abd46b2259bddc9885b6")
+ocv_update(FFMPEG_FILE_HASH_BIN32 "fad5ada9be36120bba8966709e7953a8")
+ocv_update(FFMPEG_FILE_HASH_BIN64 "650e2272728491923e566f784f79cfef")
 ocv_update(FFMPEG_FILE_HASH_CMAKE "3b90f67f4b429e77d3da36698cef700c")
 
 function(download_win_ffmpeg script_var)

From a42d4da003357751a3579bef1568dcbf803f8bb7 Mon Sep 17 00:00:00 2001
From: SamFC10 <njebastin10@gmail.com>
Date: Wed, 3 Mar 2021 22:42:47 +0530
Subject: [PATCH 04/10] Added Spatial Attention Module in Darknet Importer

---
 modules/dnn/src/darknet/darknet_io.cpp     | 31 ++++++++++++++++++++++
 modules/dnn/test/test_darknet_importer.cpp |  5 ++++
 2 files changed, 36 insertions(+)

diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index e3c978a8c0..4915538ff7 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -558,6 +558,29 @@ namespace cv {
                     fused_layer_names.push_back(last_layer);
                 }
 
+                void setSAM(int from)  // Appends a "sam" layer: an Eltwise product of the current last layer and layer `from`.
+                {
+                    cv::dnn::LayerParams eltwise_param;
+                    eltwise_param.name = "SAM-name";
+                    eltwise_param.type = "Eltwise";
+                    // Configure the Eltwise layer: "prod" operation with output_channels_mode "same".
+                    eltwise_param.set<std::string>("operation", "prod");
+                    eltwise_param.set<std::string>("output_channels_mode", "same");
+                    // Register the layer as "sam_<layer_id>" with two inputs: the previous layer and the `from` layer.
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("sam_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = eltwise_param.type;
+                    lp.layerParams = eltwise_param;
+                    lp.bottom_indexes.push_back(last_layer);
+                    lp.bottom_indexes.push_back(fused_layer_names.at(from));  // `from` indexes fused_layer_names directly
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+                    // Advance the layer counter and record this layer's name so later layers can refer to it.
+                    layer_id++;
+                    fused_layer_names.push_back(last_layer);
+                }
+
                 void setUpsample(int scaleFactor)
                 {
                     cv::dnn::LayerParams param;
@@ -837,6 +860,14 @@ namespace cv {
                         from = from < 0 ? from + layers_counter : from;
                         setParams.setScaleChannels(from);
                     }
+                    else if (layer_type == "sam")
+                    {
+                        std::string bottom_layer = getParam<std::string>(layer_params, "from", "");
+                        CV_Assert(!bottom_layer.empty());
+                        int from = std::atoi(bottom_layer.c_str());
+                        from = from < 0 ? from + layers_counter : from;
+                        setParams.setSAM(from);
+                    }
                     else if (layer_type == "upsample")
                     {
                         int scaleFactor = getParam<int>(layer_params, "stride", 1);
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 00638f83c5..8a633fa566 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -770,6 +770,11 @@ TEST_P(Test_Darknet_layers, relu)
     testDarknetLayer("relu");
 }
 
+TEST_P(Test_Darknet_layers, sam)
+{
+    testDarknetLayer("sam", true);
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets());
 
 }} // namespace

From 94533e12ebd6e131723a53e2d31e97986fea6e5b Mon Sep 17 00:00:00 2001
From: Liubov Batanina <piccione-mail@yandex.ru>
Date: Thu, 4 Mar 2021 13:05:01 +0300
Subject: [PATCH 05/10] Determine layout

---
 modules/dnn/src/ie_ngraph.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index 84b984ac97..aa3be70e05 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -769,8 +769,14 @@ static InferenceEngine::Layout estimateLayout(const Mat& m)
 {
     if (m.dims == 4)
         return InferenceEngine::Layout::NCHW;
+    else if (m.dims == 3)
+        return InferenceEngine::Layout::CHW;
     else if (m.dims == 2)
         return InferenceEngine::Layout::NC;
+    else if (m.dims == 1)
+        return InferenceEngine::Layout::C;
+    else if (m.dims == 5)
+        return InferenceEngine::Layout::NCDHW;
     else
         return InferenceEngine::Layout::ANY;
 }

From 125cc79c179f364eeecda72b24e6bb2da2f1bd1e Mon Sep 17 00:00:00 2001
From: APrigarina <ann73617@gmail.com>
Date: Thu, 4 Mar 2021 14:04:50 +0300
Subject: [PATCH 06/10] fix false positive detection

---
 modules/objdetect/src/qrcode.cpp | 38 +++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 449e6e6d32..929807b34e 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -235,9 +235,11 @@ vector<Vec3d> QRDetect::searchHorizontalLines()
 vector<Point2f> QRDetect::separateVerticalLines(const vector<Vec3d> &list_lines)
 {
     CV_TRACE_FUNCTION();
-
-    for (int coeff_epsilon = 1; coeff_epsilon < 10; coeff_epsilon++)
+    const double min_dist_between_points = 10.0;
+    const double max_ratio = 1.0;
+    for (int coeff_epsilon_i = 1; coeff_epsilon_i < 101; ++coeff_epsilon_i)
     {
+        const float coeff_epsilon = coeff_epsilon_i * 0.1f;
         vector<Point2f> point2f_result = extractVerticalLines(list_lines, eps_horizontal * coeff_epsilon);
         if (!point2f_result.empty())
         {
@@ -247,9 +249,23 @@ vector<Point2f> QRDetect::separateVerticalLines(const vector<Vec3d> &list_lines)
                     point2f_result, 3, labels,
                     TermCriteria(TermCriteria::EPS + TermCriteria::COUNT, 10, 0.1),
                     3, KMEANS_PP_CENTERS, centers);
-            if (compactness == 0)
+            double min_dist = std::numeric_limits<double>::max();
+            for (size_t i = 0; i < centers.size(); i++)
+            {
+                double dist = norm(centers[i] - centers[(i+1) % centers.size()]);
+                if (dist < min_dist)
+                {
+                    min_dist = dist;
+                }
+            }
+            if (min_dist < min_dist_between_points)
+            {
                 continue;
-            if (compactness > 0)
+            }
+            double mean_compactness = compactness / point2f_result.size();
+            double ratio = mean_compactness / min_dist;
+
+            if (ratio < max_ratio)
             {
                 return point2f_result;
             }
@@ -456,7 +472,6 @@ bool QRDetect::localization()
     vector<Point2f> list_lines_y = separateVerticalLines(list_lines_x);
     if( list_lines_y.empty() ) { return false; }
 
-    vector<Point2f> centers;
     Mat labels;
     kmeans(list_lines_y, 3, labels,
            TermCriteria( TermCriteria::EPS + TermCriteria::COUNT, 10, 0.1),
@@ -464,7 +479,7 @@ bool QRDetect::localization()
 
     fixationPoints(localization_points);
 
-    bool suare_flag = false, local_points_flag = false;
+    bool square_flag = false, local_points_flag = false;
     double triangle_sides[3];
     double triangle_perim, square_area, img_square_area;
     if (localization_points.size() == 3)
@@ -482,14 +497,14 @@ bool QRDetect::localization()
 
         if (square_area > (img_square_area * 0.2))
         {
-            suare_flag = true;
+            square_flag = true;
         }
     }
     else
     {
         local_points_flag = true;
     }
-    if ((suare_flag || local_points_flag) && purpose == SHRINKING)
+    if ((square_flag || local_points_flag) && purpose == SHRINKING)
     {
         localization_points.clear();
         bin_barcode = resized_bin_barcode.clone();
@@ -1970,6 +1985,13 @@ bool QRDecode::createSpline(vector<vector<Point2f> > &spline_lines)
             }
         }
     }
+    for (int i = 0; i < NUM_SIDES; i++)
+    {
+        if (spline_lines[i].size() == 0)
+        {
+            return false;
+        }
+    }
     return true;
 }
 

From 7894cd3c73df5b90139c54c664225b913c5f049c Mon Sep 17 00:00:00 2001
From: Anastasia Murzova <anastasia.murzova@xperience.ai>
Date: Sun, 28 Feb 2021 19:55:43 +0300
Subject: [PATCH 07/10] Aligned TF Reshape layer behaviour

---
 modules/dnn/src/tensorflow/tf_importer.cpp | 70 +++++++++++++++++-----
 modules/dnn/test/test_tf_importer.cpp      | 10 ++++
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index c03ac8a943..53d62fc9f7 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -295,6 +295,22 @@ DataLayout getDataLayout(
     return it != data_layouts.end() ? it->second : DATA_LAYOUT_UNKNOWN;
 }
 
+static
+bool hasAllOnes(const Mat &inputs, int startPos, int endPos)  // true iff every int in [startPos, endPos) is 1 or -1
+{
+    CV_CheckLE(inputs.dims, 2, "");
+    CV_CheckGE(startPos, 0, "");
+    CV_CheckLE(startPos, endPos, "");
+    CV_CheckLT((size_t)endPos, inputs.total(), "");
+    // Reject on the first element that is neither 1 nor -1; an empty range yields true.
+    for (int i = startPos; i < endPos; i++)
+    {
+        if (inputs.at<int>(i) != 1 && inputs.at<int>(i) != -1)  // must be `&&`: with `||` the test holds for every value
+            return false;
+    }
+    return true;
+}
+
 void setStrides(LayerParams &layerParams, const tensorflow::NodeDef &layer)
 {
     if (hasLayerAttr(layer, "strides"))
@@ -490,6 +506,9 @@ protected:
     std::map<String, Mat> sharedWeights;
 
     std::map<String, int> layer_id;
+
+private:
+    void addPermuteLayer(const int* order, const std::string& permName, Pin& inpId);
 };
 
 TFImporter::TFImporter(Net& net, const char *model, const char *config)
@@ -895,6 +914,17 @@ void TFImporter::populateNet()
     CV_LOG_DEBUG(NULL, "DNN/TF: ===================== Import completed =====================");
 }
 
+void TFImporter::addPermuteLayer(const int* order, const std::string& permName, Pin& inpId)  // Adds a "Permute" layer fed by inpId, then points inpId at it.
+{
+    LayerParams permLP;
+    permLP.set("order", DictValue::arrayInt<const int*>(order, 4));  // exactly 4 ints are read from `order`
+    CV_Assert(layer_id.find(permName) == layer_id.end());  // permName must not be registered yet
+    int permId = dstNet.addLayer(permName, "Permute", permLP);
+    layer_id[permName] = permId;
+    connect(layer_id, dstNet, inpId, permId, 0);
+    inpId = Pin(permName);  // callers continue from the new permute layer
+}
+
 void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
 {
     tensorflow::NodeDef layer = layer_;
@@ -1276,37 +1306,49 @@ void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
             if (value_id.find(layer.input(1)) != value_id.end())
             {
                 Mat newShape = getTensorContent(getConstBlob(layer, value_id, 1));
-                if (newShape.total() == 4)
+                int newShapeSize = newShape.total();
+                bool hasSwap = false;
+                if (newShapeSize == 4 && hasAllOnes(newShape, 0, 2))
                 {
                     // NHWC->NCHW
                     std::swap(*newShape.ptr<int32_t>(0, 2), *newShape.ptr<int32_t>(0, 3));
                     std::swap(*newShape.ptr<int32_t>(0, 1), *newShape.ptr<int32_t>(0, 2));
+                    hasSwap = true;
                 }
                 if (inpLayout == DATA_LAYOUT_NHWC)
                 {
-                    if (newShape.total() != 4 || newShape.at<int>(1) == 1)
+                    if (newShapeSize >= 2 || newShape.at<int>(1) == 1)
                     {
-                        LayerParams permLP;
                         int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
-                        permLP.set("order", DictValue::arrayInt<int*>(order, 4));
-
-                        std::string permName = name + "/nchw";
-                        CV_Assert(layer_id.find(permName) == layer_id.end());
-                        int permId = dstNet.addLayer(permName, "Permute", permLP);
-                        layer_id[permName] = permId;
-                        connect(layer_id, dstNet, inpId, permId, 0);
-                        inpId = Pin(permName);
-                        inpLayout = DATA_LAYOUT_NCHW;
+                        addPermuteLayer(order, name + "/nhwc", inpId);
+                        if (newShapeSize < 4)
+                        {
+                            inpLayout = DATA_LAYOUT_NCHW;
+                        }
+                        else
+                        {
+                            inpLayout = DATA_LAYOUT_NHWC;
+                        }
                     }
                 }
-                layerParams.set("dim", DictValue::arrayInt<int*>(newShape.ptr<int>(), newShape.total()));
+                layerParams.set("dim", DictValue::arrayInt<int*>(newShape.ptr<int>(), newShapeSize));
 
                 int id = dstNet.addLayer(name, "Reshape", layerParams);
                 layer_id[name] = id;
 
                 // one input only
                 connect(layer_id, dstNet, inpId, id, 0);
-                data_layouts[name] = newShape.total() == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
+                inpId = Pin(name);
+
+                if ((inpLayout == DATA_LAYOUT_NHWC || inpLayout == DATA_LAYOUT_UNKNOWN || inpLayout == DATA_LAYOUT_PLANAR) &&
+                    newShapeSize == 4 && !hasSwap)
+                {
+                    int order[] = {0, 3, 1, 2};  // Transform back to OpenCV's NCHW.
+                    addPermuteLayer(order, name + "/nchw", inpId);
+                    inpLayout = DATA_LAYOUT_NCHW;
+                }
+
+                data_layouts[name] = newShapeSize == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
             }
             else
             {
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 6163e89fa7..6a1a44f03a 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -457,6 +457,16 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten)
     runTensorFlowNet("unfused_flatten_unknown_batch");
 }
 
+TEST_P(Test_TensorFlow_layers, reshape_layer)
+{
+    runTensorFlowNet("reshape_layer");
+}
+
+TEST_P(Test_TensorFlow_layers, reshape_nchw)
+{
+    runTensorFlowNet("reshape_nchw");
+}
+
 TEST_P(Test_TensorFlow_layers, leaky_relu)
 {
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)

From 625d4fc8843435a505adeec752d5f17e1c153cef Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 5 Mar 2021 12:54:51 +0000
Subject: [PATCH 08/10] cmake: update Python linters handling

- exclude from getBuildInformation()
- fix pylint version
---
 CMakeLists.txt           | 12 ++++++------
 cmake/FindPylint.cmake   |  2 +-
 cmake/OpenCVPylint.cmake |  1 -
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65933cdeef..f6a2da5310 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1000,6 +1000,12 @@ if(COMMAND ocv_pylint_finalize)
   ocv_pylint_add_directory_recurse(${CMAKE_CURRENT_LIST_DIR}/samples/python/tutorial_code)
   ocv_pylint_finalize()
 endif()
+if(TARGET check_pylint)
+  message(STATUS "Registered 'check_pylint' target: using ${PYLINT_EXECUTABLE} (ver: ${PYLINT_VERSION}), checks: ${PYLINT_TOTAL_TARGETS}")
+endif()
+if(TARGET check_flake8)
+  message(STATUS "Registered 'check_flake8' target: using ${FLAKE8_EXECUTABLE} (ver: ${FLAKE8_VERSION})")
+endif()
 
 if(OPENCV_GENERATE_SETUPVARS)
   include(cmake/OpenCVGenSetupVars.cmake)
@@ -1633,12 +1639,6 @@ endif()
 
 status("")
 status("  Python (for build):"  PYTHON_DEFAULT_AVAILABLE THEN "${PYTHON_DEFAULT_EXECUTABLE}" ELSE NO)
-if(PYLINT_FOUND AND PYLINT_EXECUTABLE)
-  status("    Pylint:"  PYLINT_FOUND THEN "${PYLINT_EXECUTABLE} (ver: ${PYLINT_VERSION}, checks: ${PYLINT_TOTAL_TARGETS})" ELSE NO)
-endif()
-if(FLAKE8_FOUND AND FLAKE8_EXECUTABLE)
-  status("    Flake8:"  FLAKE8_FOUND THEN "${FLAKE8_EXECUTABLE} (ver: ${FLAKE8_VERSION})" ELSE NO)
-endif()
 
 # ========================== java ==========================
 if(BUILD_JAVA)
diff --git a/cmake/FindPylint.cmake b/cmake/FindPylint.cmake
index 5731ba493a..ef4b4394ff 100644
--- a/cmake/FindPylint.cmake
+++ b/cmake/FindPylint.cmake
@@ -16,7 +16,7 @@ if(PYLINT_EXECUTABLE AND NOT DEFINED PYLINT_VERSION)
   execute_process(COMMAND ${PYLINT_EXECUTABLE} --version RESULT_VARIABLE _result OUTPUT_VARIABLE PYLINT_VERSION_RAW)
   if(NOT _result EQUAL 0)
     ocv_clear_vars(PYLINT_EXECUTABLE PYLINT_VERSION)
-  elseif(PYLINT_VERSION_RAW MATCHES "pylint([^,]*) ([0-9\\.]+[0-9])")
+  elseif(PYLINT_VERSION_RAW MATCHES "pylint([^,\n]*) ([0-9\\.]+[0-9])")
     set(PYLINT_VERSION "${CMAKE_MATCH_2}")
   else()
     set(PYLINT_VERSION "unknown")
diff --git a/cmake/OpenCVPylint.cmake b/cmake/OpenCVPylint.cmake
index 50da730946..928926d340 100644
--- a/cmake/OpenCVPylint.cmake
+++ b/cmake/OpenCVPylint.cmake
@@ -122,7 +122,6 @@ function(ocv_pylint_finalize)
 
   list(LENGTH PYLINT_TARGET_ID __total)
   set(PYLINT_TOTAL_TARGETS "${__total}" CACHE INTERNAL "")
-  message(STATUS "Pylint: registered ${__total} targets. Build 'check_pylint' target to run checks (\"cmake --build . --target check_pylint\" or \"make check_pylint\")")
   configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/pylint.cmake.in" "${CMAKE_BINARY_DIR}/pylint.cmake" @ONLY)
 
   add_custom_target(check_pylint

From 640f188ca269c7cc7134c70725789e556a7a9733 Mon Sep 17 00:00:00 2001
From: Mradul Agrawal <69335152+theroyalpekka@users.noreply.github.com>
Date: Fri, 5 Mar 2021 19:25:52 +0530
Subject: [PATCH 09/10] Merge pull request #19583 from theroyalpekka:patch-1

* Update polynom_solver.cpp

This pull request responds to issue #19526. It fixes the cube-root calculation of 2*R: the problem was calling `pow` with negative values of R. The root is now computed for the absolute value of R only, and x0 is then adjusted according to the sign of R, which resolves the issue. Kindly consider it, thanks!

* add cv::cubeRoot(double)

Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
---
 modules/calib3d/src/polynom_solver.cpp     | 14 +++++++++++---
 modules/core/include/opencv2/core/base.hpp | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/modules/calib3d/src/polynom_solver.cpp b/modules/calib3d/src/polynom_solver.cpp
index beb91cafc0..5025199dd3 100644
--- a/modules/calib3d/src/polynom_solver.cpp
+++ b/modules/calib3d/src/polynom_solver.cpp
@@ -65,7 +65,8 @@ int solve_deg3(double a, double b, double c, double d,
       return 3;
     }
     else {
-      x0 = pow(2 * R, 1 / 3.0) - b_a_3;
+      double cube_root = cv::cubeRoot(2 * R);
+      x0 = cube_root - b_a_3;
       return 1;
     }
   }
@@ -82,8 +83,15 @@ int solve_deg3(double a, double b, double c, double d,
   }
 
   // D > 0, only one real root
-  double AD = pow(fabs(R) + sqrt(D), 1.0 / 3.0) * (R > 0 ? 1 : (R < 0 ? -1 : 0));
-  double BD = (AD == 0) ? 0 : -Q / AD;
+  double AD = 0.;
+  double BD = 0.;
+  double R_abs = fabs(R);
+  if (R_abs > DBL_EPSILON)
+  {
+    AD = cv::cubeRoot(R_abs + sqrt(D));
+    AD = (R >= 0) ? AD : -AD;
+    BD = -Q / AD;
+  }
 
   // Calculate the only real root
   x0 = AD + BD - b_a_3;
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 546140e9f1..12504974d9 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -587,6 +587,21 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
  */
 CV_EXPORTS_W float cubeRoot(float val);
 
+/** @overload
+
+cubeRoot with argument of `double` type calls `std::cbrt(double)` (C++11) or falls back on `pow()` for C++98 compilation mode. The result keeps the sign of `val`.
+*/
+static inline
+double cubeRoot(double val)
+{
+#ifdef CV_CXX11
+    return std::cbrt(val);
+#else
+    double v = pow(fabs(val), 1/3.);  // pow doesn't support negative inputs with fractional exponents; fabs (not abs) so val is never truncated by abs(int)
+    return val >= 0 ? v : -v;
+#endif
+}
+
 /** @brief Calculates the angle of a 2D vector in degrees.
 
  The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured

From 04a9ff88d80fd2757b0545e7420884a4394394af Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov <vitaly.tuzov@intel.com>
Date: Sat, 6 Mar 2021 20:22:21 +0300
Subject: [PATCH 10/10] Merge pull request #19622 from terfendail:ref_doc

* Updated cpp reference implementations for a few intrinsics to address wide universal intrinsics as well

* Updated cpp reference implementations for a few more universal intrinsics
---
 .../include/opencv2/core/hal/intrin_cpp.hpp   | 478 ++++++++----------
 1 file changed, 216 insertions(+), 262 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 859bfd72dc..5878dced7f 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -559,27 +559,6 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
     return c; \
 }
 
-//! @brief Helper macro
-//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
-inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 4; i++ ) \
-        c.s[i] = cfunc(a.s[i]); \
-    return c; \
-} \
-inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
-{ \
-    v_reg<int, 4> c; \
-    for( int i = 0; i < 2; i++ ) \
-    { \
-        c.s[i] = cfunc(a.s[i]); \
-        c.s[i + 2] = 0; \
-    } \
-    return c; \
-}
-
 /** @brief Square root of elements
 
 Only for floating point types.*/
@@ -598,26 +577,6 @@ Only for floating point types.*/
 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
                           typename V_TypeTraits<_Tp>::abs_type)
 
-/** @brief Round elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound)
-
-/** @brief Floor elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor)
-
-/** @brief Ceil elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil)
-
-/** @brief Truncate elements
-
-Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int)
-
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
@@ -855,9 +814,9 @@ inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp,
 /** @overload
 
 For 32-bit floating point values */
-inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
 {
-    v_float32x4 c;
+    v_reg<float, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
@@ -866,9 +825,9 @@ inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
 /** @overload
 
 For 64-bit floating point values */
-inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
 {
-    v_float64x2 c;
+    v_reg<double, n> c;
     for( int i = 0; i < c.nlanes; i++ )
         c.s[i] = _absdiff(a.s[i], b.s[i]);
     return c;
@@ -1238,14 +1197,17 @@ template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_redu
  result[3] = d[0] + d[1] + d[2] + d[3]
  @endcode
 */
-inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
-                                 const v_float32x4& c, const v_float32x4& d)
+template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
+    const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    v_float32x4 r;
-    r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
-    r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
-    r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
-    r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
+    v_reg<float, n> r;
+    for(int i = 0; i < (n/4); i++)
+    {
+        r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
+        r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
+        r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
+        r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
+    }
     return r;
 }
 
@@ -1965,9 +1927,11 @@ inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
     return v_reg<_Tp, n>::all(a.s[i]);
 }
 
-/** @brief Round
+/** @brief Round elements
 
-Rounds each value. Input type is float vector ==> output type is int vector.*/
+Rounds each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -1988,9 +1952,11 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const
     return c;
 }
 
-/** @brief Floor
+/** @brief Floor elements
 
-Floor each value. Input type is float vector ==> output type is int vector.*/
+Floor each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -1999,9 +1965,11 @@ template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
     return c;
 }
 
-/** @brief Ceil
+/** @brief Ceil elements
 
-Ceil each value. Input type is float vector ==> output type is int vector.*/
+Ceil each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -2010,9 +1978,11 @@ template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
     return c;
 }
 
-/** @brief Trunc
+/** @brief Truncate elements
 
-Truncate each value. Input type is float vector ==> output type is int vector.*/
+Truncate each value. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
 {
     v_reg<int, n> c;
@@ -2036,7 +2006,7 @@ template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
         c.s[i] = cvFloor(a.s[i]);
@@ -2048,7 +2018,7 @@ template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
         c.s[i] = cvCeil(a.s[i]);
@@ -2060,10 +2030,10 @@ template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
 /** @overload */
 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
 {
-    v_reg<int, n> c;
+    v_reg<int, n*2> c;
     for( int i = 0; i < n; i++ )
     {
-        c.s[i] = cvCeil(a.s[i]);
+        c.s[i] = (int)(a.s[i]);
         c.s[i+n] = 0;
     }
     return c;
@@ -2105,11 +2075,10 @@ template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, co
 /** @brief Convert to double
 
 Supported input type is cv::v_int32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
+template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
@@ -2117,23 +2086,21 @@ CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
 /** @brief Convert to double high part of vector
 
 Supported input type is cv::v_int32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i + 2];
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = (double)a.s[i + (n/2)];
     return c;
 }
 
 /** @brief Convert to double
 
 Supported input type is cv::v_float32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
@@ -2141,33 +2108,19 @@ CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
 /** @brief Convert to double high part of vector
 
 Supported input type is cv::v_float32x4. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<float, 4>& a)
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i + 2];
+    v_reg<double, (n/2)> c;
+    for( int i = 0; i < (n/2); i++ )
+        c.s[i] = (double)a.s[i + (n/2)];
     return c;
 }
 
 /** @brief Convert to double
 
 Supported input type is cv::v_int64x2. */
-CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int64, 2>& a)
+template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
 {
-    enum { n = 2 };
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
-        c.s[i] = (double)a.s[i];
-    return c;
-}
-
-/** @brief Convert to double high part of vector
-
-Supported input type is cv::v_int64x2. */
-CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int64, 2>& a)
-{
-    enum { n = 2 };
     v_reg<double, n> c;
     for( int i = 0; i < n; i++ )
         c.s[i] = (double)a.s[i];
@@ -2221,36 +2174,15 @@ template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int,
     return c;
 }
 
-template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
+template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
 {
-    v_reg<double, n> c;
-    for( int i = 0; i < n; i++ )
+    v_reg<double, n/2> c;
+    for( int i = 0; i < n/2; i++ )
         c.s[i] = tab[idx.s[i]];
     return c;
 }
 
 
-inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
-{
-    return v_lut(tab, idxvec.s);
-}
-
-
 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
                                                v_reg<float, n>& x, v_reg<float, n>& y)
 {
@@ -2330,16 +2262,23 @@ b2  {A3 B3 C3 D3}
 b3  {A4 B4 C4 D4}
 @endcode
 */
-template<typename _Tp>
-inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
-                            const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
-                            v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
-                            v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
+template<typename _Tp, int n> // _Tp: lane type, n: lane count of every register involved
+inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, // NOTE(review): a0 is only read below, yet it is the one input not taken by const reference
+                            const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
+                            v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1, // b0..b3 are the outputs
+                            v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
 {
-    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
-    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
-    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
-    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
+    for (int i = 0; i < n / 4; i++) // one independent 4x4 transpose per group of 4 lanes; lanes past the last full group are not written
+    {
+        b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4]; // b0 <- element 0 of a0..a3 within the group
+        b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
+        b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4]; // b1 <- element 1 of a0..a3; NOTE(review): outputs are written while inputs are still being read, so passing the same register as input and output looks unsafe - confirm callers never alias
+        b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
+        b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4]; // b2 <- element 2 of a0..a3
+        b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
+        b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4]; // b3 <- element 3 of a0..a3
+        b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
+    }
 }
 
 //! @brief Helper macro
@@ -2384,92 +2323,92 @@ OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
-template<typename _Tp0, int n0> inline _Tpvec \
+#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) /* defines v_reinterpret_as_<suffix>() producing lanes of type _Tp */ \
+template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> /* result lane count = source byte size / sizeof(_Tp) */ \
     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
-{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
+{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); } /* same lane-count expression as the return type above */
 
 //! @name Reinterpret
 //! @{
 //! @brief Convert vector to different type without modifying underlying data.
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
-OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8) // each line defines one v_reinterpret_as_<suffix> template accepting any source lane type and lane count
+OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
+OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
+OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
+OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
+OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
+OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
+OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
-{ return a << n; }
+#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) /* defines v_shl<shift>() for registers with lane type _Tp */ \
+template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) /* shift comes first so it can be given explicitly; n is deducible from the argument */ \
+{ return a << shift; } /* forwards to the << operator applied to the v_reg */
 
 //! @name Left shift
 //! @{
 //! @brief Shift left
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_SHIFTL(ushort) // v_shl overloads for the six integer lane types listed here (no 8-bit types in this list)
+OPENCV_HAL_IMPL_C_SHIFTL(short)
+OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTL(int)
+OPENCV_HAL_IMPL_C_SHIFTL(uint64)
+OPENCV_HAL_IMPL_C_SHIFTL(int64)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
-{ return a >> n; }
+#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) /* defines v_shr<shift>() for registers with lane type _Tp */ \
+template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) /* shift comes first so it can be given explicitly; n is deducible from the argument */ \
+{ return a >> shift; } /* forwards to the >> operator applied to the v_reg; no rounding term is added here */
 
 //! @name Right shift
 //! @{
 //! @brief Shift right
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_SHIFTR(ushort) // v_shr overloads for the six integer lane types listed here (no 8-bit types in this list)
+OPENCV_HAL_IMPL_C_SHIFTR(short)
+OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTR(int)
+OPENCV_HAL_IMPL_C_SHIFTR(uint64)
+OPENCV_HAL_IMPL_C_SHIFTR(int64)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
-template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) /* defines v_rshr<shift>(): right shift with a rounding term, lane type _Tp */ \
+template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) /* shift: bit count given explicitly; n: lane count, deducible from the argument */ \
 { \
-    _Tpvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) /* every lane is processed independently */ \
+        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); /* adds 1 << (shift-1), i.e. half of 2^shift, before shifting; NOTE(review): shift == 0 would make (shift - 1) negative - presumably callers always use shift >= 1 */ \
     return c; \
 }
 
 //! @name Rounding shift
 //! @{
 //! @brief Rounding shift right
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
-OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
+OPENCV_HAL_IMPL_C_RSHIFTR(ushort) // v_rshr overloads for the six integer lane types listed here (no 8-bit types in this list)
+OPENCV_HAL_IMPL_C_RSHIFTR(short)
+OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_RSHIFTR(int)
+OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
+OPENCV_HAL_IMPL_C_RSHIFTR(int64)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) /* _Tp: input lane type, _Tpn: output lane type, cast: conversion template applied per lane */ \
+template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) /* two n-lane inputs -> one 2*n-lane output */ \
 { \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) /* lane i of a goes to c[i], lane i of b goes to c[i+n] */ \
     { \
         c.s[i] = cast<_Tpn>(a.s[i]); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
+        c.s[i+n] = cast<_Tpn>(b.s[i]); /* b fills the upper half of the result */ \
     } \
     return c; \
 }
@@ -2485,26 +2424,26 @@ inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast) // v_pack: output lane type has the same signedness as the input
+OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast) // 64-bit inputs are converted with static_cast rather than saturate_cast
+OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast) // v_pack_u: signed input lanes, unsigned output lanes
+OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
+#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) /* _Tp: input lane type, _Tpn: output lane type, cast: conversion template applied per lane */ \
+template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) /* shift is given explicitly; n is deducible; output has 2*n lanes */ \
 { \
-    _Tpnvec c; \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) /* a fills c[0..n-1], b fills c[n..2n-1] */ \
     { \
-        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
-        c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); /* add 1 << (shift-1), shift right, then convert; NOTE(review): shift == 0 would give a negative shift count - presumably never used */ \
+        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); /* same expression applied to b */ \
     } \
     return c; \
 }
@@ -2520,22 +2459,22 @@ template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpve
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast) // v_rshr_pack: output lane type has the same signedness as the input
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast) // 64-bit inputs are converted with static_cast rather than saturate_cast
+OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast) // v_rshr_pack_u: signed input lanes, unsigned output lanes
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) /* _Tp: input lane type, _Tpn: stored element type, cast: conversion template applied per lane */ \
+template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) /* converts one register and writes it to memory */ \
 { \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+    for( int i = 0; i < n; i++ ) /* writes exactly n elements: ptr[0..n-1] */ \
         ptr[i] = cast<_Tpn>(a.s[i]); \
 }
 
@@ -2550,23 +2489,23 @@ inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast) // v_pack_store: stored element type has the same signedness as the input
+OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast) // 64-bit inputs are converted with static_cast rather than saturate_cast
+OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast) // v_pack_u_store: signed input lanes, unsigned stored elements
+OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
 //! @}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
-template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
+#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) /* _Tp: input lane type, _Tpn: stored element type, cast: conversion template applied per lane */ \
+template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) /* shift is given explicitly; n is deducible from the argument */ \
 { \
-    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
-        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
+    for( int i = 0; i < n; i++ ) /* writes exactly n elements: ptr[0..n-1] */ \
+        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); /* add 1 << (shift-1), shift right, then convert; NOTE(review): shift == 0 would give a negative shift count - presumably never used */ \
 }
 
 //! @name Pack and store with rounding shift
@@ -2580,14 +2519,14 @@ template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec
 //! - pack_u: for 16- and 32-bit signed integer input types
 //!
 //! @note All variants except 64-bit use saturation.
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
-OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast) // v_rshr_pack_store: stored element type has the same signedness as the input
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast) // 64-bit inputs are converted with static_cast rather than saturate_cast
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast) // v_rshr_pack_u_store: signed input lanes, unsigned stored elements
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
 //! @}
 
 //! @cond IGNORED
@@ -2622,9 +2561,9 @@ b  {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
 }
 @endcode */
 
-inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b) // two n-lane 16-bit inputs -> one 2*n-lane uchar result
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 2*n> mask; // filled by the _pack_b helper below (helper body is not part of this hunk)
     _pack_b(mask.s, a, b);
     return mask;
 }
@@ -2645,12 +2584,12 @@ d  {0 0xFFFF.. 0 0xFFFF..}
 }
 @endcode */
 
-inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
-                           const v_uint32x4& c, const v_uint32x4& d)
+template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b, // four n-lane 32-bit inputs -> one 4*n-lane uchar result
+                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 4*n> mask;
     _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 8, c, d);
+    _pack_b(mask.s + 2*n, c, d); // second pair starts 2*n bytes in (8 when n == 4, as before); presumably each _pack_b call fills 2*n bytes - helper not visible here
     return mask;
 }
 
@@ -2674,15 +2613,16 @@ h  {0 0xFFFF..}
    0xFF 0 0xFF 0 0 0xFF 0 0xFF
 }
 @endcode */
-inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
-                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
-                           const v_uint64x2& g, const v_uint64x2& h)
+template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b, // eight n-lane 64-bit inputs -> one 8*n-lane uchar result
+                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
+                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
+                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
 {
-    v_uint8x16 mask;
+    v_reg<uchar, 8*n> mask;
     _pack_b(mask.s, a, b);
-    _pack_b(mask.s + 4, c, d);
-    _pack_b(mask.s + 8, e, f);
-    _pack_b(mask.s + 12, g, h);
+    _pack_b(mask.s + 2*n, c, d); // each pair is written 2*n bytes after the previous one (4, 8, 12 when n == 2, as before)
+    _pack_b(mask.s + 4*n, e, f); // presumably each _pack_b call fills 2*n bytes - helper not visible here
+    _pack_b(mask.s + 6*n, g, h);
     return mask;
 }
 //! @}
@@ -2697,54 +2637,68 @@ Scheme:
 {D0 D1 D2 D3} x |V3|
 ====================
 {R0 R1 R2 R3}, where:
-R0 = A0V0 + A1V1 + A2V2 + A3V3,
-R1 = B0V0 + B1V1 + B2V2 + B3V3
+R0 = A0V0 + B0V1 + C0V2 + D0V3,
+R1 = A1V0 + B1V1 + C1V2 + D1V3
 ...
 @endcode
 */
-inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
-                            const v_float32x4& m1, const v_float32x4& m2,
-                            const v_float32x4& m3)
+template<int n> // n: lane count; the work is done per group of 4 lanes
+inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
+                                const v_reg<float, n>& a, const v_reg<float, n>& b, // a..d are each scaled by one element of v and summed
+                                const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
+    v_reg<float, n> res; // lanes beyond the last full group of 4 are never assigned below
+    for (int i = 0; i < n / 4; i++) // each group of 4 lanes is computed from the same group of v, a, b, c and d only
+    {
+        res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4]; // element 0 of the group
+        res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4]; // element 1
+        res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4]; // element 2
+        res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4]; // element 3
+    }
+    return res;
 }
 
 /** @brief Matrix multiplication and add
 
 Scheme:
 @code
-{A0 A1 A2   }   |V0|   |D0|
-{B0 B1 B2   }   |V1|   |D1|
-{C0 C1 C2   } x |V2| + |D2|
-====================
+{A0 A1 A2 A3}   |V0|   |D0|
+{B0 B1 B2 B3}   |V1|   |D1|
+{C0 C1 C2 C3} x |V2| + |D2|
+====================   |D3|
 {R0 R1 R2 R3}, where:
-R0 = A0V0 + A1V1 + A2V2 + D0,
-R1 = B0V0 + B1V1 + B2V2 + D1
+R0 = A0V0 + B0V1 + C0V2 + D0,
+R1 = A1V0 + B1V1 + C1V2 + D1
 ...
 @endcode
 */
-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
-                               const v_float32x4& m1, const v_float32x4& m2,
-                               const v_float32x4& m3)
+template<int n> // n: lane count; the work is done per group of 4 lanes
+inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
+                                   const v_reg<float, n>& a, const v_reg<float, n>& b, // a, b, c are scaled by elements 0..2 of v; d is added unscaled
+                                   const v_reg<float, n>& c, const v_reg<float, n>& d)
 {
-    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
-                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
-                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
-                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
+    v_reg<float, n> res; // lanes beyond the last full group of 4 are never assigned below
+    for (int i = 0; i < n / 4; i++) // each group of 4 lanes is computed independently; element 3 of each v group is not read
+    {
+        res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4]; // element 0 of the group
+        res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4]; // element 1
+        res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4]; // element 2
+        res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4]; // element 3
+    }
+    return res;
 }
 
 
-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b) // n int lanes in, n/2 double lanes out; presumably lane i = a[i]*b[i] + a[i+n/2]*b[i+n/2], assuming v_cvt_f64/_high convert the low/high halves and v_fma(x, y, z) is x*y + z
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
-inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b, // accumulating overload: c enters the innermost v_fma as its third argument
+                                                           const v_reg<double, n/2>& c) // c has the same n/2 double lanes as the result
 { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
 
-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b) // in this implementation the "fast" variant simply forwards to v_dotprod_expand
 { return v_dotprod_expand(a, b); }
-inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b, // accumulating "fast" variant; forwards all three arguments unchanged
+                                                                const v_reg<double, n/2>& c) // c has the same n/2 double lanes as the result
 { return v_dotprod_expand(a, b, c); }
 
 ////// FP16 support ///////
@@ -2760,8 +2714,8 @@ v_load_expand(const float16_t* ptr)
     return v;
 }
 
-inline void
-v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
+template<int n> inline void
+v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
 {
     for( int i = 0; i < v.nlanes; i++ )
     {