Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2020-03-06 20:00:55 +00:00
parent 6271192a32 6d113bd03f
commit 619180dffd
19 changed files with 518 additions and 391 deletions
@@ -1089,6 +1089,7 @@ bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc
        getMinMaxRes<double>
    };

+    CV_Assert(ddepth <= CV_64F);
    getMinMaxResFunc func = functab[ddepth];

    int locTemp[2];
@@ -710,67 +710,78 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total;
-    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
-            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
-    int isum = 0;
-    int *ibuf = &result.i;
-    AutoBuffer<float> fltbuf_;
-    float* fltbuf = 0;
-    size_t esz = 0;
+    CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");

-    if( blockSum )
+    if ((normType == NORM_L1 && depth <= CV_16S) ||
+        ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
    {
-        esz = src.elemSize();
+        // special case to handle "integer" overflow in accumulator
+        const size_t esz = src.elemSize();
+        const int total = (int)it.size;
+        const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+        const int blockSize = std::min(total, intSumBlockSize);
+        int isum = 0;
+        int count = 0;

-        if( depth == CV_16F )
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
-            blockSize = std::min(blockSize, 1024);
-            fltbuf_.allocate(blockSize);
-            fltbuf = fltbuf_.data();
-        }
-        else
-        {
-            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-            blockSize = std::min(blockSize, intSumBlockSize);
-            ibuf = &isum;
+            for (int j = 0; j < total; j += blockSize)
+            {
+                int bsz = std::min(total - j, blockSize);
+                func(ptrs[0], ptrs[1], (uchar*)&isum, bsz, cn);
+                count += bsz;
+                if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
+                {
+                    result.d += isum;
+                    isum = 0;
+                    count = 0;
+                }
+                ptrs[0] += bsz*esz;
+                if (ptrs[1])
+                    ptrs[1] += bsz;
+            }
        }
    }
-
-    for( size_t i = 0; i < it.nplanes; i++, ++it )
+    else if (depth == CV_16F)
    {
-        for( j = 0; j < total; j += blockSize )
+        const size_t esz = src.elemSize();
+        const int total = (int)it.size;
+        const int blockSize = std::min(total, divUp(1024, cn));
+        AutoBuffer<float, 1024> fltbuf(blockSize);
+        float* data0 = fltbuf.data();
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
-            int bsz = std::min(total - j, blockSize);
-            const uchar* data = ptrs[0];
-            if( depth == CV_16F )
+            for (int j = 0; j < total; j += blockSize)
            {
-                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
-                data = (const uchar*)fltbuf;
+                int bsz = std::min(total - j, blockSize);
+                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
+                func((uchar*)data0, ptrs[1], (uchar*)&result.d, bsz, cn);
+                ptrs[0] += bsz*esz;
+                if (ptrs[1])
+                    ptrs[1] += bsz;
            }
-            func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
-            if( blockSum && depth != CV_16F )
-            {
-                result.d += isum;
-                isum = 0;
-            }
-            ptrs[0] += bsz*esz;
-            if( ptrs[1] )
-                ptrs[1] += bsz;
+        }
+    }
+    else
+    {
+        // generic implementation
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
+        {
+            func(ptrs[0], ptrs[1], (uchar*)&result, (int)it.size, cn);
        }
    }

    if( normType == NORM_INF )
    {
-        if( depth == CV_64F )
-            ;
-        else if( depth == CV_32F )
-            result.d = result.f;
+        if(depth == CV_64F || depth == CV_16F)
+            return result.d;
+        else if (depth == CV_32F)
+            return result.f;
        else
-            result.d = result.i;
+            return result.i;
    }
    else if( normType == NORM_L2 )
-        result.d = std::sqrt(result.d);
+        return std::sqrt(result.d);

    return result.d;
 }
@@ -1186,70 +1197,82 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
    result;
    result.d = 0;
    NAryMatIterator it(arrays, ptrs);
-    int j, total = (int)it.size, blockSize = total;
-    bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
-            ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
-    unsigned isum = 0;
-    unsigned *ibuf = &result.u;
-    AutoBuffer<float> fltbuf_;
-    float* fltbuf = 0;
-    size_t esz = 0;
+    CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");

-    if( blockSum )
+    if ((normType == NORM_L1 && depth <= CV_16S) ||
+        ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
    {
-        esz = src1.elemSize();
+        // special case to handle "integer" overflow in accumulator
+        const size_t esz = src1.elemSize();
+        const int total = (int)it.size;
+        const int intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
+        const int blockSize = std::min(total, intSumBlockSize);
+        int isum = 0;
+        int count = 0;

-        if( depth == CV_16F )
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
-            blockSize = std::min(blockSize, 1024);
-            fltbuf_.allocate(blockSize*2);
-            fltbuf = fltbuf_.data();
-        }
-        else
-        {
-            int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-            blockSize = std::min(blockSize, intSumBlockSize);
-            ibuf = &isum;
+            for (int j = 0; j < total; j += blockSize)
+            {
+                int bsz = std::min(total - j, blockSize);
+                func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&isum, bsz, cn);
+                count += bsz;
+                if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
+                {
+                    result.d += isum;
+                    isum = 0;
+                    count = 0;
+                }
+                ptrs[0] += bsz*esz;
+                ptrs[1] += bsz*esz;
+                if (ptrs[2])
+                    ptrs[2] += bsz;
+            }
        }
    }
-
-    for( size_t i = 0; i < it.nplanes; i++, ++it )
+    else if (depth == CV_16F)
    {
-        for( j = 0; j < total; j += blockSize )
+        const size_t esz = src1.elemSize();
+        const int total = (int)it.size;
+        const int blockSize = std::min(total, divUp(512, cn));
+        AutoBuffer<float, 1024> fltbuf(blockSize * 2);
+        float* data0 = fltbuf.data();
+        float* data1 = fltbuf.data() + blockSize * cn;
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
-            int bsz = std::min(total - j, blockSize);
-            const uchar *data0 = ptrs[0], *data1 = ptrs[1];
-            if( depth == CV_16F )
+            for (int j = 0; j < total; j += blockSize)
            {
-                hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
-                hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz, bsz);
-                data0 = (const uchar*)fltbuf;
-                data1 = (const uchar*)(fltbuf + bsz);
+                int bsz = std::min(total - j, blockSize);
+                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
+                hal::cvt16f32f((const float16_t*)ptrs[1], data1, bsz * cn);
+                func((uchar*)data0, (uchar*)data1, ptrs[2], (uchar*)&result.d, bsz, cn);
+                ptrs[0] += bsz*esz;
+                ptrs[1] += bsz*esz;
+                if (ptrs[2])
+                    ptrs[2] += bsz;
            }
-            func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
-            if( blockSum && depth != CV_16F )
-            {
-                result.d += isum;
-                isum = 0;
-            }
-            ptrs[0] += bsz*esz;
-            ptrs[1] += bsz*esz;
-            if( ptrs[2] )
-                ptrs[2] += bsz;
+        }
+    }
+    else
+    {
+        // generic implementation
+        for (size_t i = 0; i < it.nplanes; i++, ++it)
+        {
+            func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&result, (int)it.size, cn);
        }
    }

    if( normType == NORM_INF )
    {
-        if( depth == CV_64F )
-            ;
-        else if( depth == CV_32F )
-            result.d = result.f;
+        if (depth == CV_64F || depth == CV_16F)
+            return result.d;
+        else if (depth == CV_32F)
+            return result.f;
        else
-            result.d = result.u;
+            return result.u;
    }
    else if( normType == NORM_L2 )
-        result.d = std::sqrt(result.d);
+        return std::sqrt(result.d);

    return result.d;
 }
@@ -6451,16 +6451,19 @@ struct Image2D::Impl
                                                CL_MEM_OBJECT_IMAGE2D, numFormats,
                                                NULL, &numFormats);
        CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
-        AutoBuffer<cl_image_format> formats(numFormats);
-        err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
-                                         CL_MEM_OBJECT_IMAGE2D, numFormats,
-                                         formats.data(), NULL);
-        CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
-        for (cl_uint i = 0; i < numFormats; ++i)
+        if (numFormats > 0)
        {
-            if (!memcmp(&formats[i], &format, sizeof(format)))
+            AutoBuffer<cl_image_format> formats(numFormats);
+            err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
+                                             CL_MEM_OBJECT_IMAGE2D, numFormats,
+                                             formats.data(), NULL);
+            CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
+            for (cl_uint i = 0; i < numFormats; ++i)
            {
-                return true;
+                if (!memcmp(&formats[i], &format, sizeof(format)))
+                {
+                    return true;
+                }
            }
        }
        return false;