From 5ee7abbe3c279a1b4f260b641fd2aeac322819cb Mon Sep 17 00:00:00 2001
From: Alexander Alekhin
Date: Wed, 11 Dec 2019 20:16:58 +0300
Subject: [PATCH] Merge pull request #16088 from alalek:dnn_eltwise_layer_different_src_channels

dnn(eltwise): fix handling of different number of channels

* dnn(test): reproducer for Eltwise layer issue from PR16063

* dnn(eltwise): rework support for inputs with different channels

* dnn(eltwise): get rid of finalize(), variableChannels

* dnn(eltwise): update input sorting by number of channels
  - do not swap inputs if number of channels are same after truncation

* dnn(test): skip "shortcut" with batch size 2 on MYRIAD targets
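Reviewer note: a minimal sketch of the user-visible behavior this patch enables, using the same public API as the new tests below (layer name, input names and constant values are illustrative, not taken from the patch):

    #include <opencv2/dnn.hpp>
    using namespace cv;
    using namespace cv::dnn;

    // Sum a 1x4x2x2 tensor with a 1x2x2x2 tensor. With "input_0_truncate"
    // the output keeps the first input's 4 channels: channels 0-1 receive
    // in0 + in1, channels 2-3 pass in0 through unchanged.
    Net net;
    LayerParams lp;
    lp.type = "Eltwise";
    lp.name = "testLayer";
    lp.set("operation", "sum");
    lp.set("output_channels_mode", "input_0_truncate");
    int id = net.addLayer(lp.name, lp.type, lp);
    net.connect(0, 0, id, 0);
    net.connect(0, 1, id, 1);
    std::vector<String> inpNames;
    inpNames.push_back("input_0");
    inpNames.push_back("input_1");
    net.setInputsNames(inpNames);

    const int sz0[] = {1, 4, 2, 2}, sz1[] = {1, 2, 2, 2};
    net.setInput(Mat(4, sz0, CV_32F, Scalar(1)), "input_0");
    net.setInput(Mat(4, sz1, CV_32F, Scalar(10)), "input_1");
    Mat out = net.forward();  // 1x4x2x2: channels 0-1 are 11, channels 2-3 are 1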
Values are "same" (default, all input must have the same layout), "input_0", "input_0_truncate", "max_input_channels" + */ class CV_EXPORTS EltwiseLayer : public Layer { public: diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index 3a90081e17..e9938ecbb9 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -425,6 +425,7 @@ namespace cv { } shortcut_param.set("op", "sum"); + shortcut_param.set("output_channels_mode", "input_0_truncate"); darknet::LayerParameter lp; std::string layer_name = cv::format("shortcut_%d", layer_id); diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 978b1b6f4a..c7c28e8fb8 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -66,9 +66,28 @@ public: DIV = 3 } op; std::vector coeffs; - bool variableChannels; + + enum OutputChannelsMode + { + ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels + ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< number of channels of other inputs should not be greater than number of channels of first input + ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< there is restriction on number of channels of other inputs + //!< extra channels of other inputs is ignored + ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to maximal number of input channels + //!< @note supported operation: `SUM` + } channelsModeInput; + + + mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal) + mutable /*size_t*/int outputChannels; EltwiseLayerImpl(const LayerParams& params) + : outputChannels(0) { setParamsFrom(params); op = SUM; @@ -97,6 +116,35 @@ public: coeffs[i] = paramCoeff.get(i); } } + + channelsModeInput = ELTWISE_CHANNNELS_SAME; + if (params.has("output_channels_mode")) + { + String v = toLowerCase(params.get("output_channels_mode")); + if (v == "same") + { + channelsModeInput = ELTWISE_CHANNNELS_SAME; + } + else if (v == "input_0") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0; + } + else if (v == "input_0_truncate") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE; + } + else if (v == "max_input_channels") + { + channelsModeInput = ELTWISE_CHANNNELS_USE_MAX; + if (op != SUM) + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only"); + } + else + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\""); + } + channelsMode = channelsModeInput; + + // TODO Must have checks for other unknown options } virtual bool supportBackend(int backendId) CV_OVERRIDE @@ -104,7 +152,7 @@ public: return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())) - || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && !variableChannels)); + || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME)); } bool 
diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp
index 3a90081e17..e9938ecbb9 100644
--- a/modules/dnn/src/darknet/darknet_io.cpp
+++ b/modules/dnn/src/darknet/darknet_io.cpp
@@ -425,6 +425,7 @@ namespace cv {
                 }

                 shortcut_param.set("op", "sum");
+                shortcut_param.set("output_channels_mode", "input_0_truncate");

                 darknet::LayerParameter lp;
                 std::string layer_name = cv::format("shortcut_%d", layer_id);
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 978b1b6f4a..c7c28e8fb8 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -66,9 +66,28 @@ public:
         DIV = 3
     } op;
     std::vector<float> coeffs;
-    bool variableChannels;
+
+    enum OutputChannelsMode
+    {
+        ELTWISE_CHANNNELS_SAME = 0,         //!< number of channels from inputs must be the same and equal to output's number of channels
+        ELTWISE_CHANNNELS_INPUT_0,          //!< number of channels from inputs may be different,
+                                            //!< output's number of channels is equal to number of channels of first input
+                                            //!< number of channels of other inputs should not be greater than number of channels of first input
+        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different,
+                                            //!< output's number of channels is equal to number of channels of first input
+                                            //!< there is no restriction on number of channels of other inputs:
+                                            //!< extra channels of other inputs are ignored
+        ELTWISE_CHANNNELS_USE_MAX,          //!< number of channels from inputs may be different,
+                                            //!< output's number of channels is equal to maximal number of input channels
+                                            //!< @note supported operation: `SUM`
+    } channelsModeInput;
+
+    mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switches to ELTWISE_CHANNNELS_SAME if all input channel counts are equal)
+    mutable /*size_t*/int outputChannels;

     EltwiseLayerImpl(const LayerParams& params)
+        : outputChannels(0)
     {
         setParamsFrom(params);
         op = SUM;
@@ -97,6 +116,35 @@ public:
                 coeffs[i] = paramCoeff.get<float>(i);
             }
         }
+
+        channelsModeInput = ELTWISE_CHANNNELS_SAME;
+        if (params.has("output_channels_mode"))
+        {
+            String v = toLowerCase(params.get<String>("output_channels_mode"));
+            if (v == "same")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_SAME;
+            }
+            else if (v == "input_0")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
+            }
+            else if (v == "input_0_truncate")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
+            }
+            else if (v == "max_input_channels")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
+                if (op != SUM)
+                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
+            }
+            else
+                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
+        }
+        channelsMode = channelsModeInput;
+
+        // TODO Must have checks for other unknown options
     }

     virtual bool supportBackend(int backendId) CV_OVERRIDE
@@ -104,7 +152,7 @@ public:
         return backendId == DNN_BACKEND_OPENCV ||
                backendId == DNN_BACKEND_HALIDE ||
                ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
-                  || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && !variableChannels));
+                  || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME));
     }

     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -119,212 +167,320 @@ public:
         int dims = inputs[0].size();
         // Number of channels in output shape is determined by the first input tensor.
+        bool variableChannels = false;
         int numChannels = inputs[0][1];
-        for (int i = 1; i < inputs.size(); i++)
+        for (size_t i = 1; i < inputs.size(); i++)
         {
-            CV_Assert(inputs[0][0] == inputs[i][0]);
+            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal

-            // It's allowed for channels axis to be different.
-            for (int j = 2; j < dims; j++)
+            int input_channels = inputs[i][1];
+            if (numChannels != input_channels)
+                variableChannels = true;
+
+            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
+            {
+                CV_Assert(numChannels == input_channels);
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
+            {
+                CV_Assert(numChannels >= input_channels);
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
+            {
+                // nothing to check
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
+            {
+                numChannels = std::max(numChannels, input_channels);
+            }
+            else
+            {
+                CV_Assert(0 && "Internal error");
+            }
+
+            for (size_t j = 2; j < dims; j++)
                 CV_Assert(inputs[0][j] == inputs[i][j]);
         }

+        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
+        outputChannels = numChannels;
+
         outputs.assign(1, inputs[0]);
         outputs[0][1] = numChannels;
         return false;
     }

-    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
-    {
-        std::vector<Mat> inputs;
-        inputs_arr.getMatVector(inputs);
-        variableChannels = false;
-        for (int i = 1; i < inputs.size(); ++i)
-        {
-            if (inputs[i].size[1] != inputs[0].size[1])
-            {
-                variableChannels = true;
-                break;
-            }
-        }
-    }
-
     class EltwiseInvoker : public ParallelLoopBody
     {
-    public:
+        EltwiseLayerImpl& self;
         std::vector<const Mat*> srcs;
+        std::vector<int> srcNumChannels;
         int nsrcs;
         Mat* dst;
         std::vector<float> coeffs;
-        EltwiseOp op;
         int nstripes;
         const ActivationLayer* activ;
         int channels;
         size_t planeSize;

-        EltwiseInvoker() : nsrcs(0), dst(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {}
+        EltwiseInvoker(EltwiseLayerImpl& self_)
+            : self(self_)
+            , nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
+            , planeSize(0)
+        {}

-        static void run(const Mat* srcs, int nsrcs, Mat& dst,
-                        const std::vector<float>& coeffs, EltwiseOp op,
-                        const ActivationLayer* activ, int nstripes)
+    public:
+        static void run(EltwiseLayerImpl& self,
+                        const Mat* srcs, int nsrcs, Mat& dst,
+                        int nstripes)
         {
+            const EltwiseOp op = self.op;
             CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
             CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
             CV_Assert(dst.isContinuous());
-            CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);
+            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
+            CV_CheckGE(nsrcs, 2, "");

-            EltwiseInvoker p;
+            CV_Assert(self.outputChannels == dst.size[1]);
+
+            EltwiseInvoker p(self);
             p.srcs.resize(nsrcs);
-            p.coeffs = coeffs;
+            p.srcNumChannels.resize(nsrcs);
+            p.coeffs = self.coeffs;  // can be sorted
+
+            bool sortInputs = false;
             for( int i = 0; i < nsrcs; i++ )
             {
-                p.srcs[i] = srcs + i;
-                CV_Assert(srcs[i].type() == dst.type() &&
-                          srcs[i].isContinuous());
-                // Sort srcs and coefficients in the order by number of channels
-                for( int j = i; j >= 1 && p.srcs[j - 1]->size[1] < p.srcs[j]->size[1]; j-- )
-                {
-                    std::swap(p.srcs[j - 1], p.srcs[j]);
-                    if (!p.coeffs.empty())
-                        std::swap(p.coeffs[j - 1], p.coeffs[j]);
-                }
+                p.srcs[i] = &srcs[i];
+                CV_CheckEQ(srcs[i].dims, dst.dims, "");
+                CV_Assert(srcs[i].isContinuous());
+                CV_Assert(srcs[i].type() == dst.type());
+                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;
+
+                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
+                {
+                    CV_Assert(srcs[i].size == dst.size);
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
+                {
+                    if (i == 0)
+                        CV_Assert(srcs[0].size == dst.size);
+                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
+                    sortInputs = true;
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
+                {
+                    if (i == 0)
+                        CV_Assert(srcs[0].size == dst.size);
+                    sortInputs = true;
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
+                {
+                    CV_Assert(op == SUM);
+                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
+                    sortInputs = true;
+                }
+                else
+                {
+                    CV_Assert(0 && "Internal error");
+                }
+
+                if (sortInputs)
+                {
+                    // Sort srcs and coefficients in the desc order by number of channels
+                    for (int j = i; j >= 1; j--)
+                    {
+                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
+                        {
+                            std::swap(p.srcs[j - 1], p.srcs[j]);
+                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
+                            if (!p.coeffs.empty())
+                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
+                        }
+                        else
+                            break;
+                    }
+                }
             }
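+            // After the sort above, srcs[] is ordered by min(outputChannels, channels)
+            // in descending order: srcs[0] is guaranteed to provide data for every
+            // output channel (required by the max_input_channels mode), and the
+            // coefficients stay aligned with their (possibly reordered) inputs.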

             p.nsrcs = nsrcs;
             p.dst = &dst;
-            p.op = op;
             p.nstripes = nstripes;
             p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

             p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
-            CV_Assert(dst.total() == dst.size[0] * p.channels * p.planeSize);
+            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");

             bool simpleCoeffs = true;
-            if( op == SUM && !coeffs.empty() )
+            if (op == SUM && !p.coeffs.empty())
             {
-                CV_Assert( coeffs.size() == (size_t)nsrcs );
+                CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");

-                for( size_t i = 0; i < coeffs.size(); i++ )
-                    if( coeffs[i] != 1 )
+                for (size_t i = 0; i < p.coeffs.size(); i++)
+                {
+                    if (p.coeffs[i] != 1)
                     {
                         simpleCoeffs = false;
                         break;
                     }
+                }
             }
             if (simpleCoeffs)
                 p.coeffs.clear();
-            p.activ = activ;
+            p.activ = self.activ.get();

             parallel_for_(Range(0, nstripes), p, nstripes);
         }

         void operator()(const Range& r) const CV_OVERRIDE
         {
+            const EltwiseOp op = self.op;
             size_t total = dst->size[0]*planeSize;
             size_t stripeSize = (total + nstripes - 1)/nstripes;
             size_t stripeStart = r.start*stripeSize;
             size_t stripeEnd = std::min(r.end*stripeSize, total);
-            int c, j, k, n;
             const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
             float* dstptr0 = dst->ptr<float>();
-            int blockSize0 = 1 << 12, blockSize;
+            int blockSize0 = 1 << 12;

-            for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize )
+            for (size_t ofs = stripeStart; ofs < stripeEnd; )
             {
                 int sampleIdx = (int)(ofs / planeSize);
                 int delta = (int)ofs - sampleIdx * planeSize;
-                blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
+                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                 if( blockSize <= 0 )
                     break;
+                ofs += blockSize;

-                for( c = 0; c < channels; c++ )
+                for (int c = 0; c < channels; c++)
                 {
-                    size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize;
-                    const float* srcptr0 = srcs[0]->ptr<float>() + globalDelta;
-                    float* dstptr = dstptr0 + globalDelta;
+                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
+                    float* dstptr = dstptr0 + dstIdx;

-                    // This code assumes that srcs are sorted in descending order by channels.
-                    for (n = 1; n < nsrcs && c < srcs[n]->size[1]; ++n) {}
-
-                    if (n == 1)
-                    {
-                        if( !coeffsptr )
-                        {
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = srcptr0[j];
-                            }
-                        }
-                        else
-                        {
-                            float c0 = coeffsptr[0];
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = c0*srcptr0[j];
-                            }
-                        }
-                    }
-                    else if( op == PROD )
-                    {
-                        for( k = 1; k < n; k++ )
-                        {
-                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = srcptr0[j]*srcptr1[j];
-                            }
-                            srcptr0 = (const float*)dstptr;
-                        }
-                    }
-                    else if( op == DIV )
-                    {
-                        for( k = 1; k < n; k++ )
-                        {
-                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = srcptr0[j]/srcptr1[j];
-                            }
-                            srcptr0 = (const float*)dstptr;
-                        }
-                    }
-                    else if( op == MAX )
-                    {
-                        for( k = 1; k < n; k++ )
-                        {
-                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
-                            }
-                            srcptr0 = (const float*)dstptr;
-                        }
-                    }
-                    else if( !coeffsptr )
-                    {
-                        for( k = 1; k < n; k++ )
-                        {
-                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = srcptr0[j] + srcptr1[j];
-                            }
-                            srcptr0 = (const float*)dstptr;
-                        }
-                    }
-                    else
-                    {
-                        float c0 = coeffsptr[0];
-                        for( k = 1; k < n; k++ )
-                        {
-                            const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
-                            float c1 = coeffsptr[k];
-                            for( j = 0; j < blockSize; j++ )
-                            {
-                                dstptr[j] = c0*srcptr0[j] + c1*srcptr1[j];
-                            }
-                            srcptr0 = (const float*)dstptr;
-                            c0 = 1;
-                        }
-                    }
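+                    // Strategy: the first two inputs initialize dst directly (no
+                    // zero-fill pass), then inputs 3+ are accumulated into dst in
+                    // place. An input that has no data for channel c is skipped,
+                    // which is what implements the truncate semantics.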
+                    // process first two inputs
+                    {
+                        const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;
+
+                        const int inputIdx = 1;
+                        int src1_channels = srcNumChannels[inputIdx];
+                        if (c >= src1_channels)
+                        {
+                            // no data from second input
+                            if (!coeffsptr || coeffsptr[0] == 1.0f)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = srcptr0[j];
+                                }
+                            }
+                            else
+                            {
+                                float c0 = coeffsptr[0];
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = c0*srcptr0[j];
+                                }
+                            }
+                        }
+                        else
+                        {
+                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
+                            const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;
+
+                            if (op == PROD)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = srcptr0[j] * srcptrI[j];
+                                }
+                            }
+                            else if (op == DIV)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = srcptr0[j] / srcptrI[j];
+                                }
+                            }
+                            else if (op == MAX)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
+                                }
+                            }
+                            else if (op == SUM)
+                            {
+                                if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
+                                {
+                                    for (int j = 0; j < blockSize; j++)
+                                    {
+                                        dstptr[j] = srcptr0[j] + srcptrI[j];
+                                    }
+                                }
+                                else
+                                {
+                                    float c0 = coeffsptr[0];
+                                    float c1 = coeffsptr[1];
+                                    for (int j = 0; j < blockSize; j++)
+                                    {
+                                        dstptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
+                                    }
+                                }
+                            }
+                            else
+                                CV_Error(Error::StsInternal, "");
+                        }
+                    }
+
+                    // aggregate other inputs (3+)
+                    for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
+                    {
+                        int srcI_channels = srcNumChannels[inputIdx];
+                        if (c >= srcI_channels)
+                            continue;  // no data from this input
+                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
+                        const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;
+
+                        if (op == PROD)
+                        {
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                dstptr[j] *= srcptrI[j];
+                            }
+                        }
+                        else if (op == DIV)
+                        {
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                dstptr[j] /= srcptrI[j];
+                            }
+                        }
+                        else if (op == MAX)
+                        {
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
+                            }
+                        }
+                        else if (op == SUM)
+                        {
+                            if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] += srcptrI[j];
+                                }
+                            }
+                            else
+                            {
+                                float cI = coeffsptr[inputIdx];
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] += cI * srcptrI[j];
+                                }
+                            }
+                        }
+                        else
+                            CV_Error(Error::StsInternal, "");
+                    }
                 }
             }
@@ -343,7 +499,7 @@ public:
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;

-        if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels)
+        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
             return false;

         inputs_.getUMatVector(inputs);
@@ -446,8 +602,9 @@ public:
         CV_Assert(outputs.size() == 1);

         const int nstripes = getNumThreads();
-        EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0],
-                            coeffs, op, activ.get(), nstripes);
+        EltwiseInvoker::run(*this,
+                            &inputs[0], (int)inputs.size(), outputs[0],
+                            nstripes);
     }

     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
@@ -558,6 +715,7 @@ public:
         CV_UNUSED(outputs); // suppress unused variable warning
         CV_Assert(inputs.size());

+        // FIXIT: handle inputs with different number of channels
         long flops = inputs.size() * total(inputs[0]);

         return flops;
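For reference, the SUM-with-truncation semantics the new invoker implements can be stated in a few lines. The sketch below is a hypothetical helper (not part of the patch) that condenses the reference computation used by the new tests further down: inputs[0] defines the output shape, and each input adds into the channel range it actually covers.

    #include <opencv2/core.hpp>
    #include <algorithm>
    #include <vector>

    // Reference semantics of Eltwise SUM with "input_0_truncate"
    // (assumes 4D NCHW inputs with equal N, H, W).
    static cv::Mat refEltwiseSumTruncate(const std::vector<cv::Mat>& inputs,
                                         const std::vector<float>& weights)
    {
        const int out_channels = inputs[0].size[1];
        cv::Mat ref(inputs[0].dims, inputs[0].size.p, CV_32F, cv::Scalar(0));
        for (size_t i = 0; i < inputs.size(); i++)
        {
            // Only the channels both tensors share contribute to the sum.
            int ch = std::min(out_channels, inputs[i].size[1]);
            cv::Range r[4] = { cv::Range::all(), cv::Range(0, ch),
                               cv::Range::all(), cv::Range::all() };
            cv::Mat ref_slice = ref(r);
            ref_slice += weights[i] * inputs[i](r);
        }
        return ref;
    }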
< blockSize; j++) + { + dstptr[j] += cI * srcptrI[j]; + } } - srcptr0 = (const float*)dstptr; - c0 = 1; } + else + CV_Error(Error::StsInternal, ""); } } @@ -343,7 +499,7 @@ public: std::vector inputs; std::vector outputs; - if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels) + if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME)) return false; inputs_.getUMatVector(inputs); @@ -446,8 +602,9 @@ public: CV_Assert(outputs.size() == 1); const int nstripes = getNumThreads(); - EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0], - coeffs, op, activ.get(), nstripes); + EltwiseInvoker::run(*this, + &inputs[0], (int)inputs.size(), outputs[0], + nstripes); } virtual Ptr initHalide(const std::vector > &input) CV_OVERRIDE @@ -558,6 +715,7 @@ public: CV_UNUSED(outputs); // suppress unused variable warning CV_Assert(inputs.size()); + // FIXIT: handle inputs with different number of channels long flops = inputs.size() * total(inputs[0]); return flops; diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index fecc5aaa8e..eced69555e 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -99,6 +99,7 @@ class Test_Darknet_layers : public DNNTestLayer public: void testDarknetLayer(const std::string& name, bool hasWeights = false) { + SCOPED_TRACE(name); Mat inp = blobFromNPY(findDataFile("dnn/darknet/" + name + "_in.npy")); Mat ref = blobFromNPY(findDataFile("dnn/darknet/" + name + "_out.npy")); @@ -115,6 +116,47 @@ public: net.setInput(inp); Mat out = net.forward(); normAssert(out, ref, "", default_l1, default_lInf); + + if (inp.size[0] == 1) // test handling of batch size + { + SCOPED_TRACE("batch size 2"); + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD && name == "shortcut") + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); +#endif + + std::vector sz2 = shape(inp); + sz2[0] = 2; + + Net net2 = readNet(cfg, model); + net2.setPreferableBackend(backend); + net2.setPreferableTarget(target); + Range ranges0[4] = { Range(0, 1), Range::all(), Range::all(), Range::all() }; + Range ranges1[4] = { Range(1, 2), Range::all(), Range::all(), Range::all() }; + Mat inp2(sz2, inp.type(), Scalar::all(0)); + inp.copyTo(inp2(ranges0)); + inp.copyTo(inp2(ranges1)); + net2.setInput(inp2); + Mat out2 = net2.forward(); + EXPECT_EQ(0, cv::norm(out2(ranges0), out2(ranges1), NORM_INF)) << "Batch result is not equal: " << name; + + Mat ref2 = ref; + if (ref.dims == 2 && out2.dims == 3) + { + int ref_3d_sizes[3] = {1, ref.rows, ref.cols}; + ref2 = Mat(3, ref_3d_sizes, ref.type(), (void*)ref.data); + } + /*else if (ref.dims == 3 && out2.dims == 4) + { + int ref_4d_sizes[4] = {1, ref.size[0], ref.size[1], ref.size[2]}; + ref2 = Mat(4, ref_4d_sizes, ref.type(), (void*)ref.data); + }*/ + ASSERT_EQ(out2.dims, ref2.dims) << ref.dims; + + normAssert(out2(ranges0), ref2, "", default_l1, default_lInf); + normAssert(out2(ranges1), ref2, "", default_l1, default_lInf); + } } }; diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 7c5c491fad..f9ff4ed883 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -1582,30 +1582,28 @@ TEST(Layer_Test_Convolution, relu_fusion) } typedef testing::TestWithParam > > Layer_Test_Eltwise_unequal; -TEST_P(Layer_Test_Eltwise_unequal, Accuracy) +TEST_P(Layer_Test_Eltwise_unequal, accuracy_input_0_truncate) { bool weighted = get<0>(GetParam()); int backendId = 
+
+TEST_P(Layer_Test_Eltwise_unequal, accuracy_input_0)
+{
+    bool weighted = get<0>(GetParam());
+    int backendId = get<0>(get<1>(GetParam()));
+    int targetId = get<1>(get<1>(GetParam()));
+
+    Net net;
+    LayerParams lp;
+    lp.type = "Eltwise";
+    lp.name = "testLayer";
+    lp.set("output_channels_mode", "input_0");
+
+    const int inpShapes[][4] = {{1, 4, 2, 2}, {1, 2, 2, 2}, {1, 3, 2, 2}};
+    const int out_channels = inpShapes[0][1];
+    std::vector<String> inpNames(3);
+    std::vector<Mat> inputs(3);
+
+    std::vector<float> weights(3, 1);
+    if (weighted)
+    {
+        for (int i = 0; i < inputs.size(); ++i)
+            weights[i] = -0.125f + i * 0.25f;
+        lp.set("coeff", DictValue::arrayReal(&weights[0], weights.size()));
+    }
+
+    int eltwiseId = net.addLayer(lp.name, lp.type, lp);
+    for (int i = 0; i < inputs.size(); ++i)
+    {
+        inputs[i].create(4, inpShapes[i], CV_32F);
+        size_t total = inputs[i].total();
+        for (size_t j = 0; j < total; j++)
+            inputs[i].ptr<float>()[j] = j + i * 100;
+        inpNames[i] = format("input_%d", i);
+        net.connect(0, i, eltwiseId, i);
+    }
+    Mat ref(4, inpShapes[0], CV_32F, Scalar(0));
+
+    net.setInputsNames(inpNames);
+    for (int i = 0; i < inputs.size(); ++i)
+    {
+        //std::cout << ref.reshape(1,1) << endl;
+        net.setInput(inputs[i], inpNames[i]);
+        for (size_t batchId = 0; batchId < ref.size[0]; batchId++)
+        {
+            int input_channels = inputs[i].size[1];
+            Range ranges[4] = { Range(batchId, batchId + 1), Range(0, std::min(out_channels, input_channels)), Range::all(), Range::all() };
+            Mat ref_slice = ref(ranges);
+            Mat input_slice = inputs[i](ranges);
+            ref_slice += weights[i] * input_slice;
+        }
+    }
+
+    net.setPreferableBackend(backendId);
+    net.setPreferableTarget(targetId);
+    Mat out = net.forward();
+    normAssert(out, ref);
+    if (testing::Test::HasFailure())
+    {
+        std::cout << out.reshape(1,1) << endl;
+        std::cout << ref.reshape(1,1) << endl;
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Eltwise_unequal, Combine(
     testing::Bool(),
     dnnBackendsAndTargets()
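The new and updated tests can be run in isolation with standard GTest filters (binary path assumes a regular OpenCV build; OPENCV_TEST_DATA_PATH must point to a checkout of opencv_extra/testdata):

    ./bin/opencv_test_dnn --gtest_filter='*Layer_Test_Eltwise_unequal*'
    ./bin/opencv_test_dnn --gtest_filter='*Darknet_layers*'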
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 928411d996..6f1c389307 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -1368,7 +1368,8 @@ double norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask)
     int normType0 = normType;
     normType = normType == NORM_L2SQR ? NORM_L2 : normType;

-    CV_Assert( src1.type() == src2.type() && src1.size == src2.size );
+    CV_CheckTypeEQ(src1.type(), src2.type(), "");
+    CV_Assert(src1.size == src2.size);
     CV_Assert( mask.empty() || (src1.size == mask.size && mask.type() == CV_8U) );
     CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
     const Mat *arrays[]={&src1, &src2, &mask, 0};