diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index b4829c72a6..5567a58a2a 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -606,7 +606,8 @@ public:
         if(IS_DNN_CUDA_TARGET(preferableTarget))
         {
             Ptr<EltwiseLayer> eltwise = top.dynamicCast<EltwiseLayer>();
-            if (!eltwise.empty()) // && eltwise->op == EltwiseLayer::SUM && eltwise->coeffs.empty())
+            Ptr<NaryEltwiseLayer> naryEltwise = top.dynamicCast<NaryEltwiseLayer>();
+            if (!eltwise.empty() || !naryEltwise.empty())
             {
                 /* we also need to check that the eltwise input does not require shortcut mechanism
                  * it's difficult to verify it here but we hope that `fuseLayers` has done the check already
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index 91eb7f3c0e..3232f0ae5c 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -681,17 +681,28 @@ public:
                 return Ptr<BackendNode>();
         }
 
-        auto op_ = [this] {
-            switch (op) {
-                case OPERATION::MAX: return cuda4dnn::EltwiseOpType::MAX;
-                case OPERATION::MIN: return cuda4dnn::EltwiseOpType::MIN;
-                case OPERATION::SUM: return cuda4dnn::EltwiseOpType::SUM;
-                case OPERATION::PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
-                case OPERATION::DIV: return cuda4dnn::EltwiseOpType::DIV;
-                case OPERATION::ADD: return cuda4dnn::EltwiseOpType::SUM;
-                default: CV_Error(Error::StsNotImplemented, "Other operators except MAX, MIN, SUM, PRODUCT and DIV are not supported with cuda.");
-            }
-        }();
+        cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
+        switch (op) {
+            case OPERATION::MAX:
+                op_ = cuda4dnn::EltwiseOpType::MAX;
+                break;
+            case OPERATION::MIN:
+                op_ = cuda4dnn::EltwiseOpType::MIN;
+                break;
+            case OPERATION::SUM:
+                op_ = cuda4dnn::EltwiseOpType::SUM;
+                break;
+            case OPERATION::PROD:
+                op_ = cuda4dnn::EltwiseOpType::PRODUCT;
+                break;
+            case OPERATION::DIV:
+                op_ = cuda4dnn::EltwiseOpType::DIV;
+                break;
+            case OPERATION::ADD:
+                op_ = cuda4dnn::EltwiseOpType::SUM;
+                break;
+            default: return Ptr<BackendNode>(); // return an empty cuda_node if the operation type is not supported
+        };
 
         return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, std::vector<float>());
     }
diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp
index 79365d0411..935f71833f 100644
--- a/modules/dnn/src/net_impl_fuse.cpp
+++ b/modules/dnn/src/net_impl_fuse.cpp
@@ -82,10 +82,11 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                     break;
                 }
 #endif
-                /* we use `tryFuse` member of convolution layer to fuse eltwise later
+                /* we use `tryFuse` member of convolution layer to fuse eltwise/naryEltwise later
                  * it's not intended to be fused here; hence, we stop when we encounter eltwise
                  */
-                if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" && nextData->type == "Eltwise")
+                if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" &&
+                    (nextData->type == "Eltwise" || nextData->type == "NaryEltwise"))
                     break;
                 Ptr<Layer> nextLayer = nextData->layerInstance;
                 if (currLayer->tryFuse(nextLayer))
@@ -335,22 +336,31 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
             }
 
             // OpenCL: fuse convolution layer followed by eltwise + relu
-            // CUDA: fuse convolution layer followed by eltwise (and optional activation)
+            // CUDA: fuse convolution layer followed by eltwise/naryEltwise (and optional activation)
             while (nextData &&
                 (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
                 ld.layerInstance->type == "Convolution"
             )  // semantic of 'if'
             {
                 Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
-                if (nextEltwiseLayer.empty())
+                Ptr<NaryEltwiseLayer> nextNaryEltwiseLayer = nextData->layerInstance.dynamicCast<NaryEltwiseLayer>();
+                if (nextEltwiseLayer.empty() && nextNaryEltwiseLayer.empty())
+                    break;
+
+                // TODO: fuse Conv+NaryEltwise on the OpenCL backend as well. At present, only the CUDA backend supports it.
+                if (IS_DNN_OPENCL_TARGET(preferableTarget) && nextNaryEltwiseLayer)
                     break;
 
 #ifdef HAVE_CUDA
                 // CUDA backend supports fusion with eltwise sum (without variable channels)
-                if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
+                if (IS_DNN_CUDA_TARGET(preferableTarget) && (!nextEltwiseLayer.empty() || !nextNaryEltwiseLayer.empty()))
                 {
                     // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
                     cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
+
+                    if (!nextData->layerInstance->supportBackend(DNN_BACKEND_CUDA))
+                        break;
+
                     const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
                     auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
 
@@ -408,7 +418,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                 {
                     LayerData *eltwiseData = nextData;
 
-                    // Eltwise layer has two inputs. We need to determine which
+                    // Eltwise/NaryEltwise layer has two inputs. We need to determine which
                     // is a base convolution layer and which could be used as it's bias.
                    LayerData* biasLayerData = 0;
                    for (int i = 0; i < 2; ++i)
@@ -483,7 +493,14 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                     * => activation(convolution + eltwise)
                     *    > fuse eltwise and then activation
                     */
-                    auto layer = nextEltwiseLayer.staticCast<Layer>();
+                    Ptr<Layer> layer = nullptr;
+                    if (nextNaryEltwiseLayer)
+                        layer = nextNaryEltwiseLayer.staticCast<Layer>();
+                    else if (nextEltwiseLayer)
+                        layer = nextEltwiseLayer.staticCast<Layer>();
+                    else
+                        CV_Error(Error::StsError, "Both nextNaryEltwiseLayer and nextEltwiseLayer are empty!");
+
                    if (currLayer->tryFuse(layer))
                    {
                        fuse_eltwise = true; /* eltwise was successfully fused */
@@ -511,7 +528,14 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                        CV_Assert(nextData);
                        CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
                        ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
-                        printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+
+                        if (nextEltwiseLayer)
+                            printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+                        else if (nextNaryEltwiseLayer)
+                            printf_(("\tfused with %s\n", nextNaryEltwiseLayer->name.c_str()));
+                        else
+                            CV_Error(Error::StsError, "Both nextNaryEltwiseLayer and nextEltwiseLayer are empty!");
+
                        printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
                        eltwiseData->skip = true;
                        nextData->skip = true;
@@ -554,12 +578,19 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                        }
                    }
                }
-                else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
+                else if (fuse_eltwise) // conv + eltwise/naryEltwise (note: conv could have fused activations before eltwise)
                {
                    CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
                    CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
                    ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
-                    printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+
+                    if (nextEltwiseLayer)
+                        printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+                    else if (nextNaryEltwiseLayer)
+                        printf_(("\tfused with %s\n", nextNaryEltwiseLayer->name.c_str()));
+                    else
+                        CV_Error(Error::StsError, "Both nextNaryEltwiseLayer and nextEltwiseLayer are empty!");
+
                    eltwiseData->skip = true;
                    // This optimization is for cases like
                    // some_layer   conv (maybe fused with activ)
@@ -682,6 +713,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                              inp_i_data->layerInstance->type != "Permute" &&
                              inp_i_data->layerInstance->type != "Reorg" &&
                              inp_i_data->layerInstance->type != "Eltwise" &&
+                             inp_i_data->layerInstance->type != "NaryEltwise" &&
                              inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
                        {
                            break;
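
Note (not part of the patch): the following is a minimal sketch of how the Conv+NaryEltwise fusion path added above could be exercised through the public DNN API. The model file "conv_add.onnx" and the input shape are hypothetical placeholders; the only assumption is that the ONNX graph contains a Convolution followed by an element-wise Add, which OpenCV imports as a NaryEltwise layer.

    // Sketch only: runs inference on the CUDA backend so that fuseLayers()
    // gets a chance to fuse the Convolution + NaryEltwise(SUM) pair.
    // Requires an OpenCV build with CUDA support; otherwise DNN falls back to CPU.
    #include <opencv2/dnn.hpp>
    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        cv::dnn::Net net = cv::dnn::readNetFromONNX("conv_add.onnx"); // hypothetical Conv -> Add model
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

        // Dummy NCHW input; the shape is a placeholder and must match the model.
        int sz[] = {1, 3, 224, 224};
        cv::Mat input(4, sz, CV_32F, cv::Scalar(1.0f));
        net.setInput(input);

        // Network setup (including layer fusion) happens lazily on the first forward pass.
        cv::Mat out = net.forward();
        std::cout << "output dims: " << out.dims << ", total elements: " << out.total() << std::endl;
        return 0;
    }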