diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 95643a287b..d9fd4e99a1 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -148,7 +148,13 @@ private: #else cv::dnn::Net net; cv::dnn::LayerParams lp; - net.addLayerToPrev("testLayer", "Identity", lp); + lp.set("kernel_size", 1); + lp.set("num_output", 1); + lp.set("bias_term", false); + lp.type = "Convolution"; + lp.name = "testLayer"; + lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1))); + net.addLayerToPrev(lp.name, lp.type, lp); net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE); net.setPreferableTarget(target); static int inpDims[] = {1, 2, 3, 4}; @@ -2676,7 +2682,7 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin) backendNode->net = Ptr(new InfEngineBackendNet(ieNet)); for (auto& it : ieNet.getOutputsInfo()) { - Ptr cvLayer(new InfEngineBackendLayer(it.second)); + Ptr cvLayer(new InfEngineBackendLayer(ieNet)); InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str()); CV_Assert(ieLayer); @@ -2871,8 +2877,7 @@ void Net::forward(std::vector >& outputBlobs, std::vector pins; for (int i = 0; i < outBlobNames.size(); i++) { - std::vector lp = impl->getLayerOutPins(outBlobNames[i]); - pins.insert(pins.end(), lp.begin(), lp.end()); + pins.push_back(impl->getPinByAlias(outBlobNames[i])); } impl->setUpNet(pins); @@ -2885,9 +2890,10 @@ void Net::forward(std::vector >& outputBlobs, for (int i = 0; i < outBlobNames.size(); i++) { std::vector lp = impl->getLayerOutPins(outBlobNames[i]); - for (int i = 0; i < lp.size(); i++) + outputBlobs[i].resize(lp.size()); + for (int j = 0; j < lp.size(); j++) { - outputBlobs[i].push_back(impl->getBlob(lp[i])); + outputBlobs[i][j] = impl->getBlob(lp[j]); } } } diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 9f8590bea7..96336808a0 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -110,14 +110,25 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE -#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) - InferenceEngine::Builder::SplitLayer ieLayer(name); - ieLayer.setOutputPorts({InferenceEngine::Port()}); - return Ptr(new InfEngineBackendNode(ieLayer)); -#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); CV_Assert(!input->dims.empty()); - +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + if (preferableTarget == DNN_TARGET_MYRIAD) + { + ieLayer.setType("Copy"); + } + else + { + ieLayer.setType("Split"); + ieLayer.getParameters()["axis"] = input->dims.size() - 1; + ieLayer.getParameters()["out_sizes"] = input->dims[0]; + } + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Split"; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 21a13c8d47..57f6054538 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -281,7 +281,7 @@ public: const int outCn = blobs[0].size[0]; // prepare weightsMat where each row is aligned and has enough zero padding on the right to // use vectorized (i.e. 
with intrinsics) loops without tail processing - Mat wm = blobs[0].reshape(1, outCn).clone(); + Mat wm = blobs[0].reshape(1, outCn); if( wm.step1() % VEC_ALIGN != 0 ) { int newcols = (int)alignSize(wm.step1(), VEC_ALIGN); @@ -374,6 +374,10 @@ public: if (!w.empty()) { + // Keep origin weights unchanged. + if (weightsMat.data == blobs[0].data) + weightsMat = weightsMat.clone(); + Mat originWeights = blobs[0].reshape(1, outCn); for (int i = 0; i < outCn; ++i) { @@ -551,13 +555,13 @@ public: #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) InferenceEngine::Builder::ConvolutionLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setDilation({dilation.height, dilation.width}); - ieLayer.setPaddingsBegin({pad.height, pad.width}); - ieLayer.setPaddingsEnd({pad.height, pad.width}); - ieLayer.setGroup(group); - ieLayer.setOutDepth(outCn); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width}); + ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setGroup((size_t)group); + ieLayer.setOutDepth((size_t)outCn); ieLayer.setWeights(ieWeights); if (ieBiases) @@ -1220,7 +1224,7 @@ public: #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { - if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width)) + if (INF_ENGINE_RELEASE >= 2018050000 && (adjustPad.height || adjustPad.width)) return false; const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout @@ -1783,13 +1787,13 @@ public: InferenceEngine::Builder::DeconvolutionLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setDilation({dilation.height, dilation.width}); - ieLayer.setPaddingsBegin({pad.height, pad.width}); - ieLayer.setPaddingsEnd({pad.height, pad.width}); - ieLayer.setGroup(group); - ieLayer.setOutDepth(numOutput); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width}); + ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setGroup((size_t)group); + ieLayer.setOutDepth((size_t)numOutput); ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW)); if (hasBias()) diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index bfcc1068e1..5b357fec91 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -299,10 +299,10 @@ public: if (type == MAX || type == AVE) { InferenceEngine::Builder::PoolingLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setPaddingsBegin({pad_t, pad_l}); - ieLayer.setPaddingsEnd({pad_b, pad_r}); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setPaddingsBegin({(size_t)pad_t, (size_t)pad_l}); + ieLayer.setPaddingsEnd({(size_t)pad_b, (size_t)pad_r}); ieLayer.setPoolingType(type == MAX ? 
InferenceEngine::Builder::PoolingLayer::PoolingType::MAX : InferenceEngine::Builder::PoolingLayer::PoolingType::AVG); diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 98de907b9e..1d21021e34 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -82,7 +82,7 @@ void InfEngineBackendNet::connect(const std::vector >& input CV_Assert(it != layers.end()); const int layerId = it->second; - for (int i = 0; i < inpWrappers.size(); ++i) + for (size_t i = 0; i < inpWrappers.size(); ++i) { const auto& inp = inpWrappers[i]; const std::string& inpName = inp->dataPtr->name; @@ -103,7 +103,7 @@ void InfEngineBackendNet::connect(const std::vector >& input else inpId = it->second; - netBuilder.connect(inpId, {layerId, i}); + netBuilder.connect((size_t)inpId, {(size_t)layerId, i}); unconnectedLayersIds.erase(inpId); } CV_Assert(!outputs.empty()); @@ -119,7 +119,7 @@ void InfEngineBackendNet::init(int targetId) for (int id : unconnectedLayersIds) { InferenceEngine::Builder::OutputLayer outLayer("myconv1"); - netBuilder.addLayer({id}, outLayer); + netBuilder.addLayer({InferenceEngine::PortInfo(id)}, outLayer); } cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build())); } @@ -718,19 +718,33 @@ Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob) return Mat(size, CV_32F, (void*)blob->buffer()); } -InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_) -{ - output = output_; -} - bool InfEngineBackendLayer::getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const { - std::vector dims = output->dims; - std::vector shape(dims.rbegin(), dims.rend()); - outputs.assign(1, shape); + InferenceEngine::ICNNNetwork::InputShapes inShapes = t_net.getInputShapes(); + InferenceEngine::ICNNNetwork::InputShapes::iterator itr; + bool equal_flag = true; + size_t i = 0; + for (itr = inShapes.begin(); itr != inShapes.end(); ++itr) + { + InferenceEngine::SizeVector currentInShape(inputs[i].begin(), inputs[i].end()); + if (itr->second != currentInShape) + { + itr->second = currentInShape; + equal_flag = false; + } + i++; + } + + if (!equal_flag) + { + InferenceEngine::CNNNetwork curr_t_net(t_net); + curr_t_net.reshape(inShapes); + } + std::vector dims = t_net.getOutputsInfo()[name]->getDims(); + outputs.push_back(MatShape(dims.begin(), dims.end())); return false; } diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index a224767f8d..1e35612555 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -260,7 +260,7 @@ InferenceEngine::TBlob::Ptr convertFp16(const InferenceEngine::Blob::Pt class InfEngineBackendLayer : public Layer { public: - InfEngineBackendLayer(const InferenceEngine::DataPtr& output); + InfEngineBackendLayer(const InferenceEngine::CNNNetwork &t_net_) : t_net(t_net_) {}; virtual bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, @@ -273,7 +273,7 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE; private: - InferenceEngine::DataPtr output; + InferenceEngine::CNNNetwork t_net; }; #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 62e625f03c..06aec7da13 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -236,6 +236,10 @@ TEST_P(Test_Caffe_layers, Dropout) TEST_P(Test_Caffe_layers, Concat) { +#if 
defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE > 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) + throw SkipTestException(""); +#endif testLayerUsingCaffeModels("layer_concat"); testLayerUsingCaffeModels("layer_concat_optim", true, false); testLayerUsingCaffeModels("layer_concat_shared_input", true, false); @@ -923,8 +927,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) { Target targetId = GetParam(); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt")); - Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); Mat inp = blobFromNPY(_tf("blob.npy")); @@ -935,22 +940,15 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) net.setInput(inp); net.setPreferableTarget(targetId); - if (targetId != DNN_TARGET_MYRIAD) - { - Mat out = net.forward(); + Mat out = net.forward(); - normAssert(outDefault, out); + double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.4e-3 : 1e-5; + double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4; + normAssert(outDefault, out, "", l1, lInf); - std::vector outLayers = net.getUnconnectedOutLayers(); - ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge"); - ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat"); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + std::vector outLayers = net.getUnconnectedOutLayers(); + ASSERT_EQ(net.getLayer(outLayers[0])->name, "output"); + ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution"); } TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) @@ -962,23 +960,16 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) randu(inputs[0], 0, 255); inputs[0].convertTo(inputs[1], CV_32F); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; + Mat outs[2]; for (int i = 0; i < 2; ++i) { - Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); net.setPreferableTarget(targetId); net.setInput(inputs[i]); - if (targetId != DNN_TARGET_MYRIAD) - { - outs[i] = net.forward(); - ASSERT_EQ(outs[i].type(), CV_32F); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + outs[i] = net.forward(); + ASSERT_EQ(outs[i].type(), CV_32F); } if (targetId != DNN_TARGET_MYRIAD) normAssert(outs[0], outs[1]); @@ -1008,8 +999,8 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT, // net.save('/path/to/caffemodel') // // 3. Convert using ModelOptimizer. 
-typedef testing::TestWithParam > Test_DLDT_two_inputs; -TEST_P(Test_DLDT_two_inputs, as_IR) +typedef testing::TestWithParam > > Test_DLDT_two_inputs_3dim; +TEST_P(Test_DLDT_two_inputs_3dim, as_IR) { int firstInpType = get<0>(GetParam()); int secondInpType = get<1>(GetParam()); @@ -1020,32 +1011,39 @@ TEST_P(Test_DLDT_two_inputs, as_IR) throw SkipTestException("Test is enabled starts from OpenVINO 2018R4"); #endif - Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); - int inpSize[] = {1, 2, 3}; - Mat firstInp(3, &inpSize[0], firstInpType); - Mat secondInp(3, &inpSize[0], secondInpType); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; + Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin")); + std::vector inpSize = get<3>(GetParam()); + Mat firstInp(3, inpSize.data(), firstInpType); + Mat secondInp(3, inpSize.data(), secondInpType); randu(firstInp, 0, 255); randu(secondInp, 0, 255); net.setInput(firstInp, "data"); net.setInput(secondInp, "second_input"); net.setPreferableTarget(targetId); - if (targetId != DNN_TARGET_MYRIAD) - { - Mat out = net.forward(); - Mat ref; - cv::add(firstInp, secondInp, ref, Mat(), CV_32F); - normAssert(out, ref); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + double l1 = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) && + (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.06 : 0.0; + double lInf = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) && + (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.23 : 0.0; + + Mat out = net.forward(); + + Mat ref; + cv::add(firstInp, secondInp, ref, Mat(), CV_32F); + normAssert(out, ref, "", l1, lInf); } +std::vector< std::vector > list_sizes{ {1, 2, 3}, {3, 2, 1}, {5, 5, 5}, {13, 7, 11} }; + +INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs_3dim, Combine( + Values(CV_8U, CV_32F), Values(CV_8U, CV_32F), + testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)), + testing::ValuesIn(list_sizes) +)); + +typedef testing::TestWithParam > Test_DLDT_two_inputs; TEST_P(Test_DLDT_two_inputs, as_backend) { static const float kScale = 0.5f; diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index fa528b5c4b..60e3313048 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -308,4 +308,38 @@ TEST_P(DeprecatedForward, CustomLayerWithFallback) INSTANTIATE_TEST_CASE_P(/**/, DeprecatedForward, dnnBackendsAndTargets()); +TEST(Net, forwardAndRetrieve) +{ + std::string prototxt = + "input: \"data\"\n" + "layer {\n" + " name: \"testLayer\"\n" + " type: \"Slice\"\n" + " bottom: \"data\"\n" + " top: \"firstCopy\"\n" + " top: \"secondCopy\"\n" + " slice_param {\n" + " axis: 0\n" + " slice_point: 2\n" + " }\n" + "}"; + Net net = readNetFromCaffe(&prototxt[0], prototxt.size()); + net.setPreferableBackend(DNN_BACKEND_OPENCV); + + Mat inp(4, 5, CV_32F); + randu(inp, -1, 1); + net.setInput(inp); + + std::vector outNames; + outNames.push_back("testLayer"); + std::vector > outBlobs; + + net.forward(outBlobs, outNames); + + EXPECT_EQ(outBlobs.size(), 1); + EXPECT_EQ(outBlobs[0].size(), 2); + normAssert(outBlobs[0][0], inp.rowRange(0, 2), "first part"); + normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part"); +} + }} // namespace diff --git a/modules/dnn/test/test_onnx_importer.cpp 
b/modules/dnn/test/test_onnx_importer.cpp index acdd66631c..217ef34421 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -395,7 +395,7 @@ TEST_P(Test_ONNX_nets, DenseNet121) TEST_P(Test_ONNX_nets, Inception_v1) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException("Test is disabled for OpenVINO 2018R5"); #endif diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index b20b166551..7ddda7f03a 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -241,7 +241,7 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten) TEST_P(Test_TensorFlow_layers, leaky_relu) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) throw SkipTestException(""); #endif @@ -388,7 +388,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException("Unstable test case"); #endif diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index 7503eb9321..e4092ccb16 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -230,4 +230,27 @@ PERF_TEST_P(Size_MatType_BorderType, blur5x5, SANITY_CHECK(dst, 1); } +///////////// BlendLinear //////////////////////// +PERF_TEST_P(Size_MatType, BlendLinear, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, sz2160p), + testing::Values(CV_8UC1, CV_32FC1, CV_8UC3, CV_32FC3, CV_8UC4, CV_32FC4) + ) + ) +{ + const Size srcSize = get<0>(GetParam()); + const int srcType = get<1>(GetParam()); + + Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType); + Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst); + randu(weights1, 0, 1); + randu(weights2, 0, 1); + + TEST_CYCLE() blendLinear(src1, src2, weights1, weights2, dst); + + SANITY_CHECK_NOTHING(); +} + } // namespace diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp index 1a4ad0d525..e0ee9ec0c5 100644 --- a/modules/imgproc/src/blend.cpp +++ b/modules/imgproc/src/blend.cpp @@ -48,44 +48,44 @@ #include "opencv2/core/hal/intrin.hpp" namespace cv { -#if CV_SIMD128 -static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2) +#if CV_SIMD +static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2) { - const v_float32x4 v_eps = v_setall_f32(1e-5f); - v_float32x4 v_denom = v_w1 + v_w2 + v_eps; + const v_float32 v_eps = vx_setall_f32(1e-5f); + v_float32 v_denom = v_w1 + v_w2 + v_eps; return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom; } -static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) +static inline v_float32 blend(const v_float32& 
v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) { - v_float32x4 v_w1 = v_load(w_ptr1 + offset); - v_float32x4 v_w2 = v_load(w_ptr2 + offset); + v_float32 v_w1 = vx_load(w_ptr1 + offset); + v_float32 v_w2 = vx_load(w_ptr2 + offset); return blend(v_src1, v_src2, v_w1, v_w2); } -static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec) +static inline v_uint32 saturate_f32_u32(const v_float32& vec) { - const v_int32x4 z = v_setzero_s32(); - const v_int32x4 x = v_setall_s32(255); + const v_int32 z = vx_setzero_s32(); + const v_int32 x = vx_setall_s32(255); return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x)); } -static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3) +static inline v_uint8 pack_f32tou8(v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3) { - v_uint32x4 a = saturate_f32_u32(val0); - v_uint32x4 b = saturate_f32_u32(val1); - v_uint32x4 c = saturate_f32_u32(val2); - v_uint32x4 d = saturate_f32_u32(val3); - v_uint16x8 e = v_pack(a, b); - v_uint16x8 f = v_pack(c, d); + v_uint32 a = saturate_f32_u32(val0); + v_uint32 b = saturate_f32_u32(val1); + v_uint32 c = saturate_f32_u32(val2); + v_uint32 d = saturate_f32_u32(val3); + v_uint16 e = v_pack(a, b); + v_uint16 f = v_pack(c, d); return v_pack(e, f); } -static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3) +static inline void store_pack_f32tou8(uchar* ptr, v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3) { v_store((ptr), pack_f32tou8(val0, val1, val2, val3)); } -static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3) +static inline void expand_u8tof32(const v_uint8& src, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3) { - v_uint16x8 a0, a1; + v_uint16 a0, a1; v_expand(src, a0, a1); - v_uint32x4 b0, b1,b2,b3; + v_uint32 b0, b1,b2,b3; v_expand(a0, b0, b1); v_expand(a1, b2, b3); dst0 = v_cvt_f32(v_reinterpret_as_s32(b0)); @@ -93,71 +93,69 @@ static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_fl dst2 = v_cvt_f32(v_reinterpret_as_s32(b2)); dst3 = v_cvt_f32(v_reinterpret_as_s32(b3)); } -static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3) +static inline void load_expand_u8tof32(const uchar* ptr, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3) { - v_uint8x16 a = v_load((ptr)); + v_uint8 a = vx_load((ptr)); expand_u8tof32(a, dst0, dst1, dst2, dst3); } -int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn); -int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn); -int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn) +int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn); +int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn); +int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn) { - 
int step = v_uint8x16::nlanes * cn; - int weight_step = v_uint8x16::nlanes; switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13; - v_float32x4 v_src20, v_src21, v_src22, v_src23; + v_float32 v_src10, v_src11, v_src12, v_src13; + v_float32 v_src20, v_src21, v_src22, v_src23; load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13); load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); - v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12); + v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); + v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes); + v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; case 2: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_uint8x16 v_src10, v_src11, v_src20, v_src21; + v_uint8 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); v_load_deinterleave(src2 + x, v_src20, v_src21); - v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113; - v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213; + v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113; + v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213; expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103); expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113); expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203); expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213); - v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); - v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); - v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12); - v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12); + v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); + v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); + v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes); + v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 
v_float32::nlanes); + v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes); - v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); - v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); + v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); + v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); v_store_interleave(dst + x, v_dsta, v_dstb); } break; case 3: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; + v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22); - v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123; - v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223; + v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123; + v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223; expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103); expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113); expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123); @@ -165,14 +163,14 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213); expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223); - v_float32x4 v_w10 = v_load(weights1 + weight_offset); - v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4); - v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8); - v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12); - v_float32x4 v_w20 = v_load(weights2 + weight_offset); - v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4); - v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8); - v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12); + v_float32 v_w10 = vx_load(weights1 + weight_offset); + v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes); + v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes); + v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w20 = vx_load(weights2 + weight_offset); + v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes); + v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes); + v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes); v_src100 = blend(v_src100, v_src200, v_w10, v_w20); v_src110 = blend(v_src110, v_src210, v_w10, v_w20); v_src120 = blend(v_src120, v_src220, v_w10, v_w20); @@ -187,34 +185,36 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight v_src123 = blend(v_src123, v_src223, v_w13, v_w23); - v_uint8x16 
v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103); - v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113); - v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123); + v_uint8 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103); + v_uint8 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113); + v_uint8 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2); } break; case 4: - step = v_uint8x16::nlanes; - weight_step = v_float32x4::nlanes; - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17; - v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27; + v_float32 v_src10, v_src11, v_src12, v_src13; + v_float32 v_src20, v_src21, v_src22, v_src23; load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13); load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17); - v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27); + v_float32 v_w10, v_w11, v_w12, v_w13, v_w20, v_w21, v_w22, v_w23, v_w0, v_w1; + v_w10 = vx_load(weights1 + weight_offset); + v_zip(v_w10, v_w10, v_w0, v_w1); + v_zip(v_w0, v_w0, v_w10, v_w11); + v_zip(v_w1, v_w1, v_w12, v_w13); + v_w20 = vx_load(weights2 + weight_offset); + v_zip(v_w20, v_w20, v_w0, v_w1); + v_zip(v_w0, v_w0, v_w20, v_w21); + v_zip(v_w1, v_w1, v_w22, v_w23); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); - v_src10 = blend(v_src14, v_src24, v_w1, v_w2); - v_src11 = blend(v_src15, v_src25, v_w1, v_w2); - v_src12 = blend(v_src16, v_src26, v_w1, v_w2); - v_src13 = blend(v_src17, v_src27, v_w1, v_w2); + v_float32 v_dst0, v_dst1, v_dst2, v_dst3; + v_dst0 = blend(v_src10, v_src20, v_w10, v_w20); + v_dst1 = blend(v_src11, v_src21, v_w11, v_w21); + v_dst2 = blend(v_src12, v_src22, v_w12, v_w22); + v_dst3 = blend(v_src13, v_src23, v_w13, v_w23); - v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3; - v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; @@ -224,68 +224,67 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight return x; } -int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn) +int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn) { - int step = v_float32x4::nlanes*cn; switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src1 = v_load(src1 + x); - v_float32x4 v_src2 = v_load(src2 + x); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_src1 = vx_load(src1 + x); + v_float32 v_src2 = vx_load(src2 + x); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + 
weight_offset); - v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2); + v_float32 v_dst = blend(v_src1, v_src2, v_w1, v_w2); v_store(dst + x, v_dst); } break; case 2: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src20, v_src21; + v_float32 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); v_load_deinterleave(src2 + x, v_src20, v_src21); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1); } break; case 3: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; + v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); - v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2); } break; case 4: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; + v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); - v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); - v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); + v_float32 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } @@ -321,8 +320,8 @@ public: T * const dst_row 
= dst->ptr(y); int x = 0; - #if CV_SIMD128 - x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); + #if CV_SIMD + x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); #endif for ( ; x < width; ++x) diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.cpp index 0606aec578..d565b9486d 100644 --- a/modules/imgproc/src/median_blur.cpp +++ b/modules/imgproc/src/median_blur.cpp @@ -110,15 +110,19 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2; CV_Assert(cn > 0 && cn <= 4); size_t sstep = _src.step, dstep = _dst.step; - Histogram CV_DECL_ALIGNED(16) H[4]; - HT CV_DECL_ALIGNED(16) luc[4][16]; int STRIPE_SIZE = std::min( _dst.cols, 512/cn ); - std::vector _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16); - std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16); - HT* h_coarse = alignPtr(&_h_coarse[0], 16); - HT* h_fine = alignPtr(&_h_fine[0], 16); +#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16 +# define CV_ALIGNMENT CV_SIMD_WIDTH +#else +# define CV_ALIGNMENT 16 +#endif + + std::vector _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); + std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); + HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT); + HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT); for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) { @@ -148,10 +152,14 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) const uchar* p0 = src + sstep * std::max( 0, i-r-1 ); const uchar* p1 = src + sstep * std::min( m-1, i+r ); - memset( H, 0, cn*sizeof(H[0]) ); - memset( luc, 0, cn*sizeof(luc[0]) ); for( c = 0; c < cn; c++ ) { + Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H; + HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16]; + + memset(&H, 0, sizeof(H)); + memset(luc, 0, sizeof(luc)); + // Update column histograms for the entire row. 
for( j = 0; j < n; j++ ) { @@ -163,21 +171,21 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) for (k = 0; k < 16; ++k) { #if CV_SIMD256 - v_store(H[c].fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H[c].fine[k])); + v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k])); #elif CV_SIMD128 - v_store(H[c].fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k])); - v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8)); + v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k])); + v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8)); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); + H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); #endif } #if CV_SIMD256 - v_uint16x16 v_coarse = v256_load(H[c].coarse); + v_uint16x16 v_coarse = v256_load(H.coarse); #elif CV_SIMD128 - v_uint16x8 v_coarsel = v_load(H[c].coarse); - v_uint16x8 v_coarseh = v_load(H[c].coarse + 8); + v_uint16x8 v_coarsel = v_load(H.coarse); + v_uint16x8 v_coarseh = v_load(H.coarse + 8); #endif HT* px = h_coarse + 16 * n*c; for( j = 0; j < 2*r; ++j, px += 16 ) @@ -189,7 +197,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_coarseh += v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] += px[ind]; + H.coarse[ind] += px[ind]; #endif } @@ -201,24 +209,24 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) px = h_coarse + 16 * (n*c + std::min(j + r, n - 1)); #if CV_SIMD256 v_coarse += v256_load(px); - v_store(H[c].coarse, v_coarse); + v_store(H.coarse, v_coarse); #elif CV_SIMD128 v_coarsel += v_load(px); v_coarseh += v_load(px + 8); - v_store(H[c].coarse, v_coarsel); - v_store(H[c].coarse + 8, v_coarseh); + v_store(H.coarse, v_coarsel); + v_store(H.coarse + 8, v_coarseh); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] += px[ind]; + H.coarse[ind] += px[ind]; #endif // Find median at coarse level for ( k = 0; k < 16 ; ++k ) { - sum += H[c].coarse[k]; + sum += H.coarse[k]; if ( sum > t ) { - sum -= H[c].coarse[k]; + sum -= H.coarse[k]; break; } } @@ -231,7 +239,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_uint16x8 v_finel; v_uint16x8 v_fineh; #endif - if ( luc[c][k] <= j-r ) + if ( luc[k] <= j-r ) { #if CV_SIMD256 v_fine = v256_setzero_u16(); @@ -239,10 +247,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_finel = v_setzero_u16(); v_fineh = v_setzero_u16(); #else - memset(&H[c].fine[k], 0, 16 * sizeof(HT)); + memset(&H.fine[k], 0, 16 * sizeof(HT)); #endif px = h_fine + 16 * (n*(16 * c + k) + j - r); - for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16) + for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16) { #if CV_SIMD256 v_fine += v256_load(px); @@ -251,11 +259,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_fineh += v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += px[ind]; + H.fine[k][ind] += px[ind]; #endif } - if ( luc[c][k] < j+r+1 ) + if ( luc[k] < j+r+1 ) { px = h_fine + 16 * (n*(16 * c + 
k) + (n - 1)); #if CV_SIMD256 @@ -265,50 +273,50 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]); + H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]); #endif - luc[c][k] = (HT)(j+r+1); + luc[k] = (HT)(j+r+1); } } else { #if CV_SIMD256 - v_fine = v256_load(H[c].fine[k]); + v_fine = v256_load(H.fine[k]); #elif CV_SIMD128 - v_finel = v_load(H[c].fine[k]); - v_fineh = v_load(H[c].fine[k] + 8); + v_finel = v_load(H.fine[k]); + v_fineh = v_load(H.fine[k] + 8); #endif px = h_fine + 16*n*(16 * c + k); - for ( ; luc[c][k] < j+r+1; ++luc[c][k] ) + for ( ; luc[k] < j+r+1; ++luc[k] ) { #if CV_SIMD256 - v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0)); + v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); #elif CV_SIMD128 - v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1) ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0)); - v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8); + v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); + v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind]; + H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind]; #endif } } px = h_coarse + 16 * (n*c + MAX(j - r, 0)); #if CV_SIMD256 - v_store(H[c].fine[k], v_fine); + v_store(H.fine[k], v_fine); v_coarse -= v256_load(px); #elif CV_SIMD128 - v_store(H[c].fine[k], v_finel); - v_store(H[c].fine[k] + 8, v_fineh); + v_store(H.fine[k], v_finel); + v_store(H.fine[k] + 8, v_fineh); v_coarsel -= v_load(px); v_coarseh -= v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] -= px[ind]; + H.coarse[ind] -= px[ind]; #endif /* Find median in segment */ - segment = H[c].fine[k]; + segment = H.fine[k]; for ( b = 0; b < 16 ; b++ ) { sum += segment[b]; diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index d212237a37..6aa9d0279f 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -112,6 +112,7 @@ struct PyrDownVec_32s8u v_rshr_pack_store<8>(dst + x, t0); x += v_uint16::nlanes; } + typedef int CV_DECL_ALIGNED(1) unaligned_int; for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32x4 r0, r1, r2, r3, r4, t0; @@ -122,7 +123,7 @@ struct PyrDownVec_32s8u r4 = v_load(row4 + x); t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); + *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } return x; diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index c942264e00..1aed1fa031 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -123,139 +123,125 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, } } - // Pointer to row vectors - uchar *p_src, *c_src, *n_src; // 
previous, current, next row - short *c_dx, *c_dy; - int i_start = 0; int j_start = 0; -#if CV_SIMD128 - if(hasSIMD128()) +#if CV_SIMD + // Characters in variable names have the following meanings: + // u: unsigned char + // s: signed int + // + // [row][column] + // m: offset -1 + // n: offset 0 + // p: offset 1 + // Example: umn is offset -1 in row and offset 0 in column + for ( i = 0; i < H - 1; i += 2 ) { - uchar *m_src; - short *n_dx, *n_dy; + uchar *p_src = src.ptr(i == 0 ? i_top : i - 1); + uchar *c_src = src.ptr(i); + uchar *n_src = src.ptr(i+1); + uchar *m_src = src.ptr(i == H - 2 ? i_bottom : i + 2); - // Characters in variable names have the following meanings: - // u: unsigned char - // s: signed int - // - // [row][column] - // m: offset -1 - // n: offset 0 - // p: offset 1 - // Example: umn is offset -1 in row and offset 0 in column - for ( i = 0; i < H - 1; i += 2 ) + short *c_dx = dx.ptr(i); + short *c_dy = dy.ptr(i); + short *n_dx = dx.ptr(i+1); + short *n_dy = dy.ptr(i+1); + + // Process rest of columns 16-column chunks at a time + for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes) { - if ( i == 0 ) p_src = src.ptr(i_top); - else p_src = src.ptr(i-1); + // Load top row for 3x3 Sobel filter + v_uint8 v_um = vx_load(&p_src[j-1]); + v_uint8 v_un = vx_load(&p_src[j]); + v_uint8 v_up = vx_load(&p_src[j+1]); + v_uint16 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s1m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s1m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s1n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s1n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s1p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s1p2 = v_reinterpret_as_s16(v_up2); - c_src = src.ptr(i); - n_src = src.ptr(i+1); + // Load second row for 3x3 Sobel filter + v_um = vx_load(&c_src[j-1]); + v_un = vx_load(&c_src[j]); + v_up = vx_load(&c_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s2m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s2m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s2n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s2n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s2p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s2p2 = v_reinterpret_as_s16(v_up2); - if ( i == H - 2 ) m_src = src.ptr(i_bottom); - else m_src = src.ptr(i+2); + // Load third row for 3x3 Sobel filter + v_um = vx_load(&n_src[j-1]); + v_un = vx_load(&n_src[j]); + v_up = vx_load(&n_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s3m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s3m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s3n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s3n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s3p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s3p2 = v_reinterpret_as_s16(v_up2); - c_dx = dx.ptr(i); - c_dy = dy.ptr(i); - n_dx = dx.ptr(i+1); - n_dy = dy.ptr(i+1); + // dx & dy for rows 1, 2, 3 + v_int16 v_sdx1, v_sdy1; + spatialGradientKernel( v_sdx1, v_sdy1, + v_s1m1, v_s1n1, v_s1p1, + v_s2m1, v_s2p1, + v_s3m1, v_s3n1, v_s3p1 ); - // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - 16; j += 16 ) - { - // Load top row for 3x3 Sobel filter - v_uint8x16 v_um = v_load(&p_src[j-1]); - v_uint8x16 v_un = v_load(&p_src[j]); - v_uint8x16 v_up = v_load(&p_src[j+1]); - v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; - v_expand(v_um, v_um1, v_um2); - 
v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); + v_int16 v_sdx2, v_sdy2; + spatialGradientKernel( v_sdx2, v_sdy2, + v_s1m2, v_s1n2, v_s1p2, + v_s2m2, v_s2p2, + v_s3m2, v_s3n2, v_s3p2 ); - // Load second row for 3x3 Sobel filter - v_um = v_load(&c_src[j-1]); - v_un = v_load(&c_src[j]); - v_up = v_load(&c_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); + // Store + v_store(&c_dx[j], v_sdx1); + v_store(&c_dx[j+v_int16::nlanes], v_sdx2); + v_store(&c_dy[j], v_sdy1); + v_store(&c_dy[j+v_int16::nlanes], v_sdy2); - // Load third row for 3x3 Sobel filter - v_um = v_load(&n_src[j-1]); - v_un = v_load(&n_src[j]); - v_up = v_load(&n_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); + // Load fourth row for 3x3 Sobel filter + v_um = vx_load(&m_src[j-1]); + v_un = vx_load(&m_src[j]); + v_up = vx_load(&m_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s4m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s4m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s4n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s4n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s4p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2); - // dx & dy for rows 1, 2, 3 - v_int16x8 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, - v_s1m1, v_s1n1, v_s1p1, - v_s2m1, v_s2p1, - v_s3m1, v_s3n1, v_s3p1 ); + // dx & dy for rows 2, 3, 4 + spatialGradientKernel( v_sdx1, v_sdy1, + v_s2m1, v_s2n1, v_s2p1, + v_s3m1, v_s3p1, + v_s4m1, v_s4n1, v_s4p1 ); - v_int16x8 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, - v_s1m2, v_s1n2, v_s1p2, - v_s2m2, v_s2p2, - v_s3m2, v_s3n2, v_s3p2 ); + spatialGradientKernel( v_sdx2, v_sdy2, + v_s2m2, v_s2n2, v_s2p2, + v_s3m2, v_s3p2, + v_s4m2, v_s4n2, v_s4p2 ); - // Store - v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+8], v_sdx2); - v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+8], v_sdy2); - - // Load fourth row for 3x3 Sobel filter - v_um = v_load(&m_src[j-1]); - v_un = v_load(&m_src[j]); - v_up = v_load(&m_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, - v_s2m1, v_s2n1, v_s2p1, 
-                                   v_s3m1, v_s3p1,
-                                   v_s4m1, v_s4n1, v_s4p1 );
-
-            spatialGradientKernel( v_sdx2, v_sdy2,
-                                   v_s2m2, v_s2n2, v_s2p2,
-                                   v_s3m2, v_s3p2,
-                                   v_s4m2, v_s4n2, v_s4p2 );
-
-            // Store
-            v_store(&n_dx[j], v_sdx1);
-            v_store(&n_dx[j+8], v_sdx2);
-            v_store(&n_dy[j], v_sdy1);
-            v_store(&n_dy[j+8], v_sdy2);
-        }
+            // Store
+            v_store(&n_dx[j], v_sdx1);
+            v_store(&n_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&n_dy[j], v_sdy1);
+            v_store(&n_dy[j+v_int16::nlanes], v_sdy2);
         }
     }
     i_start = i;
@@ -265,16 +251,12 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
     uchar v00, v01, v02, v10, v11, v12, v20, v21, v22;
     for ( i = 0; i < H; i++ )
     {
-        if ( i == 0 ) p_src = src.ptr(i_top);
-        else          p_src = src.ptr(i-1);
+        uchar *p_src = src.ptr(i == 0 ? i_top : i - 1);
+        uchar *c_src = src.ptr(i);
+        uchar *n_src = src.ptr(i == H - 1 ? i_bottom : i + 1);
 
-        c_src = src.ptr(i);
-
-        if ( i == H - 1 ) n_src = src.ptr(i_bottom);
-        else              n_src = src.ptr(i+1);
-
-        c_dx = dx.ptr<short>(i);
-        c_dy = dy.ptr<short>(i);
+        short *c_dx = dx.ptr<short>(i);
+        short *c_dy = dy.ptr<short>(i);
 
         // Process left-most column
         j = 0;
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 7749e4a59a..6329a47afa 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -2235,4 +2235,13 @@ TEST(Imgproc_Sobel, s16_regression_13506)
     Sobel(src, dst, CV_16S, 0, 1, 5);
     ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
 }
+
+TEST(Imgproc_Pyrdown, issue_12961)
+{
+    Mat src(9, 9, CV_8UC1, Scalar::all(0));
+    Mat dst;
+    cv::pyrDown(src, dst);
+    ASSERT_EQ(0.0, cv::norm(dst));
+}
+
 }} // namespace
diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp
index 554f95aa83..72efd6350a 100644
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@@ -341,6 +341,9 @@ EMSCRIPTEN_BINDINGS(binding_utils)
     register_vector<cv::Mat>("MatVector");
     register_vector<cv::Rect>("RectVector");
     register_vector<cv::KeyPoint>("KeyPointVector");
+    register_vector<cv::DMatch>("DMatchVector");
+    register_vector<std::vector<cv::DMatch>>("DMatchVectorVector");
+
 
     emscripten::class_<cv::Mat>("Mat")
         .constructor<>()
@@ -494,6 +497,12 @@ EMSCRIPTEN_BINDINGS(binding_utils)
         .field("response", &cv::KeyPoint::response)
         .field("size", &cv::KeyPoint::size);
 
+    emscripten::value_object<cv::DMatch>("DMatch")
+        .field("queryIdx", &cv::DMatch::queryIdx)
+        .field("trainIdx", &cv::DMatch::trainIdx)
+        .field("imgIdx", &cv::DMatch::imgIdx)
+        .field("distance", &cv::DMatch::distance);
+
     emscripten::value_array<cv::Scalar_<double>> ("Scalar")
         .element(index<0>())
         .element(index<1>())
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 02043ac929..0a0b84a364 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -200,20 +200,19 @@ public:
     {
         int j;
         calc_non_rbf_base( vcount, var_count, vecs, another, results,
-                           -2*params.gamma, -2*params.coef0 );
+                           2*params.gamma, 2*params.coef0 );
         // TODO: speedup this
         for( j = 0; j < vcount; j++ )
         {
             Qfloat t = results[j];
-            Qfloat e = std::exp(-std::abs(t));
+            Qfloat e = std::exp(std::abs(t));
             if( t > 0 )
-                results[j] = (Qfloat)((1. - e)/(1. + e));
-            else
                 results[j] = (Qfloat)((e - 1.)/(e + 1.));
+            else
+                results[j] = (Qfloat)((1. - e)/(1. + e));
         }
     }
-
     void calc_rbf( int vcount, int var_count, const float* vecs,
                    const float* another, Qfloat* results )
     {
@@ -1310,8 +1309,6 @@ public:
         if( kernelType != SIGMOID && kernelType != POLY )
             params.coef0 = 0;
-        else if( params.coef0 < 0 )
-            CV_Error( CV_StsOutOfRange, "The kernel parameter <coef0> must be positive or zero" );
 
         if( kernelType != POLY )
             params.degree = 0;
diff --git a/modules/ml/test/test_svmtrainauto.cpp b/modules/ml/test/test_svmtrainauto.cpp
index 6d7a73eaef..fcd83d3533 100644
--- a/modules/ml/test/test_svmtrainauto.cpp
+++ b/modules/ml/test/test_svmtrainauto.cpp
@@ -88,6 +88,51 @@ void CV_SVMTrainAutoTest::run( int /*start_from*/ )
 
 TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); }
 
+TEST(ML_SVM, trainauto_sigmoid)
+{
+    const int datasize = 100;
+    cv::Mat samples = cv::Mat::zeros( datasize, 2, CV_32FC1 );
+    cv::Mat responses = cv::Mat::zeros( datasize, 1, CV_32S );
+
+    const float scale_factor = 0.5;
+    const float radius = 2.0;
+
+    // Populate samples with data that can be split into two concentric circles
+    for (int i = 0; i < datasize; i+=2)
+    {
+        const float pi = 3.14159f;
+        const float angle_rads = (i/datasize) * pi;
+        const float x = radius * cos(angle_rads);
+        const float y = radius * cos(angle_rads);
+
+        // Larger circle
+        samples.at<float>( i, 0 ) = x;
+        samples.at<float>( i, 1 ) = y;
+        responses.at<int>( i, 0 ) = 0;
+
+        // Smaller circle
+        samples.at<float>( i + 1, 0 ) = x * scale_factor;
+        samples.at<float>( i + 1, 1 ) = y * scale_factor;
+        responses.at<int>( i + 1, 0 ) = 1;
+    }
+
+    cv::Ptr<TrainData> data = TrainData::create( samples, cv::ml::ROW_SAMPLE, responses );
+    cv::Ptr<SVM> svm = SVM::create();
+    svm->setKernel(SVM::SIGMOID);
+
+    svm->setGamma(10.0);
+    svm->setCoef0(-10.0);
+    svm->trainAuto( data, 10 );  // 2-fold cross validation.
+
+    float test_data0[2] = {radius, radius};
+    cv::Mat test_point0 = cv::Mat( 1, 2, CV_32FC1, test_data0 );
+    ASSERT_EQ(0, svm->predict( test_point0 ));
+
+    float test_data1[2] = {scale_factor * radius, scale_factor * radius};
+    cv::Mat test_point1 = cv::Mat( 1, 2, CV_32FC1, test_data1 );
+    ASSERT_EQ(1, svm->predict( test_point1 ));
+}
+
 TEST(ML_SVM, trainAuto_regression_5369)
 {
diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py
index a644420780..5a8e62495d 100644
--- a/samples/dnn/tf_text_graph_common.py
+++ b/samples/dnn/tf_text_graph_common.py
@@ -323,7 +323,7 @@ def writeTextGraph(modelPath, outputPath, outNodes):
 
     for node in graph_def.node:
         if node.op == 'Const':
-            if 'value' in node.attr:
-                del node.attr['value']
+            if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
+                node.attr['value'].tensor.tensor_content = ''
 
     tf.train.write_graph(graph_def, "", outputPath, as_text=True)
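
Note on the imgproc/src/blend.cpp hunks above: they only widen the code from the fixed 128-bit v_float32x4/v_uint8x16 types to the width-agnostic v_float32/v_uint8 universal intrinsics so the same loops can run on wider registers where available; the per-element arithmetic is unchanged. For reference, a minimal scalar sketch of what blendLinear computes for every channel value is shown below (the helper name is illustrative, but the 1e-5f epsilon is the same constant used by the vectorized blend() helper and by the scalar tail loop):

    // Scalar reference for cv::blendLinear:
    //   dst = (src1*w1 + src2*w2) / (w1 + w2 + eps)
    // The epsilon keeps the quotient finite where both weights are zero.
    static inline float blendLinearScalar(float src1, float src2, float w1, float w2)
    {
        const float eps = 1e-5f;
        return (src1 * w1 + src2 * w2) / (w1 + w2 + eps);
    }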
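
Note on the ml/src/svm.cpp hunks above: the sigmoid kernel is K(x, y) = tanh(gamma*<x,y> + coef0). calc_non_rbf_base is now called with +2*gamma and +2*coef0, so each intermediate value t equals 2*(gamma*<x,y> + coef0), and the corrected branches evaluate tanh(t/2) from e = exp(|t|); the previous code negated the arguments and used the branches the other way around, which flipped the sign of the kernel. That fix is presumably also why the coef0 >= 0 check is dropped and why the new trainauto_sigmoid test deliberately trains with a negative coef0. A small sketch of the identity being relied on (the function name is illustrative, not part of OpenCV):

    #include <cmath>

    // tanh(t/2) evaluated the same way as the patched calc_sigmoid loop:
    //   t > 0 :  (e - 1)/(e + 1)   with e = exp(|t|)
    //   t <= 0:  (1 - e)/(1 + e)
    static double tanh_half(double t)
    {
        const double e = std::exp(std::abs(t));
        return t > 0 ? (e - 1.) / (e + 1.) : (1. - e) / (1. + e);
    }
    // With t = 2*(gamma*dot + coef0), tanh_half(t) == tanh(gamma*dot + coef0),
    // i.e. the value the sigmoid kernel should return.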