Merge remote-tracking branch 'upstream/3.4' into merge-3.4

2019-02-01 13:17:32 +03:00
parent a65ccc0603 a42bbc9722
commit 665408e57f
20 changed files with 524 additions and 384 deletions
@@ -148,7 +148,13 @@ private:
 #else
        cv::dnn::Net net;
        cv::dnn::LayerParams lp;
-        net.addLayerToPrev("testLayer", "Identity", lp);
+        lp.set("kernel_size", 1);
+        lp.set("num_output", 1);
+        lp.set("bias_term", false);
+        lp.type = "Convolution";
+        lp.name = "testLayer";
+        lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
+        net.addLayerToPrev(lp.name, lp.type, lp);
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
        net.setPreferableTarget(target);
        static int inpDims[] = {1, 2, 3, 4};
@@ -2676,7 +2682,7 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
    backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
    for (auto& it : ieNet.getOutputsInfo())
    {
-        Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
+        Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
        InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
        CV_Assert(ieLayer);

@@ -2871,8 +2877,7 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
    std::vector<LayerPin> pins;
    for (int i = 0; i < outBlobNames.size(); i++)
    {
-        std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
-        pins.insert(pins.end(), lp.begin(), lp.end());
+        pins.push_back(impl->getPinByAlias(outBlobNames[i]));
    }

    impl->setUpNet(pins);
@@ -2885,9 +2890,10 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
    for (int i = 0; i < outBlobNames.size(); i++)
    {
        std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
-        for (int i = 0; i < lp.size(); i++)
+        outputBlobs[i].resize(lp.size());
+        for (int j = 0; j < lp.size(); j++)
        {
-            outputBlobs[i].push_back(impl->getBlob(lp[i]));
+            outputBlobs[i][j] = impl->getBlob(lp[j]);
        }
    }
 }
@@ -110,14 +110,25 @@ public:
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
 #ifdef HAVE_INF_ENGINE
-#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
-        InferenceEngine::Builder::SplitLayer ieLayer(name);
-        ieLayer.setOutputPorts({InferenceEngine::Port()});
-        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
-#else
        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
        CV_Assert(!input->dims.empty());
-
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
+        InferenceEngine::Builder::Layer ieLayer(name);
+        ieLayer.setName(name);
+        if (preferableTarget == DNN_TARGET_MYRIAD)
+        {
+            ieLayer.setType("Copy");
+        }
+        else
+        {
+            ieLayer.setType("Split");
+            ieLayer.getParameters()["axis"] = input->dims.size() - 1;
+            ieLayer.getParameters()["out_sizes"] = input->dims[0];
+        }
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
+        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
+        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
+#else
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "Split";
@@ -281,7 +281,7 @@ public:
        const int outCn = blobs[0].size[0];
        // prepare weightsMat where each row is aligned and has enough zero padding on the right to
        // use vectorized (i.e. with intrinsics) loops without tail processing
-        Mat wm = blobs[0].reshape(1, outCn).clone();
+        Mat wm = blobs[0].reshape(1, outCn);
        if( wm.step1() % VEC_ALIGN != 0 )
        {
            int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
@@ -374,6 +374,10 @@ public:

        if (!w.empty())
        {
+            // Keep origin weights unchanged.
+            if (weightsMat.data == blobs[0].data)
+                weightsMat = weightsMat.clone();
+
            Mat originWeights = blobs[0].reshape(1, outCn);
            for (int i = 0; i < outCn; ++i)
            {
@@ -551,13 +555,13 @@ public:
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
        InferenceEngine::Builder::ConvolutionLayer ieLayer(name);

-        ieLayer.setKernel({kernel.height, kernel.width});
-        ieLayer.setStrides({stride.height, stride.width});
-        ieLayer.setDilation({dilation.height, dilation.width});
-        ieLayer.setPaddingsBegin({pad.height, pad.width});
-        ieLayer.setPaddingsEnd({pad.height, pad.width});
-        ieLayer.setGroup(group);
-        ieLayer.setOutDepth(outCn);
+        ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+        ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+        ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width});
+        ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setGroup((size_t)group);
+        ieLayer.setOutDepth((size_t)outCn);

        ieLayer.setWeights(ieWeights);
        if (ieBiases)
@@ -1220,7 +1224,7 @@ public:
 #ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
        {
-            if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width))
+            if (INF_ENGINE_RELEASE >= 2018050000 && (adjustPad.height || adjustPad.width))
                return false;

            const int outGroupCn = blobs[0].size[1];  // Weights are in IOHW layout
@@ -1783,13 +1787,13 @@ public:

        InferenceEngine::Builder::DeconvolutionLayer ieLayer(name);

-        ieLayer.setKernel({kernel.height, kernel.width});
-        ieLayer.setStrides({stride.height, stride.width});
-        ieLayer.setDilation({dilation.height, dilation.width});
-        ieLayer.setPaddingsBegin({pad.height, pad.width});
-        ieLayer.setPaddingsEnd({pad.height, pad.width});
-        ieLayer.setGroup(group);
-        ieLayer.setOutDepth(numOutput);
+        ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+        ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+        ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width});
+        ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setGroup((size_t)group);
+        ieLayer.setOutDepth((size_t)numOutput);

        ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW));
        if (hasBias())
@@ -299,10 +299,10 @@ public:
        if (type == MAX || type == AVE)
        {
            InferenceEngine::Builder::PoolingLayer ieLayer(name);
-            ieLayer.setKernel({kernel.height, kernel.width});
-            ieLayer.setStrides({stride.height, stride.width});
-            ieLayer.setPaddingsBegin({pad_t, pad_l});
-            ieLayer.setPaddingsEnd({pad_b, pad_r});
+            ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+            ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+            ieLayer.setPaddingsBegin({(size_t)pad_t, (size_t)pad_l});
+            ieLayer.setPaddingsEnd({(size_t)pad_b, (size_t)pad_r});
            ieLayer.setPoolingType(type == MAX ?
                                   InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
                                   InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);
@@ -82,7 +82,7 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
    CV_Assert(it != layers.end());

    const int layerId = it->second;
-    for (int i = 0; i < inpWrappers.size(); ++i)
+    for (size_t i = 0; i < inpWrappers.size(); ++i)
    {
        const auto& inp = inpWrappers[i];
        const std::string& inpName = inp->dataPtr->name;
@@ -103,7 +103,7 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
        else
            inpId = it->second;

-        netBuilder.connect(inpId, {layerId, i});
+        netBuilder.connect((size_t)inpId, {(size_t)layerId, i});
        unconnectedLayersIds.erase(inpId);
    }
    CV_Assert(!outputs.empty());
@@ -119,7 +119,7 @@ void InfEngineBackendNet::init(int targetId)
        for (int id : unconnectedLayersIds)
        {
            InferenceEngine::Builder::OutputLayer outLayer("myconv1");
-            netBuilder.addLayer({id}, outLayer);
+            netBuilder.addLayer({InferenceEngine::PortInfo(id)}, outLayer);
        }
        cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build()));
    }
@@ -718,19 +718,33 @@ Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
    return Mat(size, CV_32F, (void*)blob->buffer());
 }

-InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_)
-{
-    output = output_;
-}
-
 bool InfEngineBackendLayer::getMemoryShapes(const std::vector<MatShape> &inputs,
                                            const int requiredOutputs,
                                            std::vector<MatShape> &outputs,
                                            std::vector<MatShape> &internals) const
 {
-    std::vector<size_t> dims = output->dims;
-    std::vector<int> shape(dims.rbegin(), dims.rend());
-    outputs.assign(1, shape);
+    InferenceEngine::ICNNNetwork::InputShapes inShapes = t_net.getInputShapes();
+    InferenceEngine::ICNNNetwork::InputShapes::iterator itr;
+    bool equal_flag = true;
+    size_t i = 0;
+    for (itr = inShapes.begin(); itr != inShapes.end(); ++itr)
+    {
+        InferenceEngine::SizeVector currentInShape(inputs[i].begin(), inputs[i].end());
+        if (itr->second != currentInShape)
+        {
+            itr->second = currentInShape;
+            equal_flag = false;
+        }
+        i++;
+    }
+
+    if (!equal_flag)
+    {
+        InferenceEngine::CNNNetwork curr_t_net(t_net);
+        curr_t_net.reshape(inShapes);
+    }
+    std::vector<size_t> dims = t_net.getOutputsInfo()[name]->getDims();
+    outputs.push_back(MatShape(dims.begin(), dims.end()));
    return false;
 }

@@ -260,7 +260,7 @@ InferenceEngine::TBlob<int16_t>::Ptr convertFp16(const InferenceEngine::Blob::Pt
 class InfEngineBackendLayer : public Layer
 {
 public:
-    InfEngineBackendLayer(const InferenceEngine::DataPtr& output);
+    InfEngineBackendLayer(const InferenceEngine::CNNNetwork &t_net_) : t_net(t_net_) {};

    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                 const int requiredOutputs,
@@ -273,7 +273,7 @@ public:
    virtual bool supportBackend(int backendId) CV_OVERRIDE;

 private:
-    InferenceEngine::DataPtr output;
+    InferenceEngine::CNNNetwork t_net;
 };

 #endif  // HAVE_INF_ENGINE
@@ -236,6 +236,10 @@ TEST_P(Test_Caffe_layers, Dropout)

 TEST_P(Test_Caffe_layers, Concat)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE > 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
    testLayerUsingCaffeModels("layer_concat");
    testLayerUsingCaffeModels("layer_concat_optim", true, false);
    testLayerUsingCaffeModels("layer_concat_shared_input", true, false);
@@ -923,8 +927,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
 {
    Target targetId = GetParam();

+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
    Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt"));
-    Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
+    Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));

    Mat inp = blobFromNPY(_tf("blob.npy"));

@@ -935,22 +940,15 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    net.setInput(inp);
    net.setPreferableTarget(targetId);

-    if (targetId != DNN_TARGET_MYRIAD)
-    {
-        Mat out = net.forward();
+    Mat out = net.forward();

-        normAssert(outDefault, out);
+    double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.4e-3 : 1e-5;
+    double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4;
+    normAssert(outDefault, out, "", l1, lInf);

-        std::vector<int> outLayers = net.getUnconnectedOutLayers();
-        ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge");
-        ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat");
-    }
-    else
-    {
-        // An assertion is expected because the model is in FP32 format but
-        // Myriad plugin supports only FP16 models.
-        ASSERT_ANY_THROW(net.forward());
-    }
+    std::vector<int> outLayers = net.getUnconnectedOutLayers();
+    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output");
+    ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
 }

 TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
@@ -962,23 +960,16 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
    randu(inputs[0], 0, 255);
    inputs[0].convertTo(inputs[1], CV_32F);

+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+
    Mat outs[2];
    for (int i = 0; i < 2; ++i)
    {
-        Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
+        Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
        net.setPreferableTarget(targetId);
        net.setInput(inputs[i]);
-        if (targetId != DNN_TARGET_MYRIAD)
-        {
-            outs[i] = net.forward();
-            ASSERT_EQ(outs[i].type(), CV_32F);
-        }
-        else
-        {
-            // An assertion is expected because the model is in FP32 format but
-            // Myriad plugin supports only FP16 models.
-            ASSERT_ANY_THROW(net.forward());
-        }
+        outs[i] = net.forward();
+        ASSERT_EQ(outs[i].type(), CV_32F);
    }
    if (targetId != DNN_TARGET_MYRIAD)
        normAssert(outs[0], outs[1]);
@@ -1008,8 +999,8 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT,
 // net.save('/path/to/caffemodel')
 //
 // 3. Convert using ModelOptimizer.
-typedef testing::TestWithParam<tuple<int, int, Target> > Test_DLDT_two_inputs;
-TEST_P(Test_DLDT_two_inputs, as_IR)
+typedef testing::TestWithParam<tuple<int, int, Target, std::vector<int> > > Test_DLDT_two_inputs_3dim;
+TEST_P(Test_DLDT_two_inputs_3dim, as_IR)
 {
    int firstInpType = get<0>(GetParam());
    int secondInpType = get<1>(GetParam());
@@ -1020,32 +1011,39 @@ TEST_P(Test_DLDT_two_inputs, as_IR)
        throw SkipTestException("Test is enabled starts from OpenVINO 2018R4");
 #endif

-    Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
-    int inpSize[] = {1, 2, 3};
-    Mat firstInp(3, &inpSize[0], firstInpType);
-    Mat secondInp(3, &inpSize[0], secondInpType);
+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin"));
+    std::vector<int> inpSize = get<3>(GetParam());
+    Mat firstInp(3, inpSize.data(), firstInpType);
+    Mat secondInp(3, inpSize.data(), secondInpType);
    randu(firstInp, 0, 255);
    randu(secondInp, 0, 255);

    net.setInput(firstInp, "data");
    net.setInput(secondInp, "second_input");
    net.setPreferableTarget(targetId);
-    if (targetId != DNN_TARGET_MYRIAD)
-    {
-        Mat out = net.forward();

-        Mat ref;
-        cv::add(firstInp, secondInp, ref, Mat(), CV_32F);
-        normAssert(out, ref);
-    }
-    else
-    {
-        // An assertion is expected because the model is in FP32 format but
-        // Myriad plugin supports only FP16 models.
-        ASSERT_ANY_THROW(net.forward());
-    }
+    double l1 = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) &&
+                 (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.06 : 0.0;
+    double lInf = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) &&
+                   (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.23 : 0.0;
+
+    Mat out = net.forward();
+
+    Mat ref;
+    cv::add(firstInp, secondInp, ref, Mat(), CV_32F);
+    normAssert(out, ref, "", l1, lInf);
 }

+std::vector< std::vector<int> > list_sizes{ {1, 2, 3}, {3, 2, 1}, {5, 5, 5}, {13, 7, 11} };
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs_3dim, Combine(
+  Values(CV_8U, CV_32F), Values(CV_8U, CV_32F),
+  testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)),
+  testing::ValuesIn(list_sizes)
+));
+
+typedef testing::TestWithParam<tuple<int, int, Target> > Test_DLDT_two_inputs;
 TEST_P(Test_DLDT_two_inputs, as_backend)
 {
    static const float kScale = 0.5f;
@@ -308,4 +308,38 @@ TEST_P(DeprecatedForward, CustomLayerWithFallback)

 INSTANTIATE_TEST_CASE_P(/**/, DeprecatedForward, dnnBackendsAndTargets());

+TEST(Net, forwardAndRetrieve)
+{
+    std::string prototxt =
+        "input: \"data\"\n"
+        "layer {\n"
+        "  name: \"testLayer\"\n"
+        "  type: \"Slice\"\n"
+        "  bottom: \"data\"\n"
+        "  top: \"firstCopy\"\n"
+        "  top: \"secondCopy\"\n"
+        "  slice_param {\n"
+        "    axis: 0\n"
+        "    slice_point: 2\n"
+        "  }\n"
+        "}";
+    Net net = readNetFromCaffe(&prototxt[0], prototxt.size());
+    net.setPreferableBackend(DNN_BACKEND_OPENCV);
+
+    Mat inp(4, 5, CV_32F);
+    randu(inp, -1, 1);
+    net.setInput(inp);
+
+    std::vector<String> outNames;
+    outNames.push_back("testLayer");
+    std::vector<std::vector<Mat> > outBlobs;
+
+    net.forward(outBlobs, outNames);
+
+    EXPECT_EQ(outBlobs.size(), 1);
+    EXPECT_EQ(outBlobs[0].size(), 2);
+    normAssert(outBlobs[0][0], inp.rowRange(0, 2), "first part");
+    normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part");
+}
+
 }} // namespace
@@ -395,7 +395,7 @@ TEST_P(Test_ONNX_nets, DenseNet121)

 TEST_P(Test_ONNX_nets, Inception_v1)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
        throw SkipTestException("Test is disabled for OpenVINO 2018R5");
 #endif
@@ -241,7 +241,7 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten)

 TEST_P(Test_TensorFlow_layers, leaky_relu)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
        throw SkipTestException("");
 #endif
@@ -388,7 +388,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN)

 TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
    if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
        throw SkipTestException("Unstable test case");
 #endif
@@ -230,4 +230,27 @@ PERF_TEST_P(Size_MatType_BorderType, blur5x5,
    SANITY_CHECK(dst, 1);
 }

+///////////// BlendLinear ////////////////////////
+PERF_TEST_P(Size_MatType, BlendLinear,
+            testing::Combine(
+                testing::Values(szVGA, sz720p, sz1080p, sz2160p),
+                testing::Values(CV_8UC1, CV_32FC1, CV_8UC3, CV_32FC3, CV_8UC4, CV_32FC4)
+                )
+           )
+{
+    const Size srcSize = get<0>(GetParam());
+    const int srcType = get<1>(GetParam());
+
+    Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
+    Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
+
+    declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst);
+    randu(weights1, 0, 1);
+    randu(weights2, 0, 1);
+
+    TEST_CYCLE() blendLinear(src1, src2, weights1, weights2, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
@@ -48,44 +48,44 @@
 #include "opencv2/core/hal/intrin.hpp"

 namespace cv {
-#if CV_SIMD128
-static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
+#if CV_SIMD
+static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2)
 {
-    const v_float32x4 v_eps = v_setall_f32(1e-5f);
-    v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+    const v_float32 v_eps = vx_setall_f32(1e-5f);
+    v_float32 v_denom = v_w1 + v_w2 + v_eps;
    return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
 }
-static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
+static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
 {
-    v_float32x4 v_w1 = v_load(w_ptr1 + offset);
-    v_float32x4 v_w2 = v_load(w_ptr2 + offset);
+    v_float32 v_w1 = vx_load(w_ptr1 + offset);
+    v_float32 v_w2 = vx_load(w_ptr2 + offset);
    return blend(v_src1, v_src2, v_w1, v_w2);
 }
-static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
+static inline v_uint32 saturate_f32_u32(const v_float32& vec)
 {
-    const v_int32x4 z = v_setzero_s32();
-    const v_int32x4 x = v_setall_s32(255);
+    const v_int32 z = vx_setzero_s32();
+    const v_int32 x = vx_setall_s32(255);
    return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x));
 }
-static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+static inline v_uint8 pack_f32tou8(v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3)
 {
-    v_uint32x4 a = saturate_f32_u32(val0);
-    v_uint32x4 b = saturate_f32_u32(val1);
-    v_uint32x4 c = saturate_f32_u32(val2);
-    v_uint32x4 d = saturate_f32_u32(val3);
-    v_uint16x8 e = v_pack(a, b);
-    v_uint16x8 f = v_pack(c, d);
+    v_uint32 a = saturate_f32_u32(val0);
+    v_uint32 b = saturate_f32_u32(val1);
+    v_uint32 c = saturate_f32_u32(val2);
+    v_uint32 d = saturate_f32_u32(val3);
+    v_uint16 e = v_pack(a, b);
+    v_uint16 f = v_pack(c, d);
    return v_pack(e, f);
 }
-static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+static inline void store_pack_f32tou8(uchar* ptr, v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3)
 {
    v_store((ptr), pack_f32tou8(val0, val1, val2, val3));
 }
-static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+static inline void expand_u8tof32(const v_uint8& src, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3)
 {
-    v_uint16x8 a0, a1;
+    v_uint16 a0, a1;
    v_expand(src, a0, a1);
-    v_uint32x4 b0, b1,b2,b3;
+    v_uint32 b0, b1,b2,b3;
    v_expand(a0, b0, b1);
    v_expand(a1, b2, b3);
    dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
@@ -93,71 +93,69 @@ static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_fl
    dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
    dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
 }
-static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+static inline void load_expand_u8tof32(const uchar* ptr, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3)
 {
-    v_uint8x16 a = v_load((ptr));
+    v_uint8 a = vx_load((ptr));
    expand_u8tof32(a, dst0, dst1, dst2, dst3);
 }
-int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
-int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
-int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
+int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
+int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
+int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
 {
-    int step = v_uint8x16::nlanes * cn;
-    int weight_step = v_uint8x16::nlanes;
    switch(cn)
    {
    case 1:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13;
-            v_float32x4 v_src20, v_src21, v_src22, v_src23;
+            v_float32 v_src10, v_src11, v_src12, v_src13;
+            v_float32 v_src20, v_src21, v_src22, v_src23;
            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

-            v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
+            v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
+            v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes);

            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
        }
        break;
    case 2:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
        {
-            v_uint8x16 v_src10, v_src11, v_src20, v_src21;
+            v_uint8 v_src10, v_src11, v_src20, v_src21;
            v_load_deinterleave(src1 + x, v_src10, v_src11);
            v_load_deinterleave(src2 + x, v_src20, v_src21);
-            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
-            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
+            v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
+            v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);

-            v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
-            v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
-            v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
-            v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
+            v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
+            v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
+            v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes);
+            v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes);

-            v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
-            v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
+            v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
+            v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
            v_store_interleave(dst + x, v_dsta, v_dstb);
        }
        break;
    case 3:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
        {
-            v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);

-            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
-            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
+            v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
+            v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
            expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
@@ -165,14 +163,14 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
            expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);

-            v_float32x4 v_w10 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
-            v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
-            v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
-            v_float32x4 v_w20 = v_load(weights2 + weight_offset);
-            v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
-            v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
-            v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
+            v_float32 v_w10 = vx_load(weights1 + weight_offset);
+            v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes);
+            v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes);
+            v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes);
+            v_float32 v_w20 = vx_load(weights2 + weight_offset);
+            v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes);
+            v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes);
+            v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes);
            v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
            v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
            v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
@@ -187,34 +185,36 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
            v_src123 = blend(v_src123, v_src223, v_w13, v_w23);


-            v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
-            v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
-            v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
+            v_uint8 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
+            v_uint8 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
+            v_uint8 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
        }
        break;
    case 4:
-        step = v_uint8x16::nlanes;
-        weight_step = v_float32x4::nlanes;
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
-            v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
+            v_float32 v_src10, v_src11, v_src12, v_src13;
+            v_float32 v_src20, v_src21, v_src22, v_src23;
            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

-            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
-            v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
+            v_float32 v_w10, v_w11, v_w12, v_w13, v_w20, v_w21, v_w22, v_w23, v_w0, v_w1;
+            v_w10 = vx_load(weights1 + weight_offset);
+            v_zip(v_w10, v_w10, v_w0, v_w1);
+            v_zip(v_w0, v_w0, v_w10, v_w11);
+            v_zip(v_w1, v_w1, v_w12, v_w13);
+            v_w20 = vx_load(weights2 + weight_offset);
+            v_zip(v_w20, v_w20, v_w0, v_w1);
+            v_zip(v_w0, v_w0, v_w20, v_w21);
+            v_zip(v_w1, v_w1, v_w22, v_w23);

-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
-            v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
-            v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
-            v_src13 = blend(v_src17, v_src27, v_w1, v_w2);
+            v_float32 v_dst0, v_dst1, v_dst2, v_dst3;
+            v_dst0 = blend(v_src10, v_src20, v_w10, v_w20);
+            v_dst1 = blend(v_src11, v_src21, v_w11, v_w21);
+            v_dst2 = blend(v_src12, v_src22, v_w12, v_w22);
+            v_dst3 = blend(v_src13, v_src23, v_w13, v_w23);

-            v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
-            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
        }
        break;
@@ -224,68 +224,67 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
    return x;
 }

-int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
+int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
 {
-    int step = v_float32x4::nlanes*cn;
    switch(cn)
    {
    case 1:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src1 = v_load(src1 + x);
-            v_float32x4 v_src2 = v_load(src2 + x);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32 v_src1 = vx_load(src1 + x);
+            v_float32 v_src2 = vx_load(src2 + x);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);

-            v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
+            v_float32 v_dst = blend(v_src1, v_src2, v_w1, v_w2);

            v_store(dst + x, v_dst);
        }
        break;
    case 2:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src20, v_src21;
+            v_float32 v_src10, v_src11, v_src20, v_src21;
            v_load_deinterleave(src1 + x, v_src10, v_src11);
            v_load_deinterleave(src2 + x, v_src20, v_src21);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);

-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);

            v_store_interleave(dst + x, v_dst0, v_dst1);
        }
        break;
    case 3:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);

-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);

            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
        }
        break;
    case 4:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+            v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);

-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
-            v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+            v_float32 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);

            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
        }
@@ -321,8 +320,8 @@ public:
            T * const dst_row = dst->ptr<T>(y);

            int x = 0;
-            #if CV_SIMD128
-            x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
+            #if CV_SIMD
+            x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
            #endif

            for ( ; x < width; ++x)
@@ -110,15 +110,19 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
    int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
    CV_Assert(cn > 0 && cn <= 4);
    size_t sstep = _src.step, dstep = _dst.step;
-    Histogram CV_DECL_ALIGNED(16) H[4];
-    HT CV_DECL_ALIGNED(16) luc[4][16];

    int STRIPE_SIZE = std::min( _dst.cols, 512/cn );

-    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
-    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
-    HT* h_coarse = alignPtr(&_h_coarse[0], 16);
-    HT* h_fine = alignPtr(&_h_fine[0], 16);
+#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16
+# define CV_ALIGNMENT CV_SIMD_WIDTH
+#else
+# define CV_ALIGNMENT 16
+#endif
+
+    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
+    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
+    HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT);
+    HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT);

    for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
    {
@@ -148,10 +152,14 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
            const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
            const uchar* p1 = src + sstep * std::min( m-1, i+r );

-            memset( H, 0, cn*sizeof(H[0]) );
-            memset( luc, 0, cn*sizeof(luc[0]) );
            for( c = 0; c < cn; c++ )
            {
+                Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H;
+                HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16];
+
+                memset(&H, 0, sizeof(H));
+                memset(luc, 0, sizeof(luc));
+
                // Update column histograms for the entire row.
                for( j = 0; j < n; j++ )
                {
@@ -163,21 +171,21 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                for (k = 0; k < 16; ++k)
                {
 #if CV_SIMD256
-                    v_store(H[c].fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H[c].fine[k]));
+                    v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
 #elif CV_SIMD128
-                    v_store(H[c].fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k]));
-                    v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8));
+                    v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
+                    v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
 #else
                    for (int ind = 0; ind < 16; ++ind)
-                        H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
+                        H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
 #endif
                }

 #if CV_SIMD256
-                v_uint16x16 v_coarse = v256_load(H[c].coarse);
+                v_uint16x16 v_coarse = v256_load(H.coarse);
 #elif CV_SIMD128
-                v_uint16x8 v_coarsel = v_load(H[c].coarse);
-                v_uint16x8 v_coarseh = v_load(H[c].coarse + 8);
+                v_uint16x8 v_coarsel = v_load(H.coarse);
+                v_uint16x8 v_coarseh = v_load(H.coarse + 8);
 #endif
                HT* px = h_coarse + 16 * n*c;
                for( j = 0; j < 2*r; ++j, px += 16 )
@@ -189,7 +197,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                    v_coarseh += v_load(px + 8);
 #else
                    for (int ind = 0; ind < 16; ++ind)
-                        H[c].coarse[ind] += px[ind];
+                        H.coarse[ind] += px[ind];
 #endif
                }

@@ -201,24 +209,24 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                    px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
 #if CV_SIMD256
                    v_coarse += v256_load(px);
-                    v_store(H[c].coarse, v_coarse);
+                    v_store(H.coarse, v_coarse);
 #elif CV_SIMD128
                    v_coarsel += v_load(px);
                    v_coarseh += v_load(px + 8);
-                    v_store(H[c].coarse, v_coarsel);
-                    v_store(H[c].coarse + 8, v_coarseh);
+                    v_store(H.coarse, v_coarsel);
+                    v_store(H.coarse + 8, v_coarseh);
 #else
                    for (int ind = 0; ind < 16; ++ind)
-                        H[c].coarse[ind] += px[ind];
+                        H.coarse[ind] += px[ind];
 #endif

                    // Find median at coarse level
                    for ( k = 0; k < 16 ; ++k )
                    {
-                        sum += H[c].coarse[k];
+                        sum += H.coarse[k];
                        if ( sum > t )
                        {
-                            sum -= H[c].coarse[k];
+                            sum -= H.coarse[k];
                            break;
                        }
                    }
@@ -231,7 +239,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                    v_uint16x8 v_finel;
                    v_uint16x8 v_fineh;
 #endif
-                    if ( luc[c][k] <= j-r )
+                    if ( luc[k] <= j-r )
                    {
 #if CV_SIMD256
                        v_fine = v256_setzero_u16();
@@ -239,10 +247,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                        v_finel = v_setzero_u16();
                        v_fineh = v_setzero_u16();
 #else
-                        memset(&H[c].fine[k], 0, 16 * sizeof(HT));
+                        memset(&H.fine[k], 0, 16 * sizeof(HT));
 #endif
                        px = h_fine + 16 * (n*(16 * c + k) + j - r);
-                        for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16)
+                        for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
                        {
 #if CV_SIMD256
                            v_fine += v256_load(px);
@@ -251,11 +259,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                            v_fineh += v_load(px + 8);
 #else
                            for (int ind = 0; ind < 16; ++ind)
-                                H[c].fine[k][ind] += px[ind];
+                                H.fine[k][ind] += px[ind];
 #endif
                        }

-                        if ( luc[c][k] < j+r+1 )
+                        if ( luc[k] < j+r+1 )
                        {
                            px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
 #if CV_SIMD256
@@ -265,50 +273,50 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                            v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
 #else
                            for (int ind = 0; ind < 16; ++ind)
-                                H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]);
+                                H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
 #endif
-                            luc[c][k] = (HT)(j+r+1);
+                            luc[k] = (HT)(j+r+1);
                        }
                    }
                    else
                    {
 #if CV_SIMD256
-                        v_fine = v256_load(H[c].fine[k]);
+                        v_fine = v256_load(H.fine[k]);
 #elif CV_SIMD128
-                        v_finel = v_load(H[c].fine[k]);
-                        v_fineh = v_load(H[c].fine[k] + 8);
+                        v_finel = v_load(H.fine[k]);
+                        v_fineh = v_load(H.fine[k] + 8);
 #endif
                        px = h_fine + 16*n*(16 * c + k);
-                        for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
+                        for ( ; luc[k] < j+r+1; ++luc[k] )
                        {
 #if CV_SIMD256
-                            v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                            v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
 #elif CV_SIMD128
-                            v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
-                            v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
+                            v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1)    ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
+                            v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
 #else
                            for (int ind = 0; ind < 16; ++ind)
-                                H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind];
+                                H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
 #endif
                        }
                    }

                    px = h_coarse + 16 * (n*c + MAX(j - r, 0));
 #if CV_SIMD256
-                    v_store(H[c].fine[k], v_fine);
+                    v_store(H.fine[k], v_fine);
                    v_coarse -= v256_load(px);
 #elif CV_SIMD128
-                    v_store(H[c].fine[k], v_finel);
-                    v_store(H[c].fine[k] + 8, v_fineh);
+                    v_store(H.fine[k], v_finel);
+                    v_store(H.fine[k] + 8, v_fineh);
                    v_coarsel -= v_load(px);
                    v_coarseh -= v_load(px + 8);
 #else
                    for (int ind = 0; ind < 16; ++ind)
-                        H[c].coarse[ind] -= px[ind];
+                        H.coarse[ind] -= px[ind];
 #endif

                    /* Find median in segment */
-                    segment = H[c].fine[k];
+                    segment = H.fine[k];
                    for ( b = 0; b < 16 ; b++ )
                    {
                        sum += segment[b];
@@ -112,6 +112,7 @@ struct PyrDownVec_32s8u
            v_rshr_pack_store<8>(dst + x, t0);
            x += v_uint16::nlanes;
        }
+        typedef int CV_DECL_ALIGNED(1) unaligned_int;
        for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
        {
            v_int32x4 r0, r1, r2, r3, r4, t0;
@@ -122,7 +123,7 @@ struct PyrDownVec_32s8u
            r4 = v_load(row4 + x);
            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);

-            *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+            *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
        }

        return x;
@@ -123,139 +123,125 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
        }
    }

-    // Pointer to row vectors
-    uchar *p_src, *c_src, *n_src; // previous, current, next row
-    short *c_dx,  *c_dy;
-
    int i_start = 0;
    int j_start = 0;
-#if CV_SIMD128
-    if(hasSIMD128())
+#if CV_SIMD
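+    // Characters in variable names have the following meanings:
+    // u: unsigned char
+    // s: signed 16-bit int (short)
+    //
+    // [row][column][half], e.g. v_s1m1:
+    // row:    1..4, top to bottom among the source rows loaded per iteration
+    // column: m = offset -1, n = offset 0, p = offset 1
+    // half:   1 = first, 2 = second output vector of v_expand
+    // Example: v_s2m1 is row 2 at column offset -1, first half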
+    for ( i = 0; i < H - 1; i += 2 )
    {
-        uchar *m_src;
-        short *n_dx, *n_dy;
+        uchar *p_src = src.ptr<uchar>(i == 0 ? i_top : i - 1);
+        uchar *c_src = src.ptr<uchar>(i);
+        uchar *n_src = src.ptr<uchar>(i+1);
+        uchar *m_src = src.ptr<uchar>(i == H - 2 ? i_bottom : i + 2);

-        // Characters in variable names have the following meanings:
-        // u: unsigned char
-        // s: signed int
-        //
-        // [row][column]
-        // m: offset -1
-        // n: offset  0
-        // p: offset  1
-        // Example: umn is offset -1 in row and offset 0 in column
-        for ( i = 0; i < H - 1; i += 2 )
+        short *c_dx = dx.ptr<short>(i);
+        short *c_dy = dy.ptr<short>(i);
+        short *n_dx = dx.ptr<short>(i+1);
+        short *n_dy = dy.ptr<short>(i+1);
+
+        // Process rest of columns one vector (v_uint8::nlanes columns) at a time
+        for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes)
        {
-            if   ( i == 0 ) p_src = src.ptr<uchar>(i_top);
-            else            p_src = src.ptr<uchar>(i-1);
+            // Load top row for 3x3 Sobel filter
+            v_uint8 v_um = vx_load(&p_src[j-1]);
+            v_uint8 v_un = vx_load(&p_src[j]);
+            v_uint8 v_up = vx_load(&p_src[j+1]);
+            v_uint16 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2;
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s1m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s1m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s1n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s1n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s1p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s1p2 = v_reinterpret_as_s16(v_up2);

-            c_src = src.ptr<uchar>(i);
-            n_src = src.ptr<uchar>(i+1);
+            // Load second row for 3x3 Sobel filter
+            v_um = vx_load(&c_src[j-1]);
+            v_un = vx_load(&c_src[j]);
+            v_up = vx_load(&c_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s2m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s2m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s2n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s2n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s2p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s2p2 = v_reinterpret_as_s16(v_up2);

-            if ( i == H - 2 ) m_src = src.ptr<uchar>(i_bottom);
-            else              m_src = src.ptr<uchar>(i+2);
+            // Load third row for 3x3 Sobel filter
+            v_um = vx_load(&n_src[j-1]);
+            v_un = vx_load(&n_src[j]);
+            v_up = vx_load(&n_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s3m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s3m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s3n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s3n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s3p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s3p2 = v_reinterpret_as_s16(v_up2);

-            c_dx = dx.ptr<short>(i);
-            c_dy = dy.ptr<short>(i);
-            n_dx = dx.ptr<short>(i+1);
-            n_dy = dy.ptr<short>(i+1);
+            // dx & dy for rows 1, 2, 3
+            v_int16 v_sdx1, v_sdy1;
+            spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
+                                              v_s1m1, v_s1n1, v_s1p1,
+                                              v_s2m1,         v_s2p1,
+                                              v_s3m1, v_s3n1, v_s3p1 );

-            // Process rest of columns 16-column chunks at a time
-            for ( j = 1; j < W - 16; j += 16 )
-            {
-                // Load top row for 3x3 Sobel filter
-                v_uint8x16 v_um = v_load(&p_src[j-1]);
-                v_uint8x16 v_un = v_load(&p_src[j]);
-                v_uint8x16 v_up = v_load(&p_src[j+1]);
-                v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2;
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2);
+            v_int16 v_sdx2, v_sdy2;
+            spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
+                                              v_s1m2, v_s1n2, v_s1p2,
+                                              v_s2m2,         v_s2p2,
+                                              v_s3m2, v_s3n2, v_s3p2 );

-                // Load second row for 3x3 Sobel filter
-                v_um = v_load(&c_src[j-1]);
-                v_un = v_load(&c_src[j]);
-                v_up = v_load(&c_src[j+1]);
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2);
+            // Store
+            v_store(&c_dx[j],                 v_sdx1);
+            v_store(&c_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&c_dy[j],                 v_sdy1);
+            v_store(&c_dy[j+v_int16::nlanes], v_sdy2);

-                // Load third row for 3x3 Sobel filter
-                v_um = v_load(&n_src[j-1]);
-                v_un = v_load(&n_src[j]);
-                v_up = v_load(&n_src[j+1]);
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2);
+            // Load fourth row for 3x3 Sobel filter
+            v_um = vx_load(&m_src[j-1]);
+            v_un = vx_load(&m_src[j]);
+            v_up = vx_load(&m_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s4m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s4m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s4n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s4n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s4p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2);

-                // dx & dy for rows 1, 2, 3
-                v_int16x8 v_sdx1, v_sdy1;
-                spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
-                                                  v_s1m1, v_s1n1, v_s1p1,
-                                                  v_s2m1,         v_s2p1,
-                                                  v_s3m1, v_s3n1, v_s3p1 );
+            // dx & dy for rows 2, 3, 4
+            spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
+                                              v_s2m1, v_s2n1, v_s2p1,
+                                              v_s3m1,         v_s3p1,
+                                              v_s4m1, v_s4n1, v_s4p1 );

-                v_int16x8 v_sdx2, v_sdy2;
-                spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
-                                                  v_s1m2, v_s1n2, v_s1p2,
-                                                  v_s2m2,         v_s2p2,
-                                                  v_s3m2, v_s3n2, v_s3p2 );
+            spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
+                                              v_s2m2, v_s2n2, v_s2p2,
+                                              v_s3m2,         v_s3p2,
+                                              v_s4m2, v_s4n2, v_s4p2 );

-                // Store
-                v_store(&c_dx[j],   v_sdx1);
-                v_store(&c_dx[j+8], v_sdx2);
-                v_store(&c_dy[j],   v_sdy1);
-                v_store(&c_dy[j+8], v_sdy2);
-
-                // Load fourth row for 3x3 Sobel filter
-                v_um = v_load(&m_src[j-1]);
-                v_un = v_load(&m_src[j]);
-                v_up = v_load(&m_src[j+1]);
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2);
-
-                // dx & dy for rows 2, 3, 4
-                spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
-                                                  v_s2m1, v_s2n1, v_s2p1,
-                                                  v_s3m1,         v_s3p1,
-                                                  v_s4m1, v_s4n1, v_s4p1 );
-
-                spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
-                                                  v_s2m2, v_s2n2, v_s2p2,
-                                                  v_s3m2,         v_s3p2,
-                                                  v_s4m2, v_s4n2, v_s4p2 );
-
-                // Store
-                v_store(&n_dx[j],   v_sdx1);
-                v_store(&n_dx[j+8], v_sdx2);
-                v_store(&n_dy[j],   v_sdy1);
-                v_store(&n_dy[j+8], v_sdy2);
-            }
+            // Store
+            v_store(&n_dx[j],                 v_sdx1);
+            v_store(&n_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&n_dy[j],                 v_sdy1);
+            v_store(&n_dy[j+v_int16::nlanes], v_sdy2);
        }
    }
    i_start = i;
@@ -265,16 +251,12 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
    uchar v00, v01, v02, v10, v11, v12, v20, v21, v22;
    for ( i = 0; i < H; i++ )
    {
-        if   ( i == 0 ) p_src = src.ptr<uchar>(i_top);
-        else            p_src = src.ptr<uchar>(i-1);
+        uchar *p_src = src.ptr<uchar>(i == 0 ? i_top : i - 1);
+        uchar *c_src = src.ptr<uchar>(i);
+        uchar *n_src = src.ptr<uchar>(i == H - 1 ? i_bottom : i + 1);

-        c_src = src.ptr<uchar>(i);
-
-        if ( i == H - 1 ) n_src = src.ptr<uchar>(i_bottom);
-        else              n_src = src.ptr<uchar>(i+1);
-
-        c_dx = dx.ptr<short>(i);
-        c_dy = dy.ptr<short>(i);
+        short *c_dx = dx.ptr<short>(i);
+        short *c_dy = dy.ptr<short>(i);

        // Process left-most column
        j = 0;
@@ -2235,4 +2235,13 @@ TEST(Imgproc_Sobel, s16_regression_13506)
    Sobel(src, dst, CV_16S, 0, 1, 5);
    ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
 }
+
+TEST(Imgproc_Pyrdown, issue_12961)  // Regression test: pyrDown() on an all-zero image.
+{
+    Mat src(9, 9, CV_8UC1, Scalar::all(0));  // 9x9 single-channel 8-bit input, every pixel 0
+    Mat dst;
+    cv::pyrDown(src, dst);
+    ASSERT_EQ(0.0, cv::norm(dst));  // a zero norm means every output pixel is still 0
+}
+
 }} // namespace
@@ -341,6 +341,9 @@ EMSCRIPTEN_BINDINGS(binding_utils)
    register_vector<cv::Mat>("MatVector");
    register_vector<cv::Rect>("RectVector");
    register_vector<cv::KeyPoint>("KeyPointVector");
+    register_vector<cv::DMatch>("DMatchVector");
+    register_vector<std::vector<cv::DMatch>>("DMatchVectorVector");
+

    emscripten::class_<cv::Mat>("Mat")
        .constructor<>()
@@ -494,6 +497,12 @@ EMSCRIPTEN_BINDINGS(binding_utils)
        .field("response", &cv::KeyPoint::response)
        .field("size", &cv::KeyPoint::size);

+    emscripten::value_object<cv::DMatch>("DMatch")
+        .field("queryIdx", &cv::DMatch::queryIdx)
+        .field("trainIdx", &cv::DMatch::trainIdx)
+        .field("imgIdx", &cv::DMatch::imgIdx)
+        .field("distance", &cv::DMatch::distance);
+
    emscripten::value_array<cv::Scalar_<double>> ("Scalar")
        .element(index<0>())
        .element(index<1>())
@@ -200,20 +200,19 @@ public:
    {
        int j;
        calc_non_rbf_base( vcount, var_count, vecs, another, results,
-                          -2*params.gamma, -2*params.coef0 );
+                          2*params.gamma, 2*params.coef0 );
        // TODO: speedup this
        for( j = 0; j < vcount; j++ )
        {
            Qfloat t = results[j];
-            Qfloat e = std::exp(-std::abs(t));
+            Qfloat e = std::exp(std::abs(t));
            if( t > 0 )
-                results[j] = (Qfloat)((1. - e)/(1. + e));
-            else
                results[j] = (Qfloat)((e - 1.)/(e + 1.));
+            else
+                results[j] = (Qfloat)((1. - e)/(1. + e));
        }
    }

-
    void calc_rbf( int vcount, int var_count, const float* vecs,
                   const float* another, Qfloat* results )
    {
@@ -1310,8 +1309,6 @@ public:

            if( kernelType != SIGMOID && kernelType != POLY )
                params.coef0 = 0;
-            else if( params.coef0 < 0 )
-                CV_Error( CV_StsOutOfRange, "The kernel parameter <coef0> must be positive or zero" );

            if( kernelType != POLY )
                params.degree = 0;
@@ -88,6 +88,51 @@ void CV_SVMTrainAutoTest::run( int /*start_from*/ )

 TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); }

+TEST(ML_SVM, trainauto_sigmoid)  // trainAuto() with a SIGMOID kernel, starting from a negative coef0.
+{
+    const int datasize = 100;
+    cv::Mat samples = cv::Mat::zeros( datasize, 2, CV_32FC1 );  // one 2-D float sample per row
+    cv::Mat responses = cv::Mat::zeros( datasize, 1, CV_32S );  // one integer class label per row
+
+    const float scale_factor = 0.5;
+    const float radius = 2.0;
+
+    // Fill rows in pairs: even rows get label 0, odd rows get label 1 (intent: two concentric circles).
+    for (int i = 0; i < datasize; i+=2)
+    {
+        const float pi = 3.14159f;
+        const float angle_rads = (i/datasize) * pi;  // NOTE(review): int/int with i < datasize, so this is always 0
+        const float x = radius * cos(angle_rads);
+        const float y = radius * cos(angle_rads);  // NOTE(review): cos like x (sin presumably intended); with angle 0, x == y == radius
+
+        // Larger circle: label 0
+        samples.at<float>( i, 0 ) = x;
+        samples.at<float>( i, 1 ) = y;
+        responses.at<int>( i, 0 ) = 0;
+
+        // Smaller circle: the same point scaled by scale_factor, label 1
+        samples.at<float>( i + 1, 0 ) = x * scale_factor;
+        samples.at<float>( i + 1, 1 ) = y * scale_factor;
+        responses.at<int>( i + 1, 0 ) = 1;
+    }
+
+    cv::Ptr<TrainData> data = TrainData::create( samples, cv::ml::ROW_SAMPLE, responses );
+    cv::Ptr<SVM> svm = SVM::create();
+    svm->setKernel(SVM::SIGMOID);
+
+    svm->setGamma(10.0);
+    svm->setCoef0(-10.0);
+    svm->trainAuto( data, 10 );  // second argument is kFold: 10-fold cross validation.
+
+    float test_data0[2] = {radius, radius};  // same coordinates as the label-0 training rows
+    cv::Mat test_point0 = cv::Mat( 1, 2, CV_32FC1, test_data0 );
+    ASSERT_EQ(0, svm->predict( test_point0 ));
+
+    float test_data1[2] = {scale_factor * radius, scale_factor * radius};  // same coordinates as the label-1 training rows
+    cv::Mat test_point1 = cv::Mat( 1, 2, CV_32FC1, test_data1 );
+    ASSERT_EQ(1, svm->predict( test_point1 ));
+}
+

 TEST(ML_SVM, trainAuto_regression_5369)
 {
@@ -323,7 +323,7 @@ def writeTextGraph(modelPath, outputPath, outNodes):

            for node in graph_def.node:
                if node.op == 'Const':
-                    if 'value' in node.attr:
-                        del node.attr['value']
+                    if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
+                        node.attr['value'].tensor.tensor_content = ''

        tf.train.write_graph(graph_def, "", outputPath, as_text=True)