diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 95643a287b..d9fd4e99a1 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -148,7 +148,13 @@ private: #else cv::dnn::Net net; cv::dnn::LayerParams lp; - net.addLayerToPrev("testLayer", "Identity", lp); + lp.set("kernel_size", 1); + lp.set("num_output", 1); + lp.set("bias_term", false); + lp.type = "Convolution"; + lp.name = "testLayer"; + lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1))); + net.addLayerToPrev(lp.name, lp.type, lp); net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE); net.setPreferableTarget(target); static int inpDims[] = {1, 2, 3, 4}; @@ -2676,7 +2682,7 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin) backendNode->net = Ptr(new InfEngineBackendNet(ieNet)); for (auto& it : ieNet.getOutputsInfo()) { - Ptr cvLayer(new InfEngineBackendLayer(it.second)); + Ptr cvLayer(new InfEngineBackendLayer(ieNet)); InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str()); CV_Assert(ieLayer); @@ -2871,8 +2877,7 @@ void Net::forward(std::vector >& outputBlobs, std::vector pins; for (int i = 0; i < outBlobNames.size(); i++) { - std::vector lp = impl->getLayerOutPins(outBlobNames[i]); - pins.insert(pins.end(), lp.begin(), lp.end()); + pins.push_back(impl->getPinByAlias(outBlobNames[i])); } impl->setUpNet(pins); @@ -2885,9 +2890,10 @@ void Net::forward(std::vector >& outputBlobs, for (int i = 0; i < outBlobNames.size(); i++) { std::vector lp = impl->getLayerOutPins(outBlobNames[i]); - for (int i = 0; i < lp.size(); i++) + outputBlobs[i].resize(lp.size()); + for (int j = 0; j < lp.size(); j++) { - outputBlobs[i].push_back(impl->getBlob(lp[i])); + outputBlobs[i][j] = impl->getBlob(lp[j]); } } } diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 9f8590bea7..96336808a0 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -110,14 +110,25 @@ public: virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE -#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) - InferenceEngine::Builder::SplitLayer ieLayer(name); - ieLayer.setOutputPorts({InferenceEngine::Port()}); - return Ptr(new InfEngineBackendNode(ieLayer)); -#else InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]); CV_Assert(!input->dims.empty()); - +#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) + InferenceEngine::Builder::Layer ieLayer(name); + ieLayer.setName(name); + if (preferableTarget == DNN_TARGET_MYRIAD) + { + ieLayer.setType("Copy"); + } + else + { + ieLayer.setType("Split"); + ieLayer.getParameters()["axis"] = input->dims.size() - 1; + ieLayer.getParameters()["out_sizes"] = input->dims[0]; + } + ieLayer.setInputPorts(std::vector(1)); + ieLayer.setOutputPorts(std::vector(1)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#else InferenceEngine::LayerParams lp; lp.name = name; lp.type = "Split"; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 21a13c8d47..57f6054538 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -281,7 +281,7 @@ public: const int outCn = blobs[0].size[0]; // prepare weightsMat where each row is aligned and has enough zero padding on the right to // use vectorized (i.e. 
with intrinsics) loops without tail processing - Mat wm = blobs[0].reshape(1, outCn).clone(); + Mat wm = blobs[0].reshape(1, outCn); if( wm.step1() % VEC_ALIGN != 0 ) { int newcols = (int)alignSize(wm.step1(), VEC_ALIGN); @@ -374,6 +374,10 @@ public: if (!w.empty()) { + // Keep origin weights unchanged. + if (weightsMat.data == blobs[0].data) + weightsMat = weightsMat.clone(); + Mat originWeights = blobs[0].reshape(1, outCn); for (int i = 0; i < outCn; ++i) { @@ -551,13 +555,13 @@ public: #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5) InferenceEngine::Builder::ConvolutionLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setDilation({dilation.height, dilation.width}); - ieLayer.setPaddingsBegin({pad.height, pad.width}); - ieLayer.setPaddingsEnd({pad.height, pad.width}); - ieLayer.setGroup(group); - ieLayer.setOutDepth(outCn); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width}); + ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setGroup((size_t)group); + ieLayer.setOutDepth((size_t)outCn); ieLayer.setWeights(ieWeights); if (ieBiases) @@ -1220,7 +1224,7 @@ public: #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { - if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width)) + if (INF_ENGINE_RELEASE >= 2018050000 && (adjustPad.height || adjustPad.width)) return false; const int outGroupCn = blobs[0].size[1]; // Weights are in IOHW layout @@ -1783,13 +1787,13 @@ public: InferenceEngine::Builder::DeconvolutionLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setDilation({dilation.height, dilation.width}); - ieLayer.setPaddingsBegin({pad.height, pad.width}); - ieLayer.setPaddingsEnd({pad.height, pad.width}); - ieLayer.setGroup(group); - ieLayer.setOutDepth(numOutput); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width}); + ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width}); + ieLayer.setGroup((size_t)group); + ieLayer.setOutDepth((size_t)numOutput); ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW)); if (hasBias()) diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index bfcc1068e1..5b357fec91 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -299,10 +299,10 @@ public: if (type == MAX || type == AVE) { InferenceEngine::Builder::PoolingLayer ieLayer(name); - ieLayer.setKernel({kernel.height, kernel.width}); - ieLayer.setStrides({stride.height, stride.width}); - ieLayer.setPaddingsBegin({pad_t, pad_l}); - ieLayer.setPaddingsEnd({pad_b, pad_r}); + ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width}); + ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width}); + ieLayer.setPaddingsBegin({(size_t)pad_t, (size_t)pad_l}); + ieLayer.setPaddingsEnd({(size_t)pad_b, (size_t)pad_r}); ieLayer.setPoolingType(type == MAX ? 
InferenceEngine::Builder::PoolingLayer::PoolingType::MAX : InferenceEngine::Builder::PoolingLayer::PoolingType::AVG); diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 98de907b9e..1d21021e34 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -82,7 +82,7 @@ void InfEngineBackendNet::connect(const std::vector >& input CV_Assert(it != layers.end()); const int layerId = it->second; - for (int i = 0; i < inpWrappers.size(); ++i) + for (size_t i = 0; i < inpWrappers.size(); ++i) { const auto& inp = inpWrappers[i]; const std::string& inpName = inp->dataPtr->name; @@ -103,7 +103,7 @@ void InfEngineBackendNet::connect(const std::vector >& input else inpId = it->second; - netBuilder.connect(inpId, {layerId, i}); + netBuilder.connect((size_t)inpId, {(size_t)layerId, i}); unconnectedLayersIds.erase(inpId); } CV_Assert(!outputs.empty()); @@ -119,7 +119,7 @@ void InfEngineBackendNet::init(int targetId) for (int id : unconnectedLayersIds) { InferenceEngine::Builder::OutputLayer outLayer("myconv1"); - netBuilder.addLayer({id}, outLayer); + netBuilder.addLayer({InferenceEngine::PortInfo(id)}, outLayer); } cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build())); } @@ -718,19 +718,33 @@ Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob) return Mat(size, CV_32F, (void*)blob->buffer()); } -InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_) -{ - output = output_; -} - bool InfEngineBackendLayer::getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, std::vector &internals) const { - std::vector dims = output->dims; - std::vector shape(dims.rbegin(), dims.rend()); - outputs.assign(1, shape); + InferenceEngine::ICNNNetwork::InputShapes inShapes = t_net.getInputShapes(); + InferenceEngine::ICNNNetwork::InputShapes::iterator itr; + bool equal_flag = true; + size_t i = 0; + for (itr = inShapes.begin(); itr != inShapes.end(); ++itr) + { + InferenceEngine::SizeVector currentInShape(inputs[i].begin(), inputs[i].end()); + if (itr->second != currentInShape) + { + itr->second = currentInShape; + equal_flag = false; + } + i++; + } + + if (!equal_flag) + { + InferenceEngine::CNNNetwork curr_t_net(t_net); + curr_t_net.reshape(inShapes); + } + std::vector dims = t_net.getOutputsInfo()[name]->getDims(); + outputs.push_back(MatShape(dims.begin(), dims.end())); return false; } diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index a224767f8d..1e35612555 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -260,7 +260,7 @@ InferenceEngine::TBlob::Ptr convertFp16(const InferenceEngine::Blob::Pt class InfEngineBackendLayer : public Layer { public: - InfEngineBackendLayer(const InferenceEngine::DataPtr& output); + InfEngineBackendLayer(const InferenceEngine::CNNNetwork &t_net_) : t_net(t_net_) {}; virtual bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, @@ -273,7 +273,7 @@ public: virtual bool supportBackend(int backendId) CV_OVERRIDE; private: - InferenceEngine::DataPtr output; + InferenceEngine::CNNNetwork t_net; }; #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 62e625f03c..06aec7da13 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -236,6 +236,10 @@ TEST_P(Test_Caffe_layers, Dropout) TEST_P(Test_Caffe_layers, Concat) { +#if 
defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE > 2018050000 + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) + throw SkipTestException(""); +#endif testLayerUsingCaffeModels("layer_concat"); testLayerUsingCaffeModels("layer_concat_optim", true, false); testLayerUsingCaffeModels("layer_concat_shared_input", true, false); @@ -923,8 +927,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) { Target targetId = GetParam(); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt")); - Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); Mat inp = blobFromNPY(_tf("blob.npy")); @@ -935,22 +940,15 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy) net.setInput(inp); net.setPreferableTarget(targetId); - if (targetId != DNN_TARGET_MYRIAD) - { - Mat out = net.forward(); + Mat out = net.forward(); - normAssert(outDefault, out); + double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.4e-3 : 1e-5; + double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4; + normAssert(outDefault, out, "", l1, lInf); - std::vector outLayers = net.getUnconnectedOutLayers(); - ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge"); - ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat"); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + std::vector outLayers = net.getUnconnectedOutLayers(); + ASSERT_EQ(net.getLayer(outLayers[0])->name, "output"); + ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution"); } TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) @@ -962,23 +960,16 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) randu(inputs[0], 0, 255); inputs[0].convertTo(inputs[1], CV_32F); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; + Mat outs[2]; for (int i = 0; i < 2; ++i) { - Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin")); net.setPreferableTarget(targetId); net.setInput(inputs[i]); - if (targetId != DNN_TARGET_MYRIAD) - { - outs[i] = net.forward(); - ASSERT_EQ(outs[i].type(), CV_32F); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + outs[i] = net.forward(); + ASSERT_EQ(outs[i].type(), CV_32F); } if (targetId != DNN_TARGET_MYRIAD) normAssert(outs[0], outs[1]); @@ -1008,8 +999,8 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT, // net.save('/path/to/caffemodel') // // 3. Convert using ModelOptimizer. 
-typedef testing::TestWithParam > Test_DLDT_two_inputs; -TEST_P(Test_DLDT_two_inputs, as_IR) +typedef testing::TestWithParam > > Test_DLDT_two_inputs_3dim; +TEST_P(Test_DLDT_two_inputs_3dim, as_IR) { int firstInpType = get<0>(GetParam()); int secondInpType = get<1>(GetParam()); @@ -1020,32 +1011,39 @@ TEST_P(Test_DLDT_two_inputs, as_IR) throw SkipTestException("Test is enabled starts from OpenVINO 2018R4"); #endif - Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); - int inpSize[] = {1, 2, 3}; - Mat firstInp(3, &inpSize[0], firstInpType); - Mat secondInp(3, &inpSize[0], secondInpType); + std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : ""; + Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin")); + std::vector inpSize = get<3>(GetParam()); + Mat firstInp(3, inpSize.data(), firstInpType); + Mat secondInp(3, inpSize.data(), secondInpType); randu(firstInp, 0, 255); randu(secondInp, 0, 255); net.setInput(firstInp, "data"); net.setInput(secondInp, "second_input"); net.setPreferableTarget(targetId); - if (targetId != DNN_TARGET_MYRIAD) - { - Mat out = net.forward(); - Mat ref; - cv::add(firstInp, secondInp, ref, Mat(), CV_32F); - normAssert(out, ref); - } - else - { - // An assertion is expected because the model is in FP32 format but - // Myriad plugin supports only FP16 models. - ASSERT_ANY_THROW(net.forward()); - } + double l1 = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) && + (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.06 : 0.0; + double lInf = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) && + (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.23 : 0.0; + + Mat out = net.forward(); + + Mat ref; + cv::add(firstInp, secondInp, ref, Mat(), CV_32F); + normAssert(out, ref, "", l1, lInf); } +std::vector< std::vector > list_sizes{ {1, 2, 3}, {3, 2, 1}, {5, 5, 5}, {13, 7, 11} }; + +INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs_3dim, Combine( + Values(CV_8U, CV_32F), Values(CV_8U, CV_32F), + testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)), + testing::ValuesIn(list_sizes) +)); + +typedef testing::TestWithParam > Test_DLDT_two_inputs; TEST_P(Test_DLDT_two_inputs, as_backend) { static const float kScale = 0.5f; diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index fa528b5c4b..60e3313048 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -308,4 +308,38 @@ TEST_P(DeprecatedForward, CustomLayerWithFallback) INSTANTIATE_TEST_CASE_P(/**/, DeprecatedForward, dnnBackendsAndTargets()); +TEST(Net, forwardAndRetrieve) +{ + std::string prototxt = + "input: \"data\"\n" + "layer {\n" + " name: \"testLayer\"\n" + " type: \"Slice\"\n" + " bottom: \"data\"\n" + " top: \"firstCopy\"\n" + " top: \"secondCopy\"\n" + " slice_param {\n" + " axis: 0\n" + " slice_point: 2\n" + " }\n" + "}"; + Net net = readNetFromCaffe(&prototxt[0], prototxt.size()); + net.setPreferableBackend(DNN_BACKEND_OPENCV); + + Mat inp(4, 5, CV_32F); + randu(inp, -1, 1); + net.setInput(inp); + + std::vector outNames; + outNames.push_back("testLayer"); + std::vector > outBlobs; + + net.forward(outBlobs, outNames); + + EXPECT_EQ(outBlobs.size(), 1); + EXPECT_EQ(outBlobs[0].size(), 2); + normAssert(outBlobs[0][0], inp.rowRange(0, 2), "first part"); + normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part"); +} + }} // namespace diff --git a/modules/dnn/test/test_onnx_importer.cpp 
b/modules/dnn/test/test_onnx_importer.cpp index acdd66631c..217ef34421 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -395,7 +395,7 @@ TEST_P(Test_ONNX_nets, DenseNet121) TEST_P(Test_ONNX_nets, Inception_v1) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException("Test is disabled for OpenVINO 2018R5"); #endif diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index b20b166551..7ddda7f03a 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -241,7 +241,7 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten) TEST_P(Test_TensorFlow_layers, leaky_relu) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) throw SkipTestException(""); #endif @@ -388,7 +388,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException("Unstable test case"); #endif diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index 7503eb9321..e4092ccb16 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -230,4 +230,27 @@ PERF_TEST_P(Size_MatType_BorderType, blur5x5, SANITY_CHECK(dst, 1); } +///////////// BlendLinear //////////////////////// +PERF_TEST_P(Size_MatType, BlendLinear, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p, sz2160p), + testing::Values(CV_8UC1, CV_32FC1, CV_8UC3, CV_32FC3, CV_8UC4, CV_32FC4) + ) + ) +{ + const Size srcSize = get<0>(GetParam()); + const int srcType = get<1>(GetParam()); + + Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType); + Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1); + + declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst); + randu(weights1, 0, 1); + randu(weights2, 0, 1); + + TEST_CYCLE() blendLinear(src1, src2, weights1, weights2, dst); + + SANITY_CHECK_NOTHING(); +} + } // namespace diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp index 1a4ad0d525..e0ee9ec0c5 100644 --- a/modules/imgproc/src/blend.cpp +++ b/modules/imgproc/src/blend.cpp @@ -48,44 +48,44 @@ #include "opencv2/core/hal/intrin.hpp" namespace cv { -#if CV_SIMD128 -static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2) +#if CV_SIMD +static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2) { - const v_float32x4 v_eps = v_setall_f32(1e-5f); - v_float32x4 v_denom = v_w1 + v_w2 + v_eps; + const v_float32 v_eps = vx_setall_f32(1e-5f); + v_float32 v_denom = v_w1 + v_w2 + v_eps; return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom; } -static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) +static inline v_float32 blend(const v_float32& 
v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset) { - v_float32x4 v_w1 = v_load(w_ptr1 + offset); - v_float32x4 v_w2 = v_load(w_ptr2 + offset); + v_float32 v_w1 = vx_load(w_ptr1 + offset); + v_float32 v_w2 = vx_load(w_ptr2 + offset); return blend(v_src1, v_src2, v_w1, v_w2); } -static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec) +static inline v_uint32 saturate_f32_u32(const v_float32& vec) { - const v_int32x4 z = v_setzero_s32(); - const v_int32x4 x = v_setall_s32(255); + const v_int32 z = vx_setzero_s32(); + const v_int32 x = vx_setall_s32(255); return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x)); } -static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3) +static inline v_uint8 pack_f32tou8(v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3) { - v_uint32x4 a = saturate_f32_u32(val0); - v_uint32x4 b = saturate_f32_u32(val1); - v_uint32x4 c = saturate_f32_u32(val2); - v_uint32x4 d = saturate_f32_u32(val3); - v_uint16x8 e = v_pack(a, b); - v_uint16x8 f = v_pack(c, d); + v_uint32 a = saturate_f32_u32(val0); + v_uint32 b = saturate_f32_u32(val1); + v_uint32 c = saturate_f32_u32(val2); + v_uint32 d = saturate_f32_u32(val3); + v_uint16 e = v_pack(a, b); + v_uint16 f = v_pack(c, d); return v_pack(e, f); } -static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3) +static inline void store_pack_f32tou8(uchar* ptr, v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3) { v_store((ptr), pack_f32tou8(val0, val1, val2, val3)); } -static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3) +static inline void expand_u8tof32(const v_uint8& src, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3) { - v_uint16x8 a0, a1; + v_uint16 a0, a1; v_expand(src, a0, a1); - v_uint32x4 b0, b1,b2,b3; + v_uint32 b0, b1,b2,b3; v_expand(a0, b0, b1); v_expand(a1, b2, b3); dst0 = v_cvt_f32(v_reinterpret_as_s32(b0)); @@ -93,71 +93,69 @@ static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_fl dst2 = v_cvt_f32(v_reinterpret_as_s32(b2)); dst3 = v_cvt_f32(v_reinterpret_as_s32(b3)); } -static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3) +static inline void load_expand_u8tof32(const uchar* ptr, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3) { - v_uint8x16 a = v_load((ptr)); + v_uint8 a = vx_load((ptr)); expand_u8tof32(a, dst0, dst1, dst2, dst3); } -int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn); -int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn); -int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn) +int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn); +int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn); +int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn) { - 
int step = v_uint8x16::nlanes * cn; - int weight_step = v_uint8x16::nlanes; switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13; - v_float32x4 v_src20, v_src21, v_src22, v_src23; + v_float32 v_src10, v_src11, v_src12, v_src13; + v_float32 v_src20, v_src21, v_src22, v_src23; load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13); load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); - v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12); + v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset); + v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes); + v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; case 2: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_uint8x16 v_src10, v_src11, v_src20, v_src21; + v_uint8 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); v_load_deinterleave(src2 + x, v_src20, v_src21); - v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113; - v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213; + v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113; + v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213; expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103); expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113); expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203); expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213); - v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); - v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); - v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4); - v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8); - v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12); - v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12); + v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset); + v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset); + v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes); + v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 
v_float32::nlanes); + v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes); + v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes); + v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes); - v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); - v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); + v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6); + v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7); v_store_interleave(dst + x, v_dsta, v_dstb); } break; case 3: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes) { - v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; + v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22); - v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123; - v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223; + v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123; + v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223; expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103); expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113); expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123); @@ -165,14 +163,14 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213); expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223); - v_float32x4 v_w10 = v_load(weights1 + weight_offset); - v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4); - v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8); - v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12); - v_float32x4 v_w20 = v_load(weights2 + weight_offset); - v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4); - v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8); - v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12); + v_float32 v_w10 = vx_load(weights1 + weight_offset); + v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes); + v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes); + v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes); + v_float32 v_w20 = vx_load(weights2 + weight_offset); + v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes); + v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes); + v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes); v_src100 = blend(v_src100, v_src200, v_w10, v_w20); v_src110 = blend(v_src110, v_src210, v_w10, v_w20); v_src120 = blend(v_src120, v_src220, v_w10, v_w20); @@ -187,34 +185,36 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight v_src123 = blend(v_src123, v_src223, v_w13, v_w23); - v_uint8x16 
v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103); - v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113); - v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123); + v_uint8 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103); + v_uint8 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113); + v_uint8 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2); } break; case 4: - step = v_uint8x16::nlanes; - weight_step = v_float32x4::nlanes; - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step) + for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17; - v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27; + v_float32 v_src10, v_src11, v_src12, v_src13; + v_float32 v_src20, v_src21, v_src22, v_src23; load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13); load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17); - v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27); + v_float32 v_w10, v_w11, v_w12, v_w13, v_w20, v_w21, v_w22, v_w23, v_w0, v_w1; + v_w10 = vx_load(weights1 + weight_offset); + v_zip(v_w10, v_w10, v_w0, v_w1); + v_zip(v_w0, v_w0, v_w10, v_w11); + v_zip(v_w1, v_w1, v_w12, v_w13); + v_w20 = vx_load(weights2 + weight_offset); + v_zip(v_w20, v_w20, v_w0, v_w1); + v_zip(v_w0, v_w0, v_w20, v_w21); + v_zip(v_w1, v_w1, v_w22, v_w23); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); - v_src10 = blend(v_src14, v_src24, v_w1, v_w2); - v_src11 = blend(v_src15, v_src25, v_w1, v_w2); - v_src12 = blend(v_src16, v_src26, v_w1, v_w2); - v_src13 = blend(v_src17, v_src27, v_w1, v_w2); + v_float32 v_dst0, v_dst1, v_dst2, v_dst3; + v_dst0 = blend(v_src10, v_src20, v_w10, v_w20); + v_dst1 = blend(v_src11, v_src21, v_w11, v_w21); + v_dst2 = blend(v_src12, v_src22, v_w12, v_w22); + v_dst3 = blend(v_src13, v_src23, v_w13, v_w23); - v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3; - v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3); store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } break; @@ -224,68 +224,67 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight return x; } -int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn) +int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn) { - int step = v_float32x4::nlanes*cn; switch(cn) { case 1: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src1 = v_load(src1 + x); - v_float32x4 v_src2 = v_load(src2 + x); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_src1 = vx_load(src1 + x); + v_float32 v_src2 = vx_load(src2 + x); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + 
weight_offset); - v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2); + v_float32 v_dst = blend(v_src1, v_src2, v_w1, v_w2); v_store(dst + x, v_dst); } break; case 2: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src20, v_src21; + v_float32 v_src10, v_src11, v_src20, v_src21; v_load_deinterleave(src1 + x, v_src10, v_src11); v_load_deinterleave(src2 + x, v_src20, v_src21); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1); } break; case 3: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; + v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); - v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2); } break; case 4: - for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes) + for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes) { - v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; + v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13); v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23); - v_float32x4 v_w1 = v_load(weights1 + weight_offset); - v_float32x4 v_w2 = v_load(weights2 + weight_offset); + v_float32 v_w1 = vx_load(weights1 + weight_offset); + v_float32 v_w2 = vx_load(weights2 + weight_offset); - v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); - v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); - v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); - v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2); + v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2); + v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2); + v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2); + v_float32 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2); v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3); } @@ -321,8 +320,8 @@ public: T * const dst_row 
= dst->ptr(y); int x = 0; - #if CV_SIMD128 - x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); + #if CV_SIMD + x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn); #endif for ( ; x < width; ++x) diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.cpp index 0606aec578..d565b9486d 100644 --- a/modules/imgproc/src/median_blur.cpp +++ b/modules/imgproc/src/median_blur.cpp @@ -110,15 +110,19 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2; CV_Assert(cn > 0 && cn <= 4); size_t sstep = _src.step, dstep = _dst.step; - Histogram CV_DECL_ALIGNED(16) H[4]; - HT CV_DECL_ALIGNED(16) luc[4][16]; int STRIPE_SIZE = std::min( _dst.cols, 512/cn ); - std::vector _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16); - std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16); - HT* h_coarse = alignPtr(&_h_coarse[0], 16); - HT* h_fine = alignPtr(&_h_fine[0], 16); +#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16 +# define CV_ALIGNMENT CV_SIMD_WIDTH +#else +# define CV_ALIGNMENT 16 +#endif + + std::vector _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); + std::vector _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT); + HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT); + HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT); for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) { @@ -148,10 +152,14 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) const uchar* p0 = src + sstep * std::max( 0, i-r-1 ); const uchar* p1 = src + sstep * std::min( m-1, i+r ); - memset( H, 0, cn*sizeof(H[0]) ); - memset( luc, 0, cn*sizeof(luc[0]) ); for( c = 0; c < cn; c++ ) { + Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H; + HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16]; + + memset(&H, 0, sizeof(H)); + memset(luc, 0, sizeof(luc)); + // Update column histograms for the entire row. 
for( j = 0; j < n; j++ ) { @@ -163,21 +171,21 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) for (k = 0; k < 16; ++k) { #if CV_SIMD256 - v_store(H[c].fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H[c].fine[k])); + v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k])); #elif CV_SIMD128 - v_store(H[c].fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k])); - v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8)); + v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k])); + v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8)); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); + H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]); #endif } #if CV_SIMD256 - v_uint16x16 v_coarse = v256_load(H[c].coarse); + v_uint16x16 v_coarse = v256_load(H.coarse); #elif CV_SIMD128 - v_uint16x8 v_coarsel = v_load(H[c].coarse); - v_uint16x8 v_coarseh = v_load(H[c].coarse + 8); + v_uint16x8 v_coarsel = v_load(H.coarse); + v_uint16x8 v_coarseh = v_load(H.coarse + 8); #endif HT* px = h_coarse + 16 * n*c; for( j = 0; j < 2*r; ++j, px += 16 ) @@ -189,7 +197,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_coarseh += v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] += px[ind]; + H.coarse[ind] += px[ind]; #endif } @@ -201,24 +209,24 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) px = h_coarse + 16 * (n*c + std::min(j + r, n - 1)); #if CV_SIMD256 v_coarse += v256_load(px); - v_store(H[c].coarse, v_coarse); + v_store(H.coarse, v_coarse); #elif CV_SIMD128 v_coarsel += v_load(px); v_coarseh += v_load(px + 8); - v_store(H[c].coarse, v_coarsel); - v_store(H[c].coarse + 8, v_coarseh); + v_store(H.coarse, v_coarsel); + v_store(H.coarse + 8, v_coarseh); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] += px[ind]; + H.coarse[ind] += px[ind]; #endif // Find median at coarse level for ( k = 0; k < 16 ; ++k ) { - sum += H[c].coarse[k]; + sum += H.coarse[k]; if ( sum > t ) { - sum -= H[c].coarse[k]; + sum -= H.coarse[k]; break; } } @@ -231,7 +239,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_uint16x8 v_finel; v_uint16x8 v_fineh; #endif - if ( luc[c][k] <= j-r ) + if ( luc[k] <= j-r ) { #if CV_SIMD256 v_fine = v256_setzero_u16(); @@ -239,10 +247,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_finel = v_setzero_u16(); v_fineh = v_setzero_u16(); #else - memset(&H[c].fine[k], 0, 16 * sizeof(HT)); + memset(&H.fine[k], 0, 16 * sizeof(HT)); #endif px = h_fine + 16 * (n*(16 * c + k) + j - r); - for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16) + for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16) { #if CV_SIMD256 v_fine += v256_load(px); @@ -251,11 +259,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_fineh += v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += px[ind]; + H.fine[k][ind] += px[ind]; #endif } - if ( luc[c][k] < j+r+1 ) + if ( luc[k] < j+r+1 ) { px = h_fine + 16 * (n*(16 * c + 
k) + (n - 1)); #if CV_SIMD256 @@ -265,50 +273,50 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n))); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]); + H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]); #endif - luc[c][k] = (HT)(j+r+1); + luc[k] = (HT)(j+r+1); } } else { #if CV_SIMD256 - v_fine = v256_load(H[c].fine[k]); + v_fine = v256_load(H.fine[k]); #elif CV_SIMD128 - v_finel = v_load(H[c].fine[k]); - v_fineh = v_load(H[c].fine[k] + 8); + v_finel = v_load(H.fine[k]); + v_fineh = v_load(H.fine[k] + 8); #endif px = h_fine + 16*n*(16 * c + k); - for ( ; luc[c][k] < j+r+1; ++luc[c][k] ) + for ( ; luc[k] < j+r+1; ++luc[k] ) { #if CV_SIMD256 - v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0)); + v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); #elif CV_SIMD128 - v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1) ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0)); - v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8); + v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0)); + v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind]; + H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind]; #endif } } px = h_coarse + 16 * (n*c + MAX(j - r, 0)); #if CV_SIMD256 - v_store(H[c].fine[k], v_fine); + v_store(H.fine[k], v_fine); v_coarse -= v256_load(px); #elif CV_SIMD128 - v_store(H[c].fine[k], v_finel); - v_store(H[c].fine[k] + 8, v_fineh); + v_store(H.fine[k], v_finel); + v_store(H.fine[k] + 8, v_fineh); v_coarsel -= v_load(px); v_coarseh -= v_load(px + 8); #else for (int ind = 0; ind < 16; ++ind) - H[c].coarse[ind] -= px[ind]; + H.coarse[ind] -= px[ind]; #endif /* Find median in segment */ - segment = H[c].fine[k]; + segment = H.fine[k]; for ( b = 0; b < 16 ; b++ ) { sum += segment[b]; diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index d212237a37..6aa9d0279f 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -112,6 +112,7 @@ struct PyrDownVec_32s8u v_rshr_pack_store<8>(dst + x, t0); x += v_uint16::nlanes; } + typedef int CV_DECL_ALIGNED(1) unaligned_int; for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes) { v_int32x4 r0, r1, r2, r3, r4, t0; @@ -122,7 +123,7 @@ struct PyrDownVec_32s8u r4 = v_load(row4 + x); t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2); - *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); + *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0(); } return x; diff --git a/modules/imgproc/src/spatialgradient.cpp b/modules/imgproc/src/spatialgradient.cpp index c942264e00..1aed1fa031 100644 --- a/modules/imgproc/src/spatialgradient.cpp +++ b/modules/imgproc/src/spatialgradient.cpp @@ -123,139 +123,125 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy, } } - // Pointer to row vectors - uchar *p_src, *c_src, *n_src; // 
previous, current, next row - short *c_dx, *c_dy; - int i_start = 0; int j_start = 0; -#if CV_SIMD128 - if(hasSIMD128()) +#if CV_SIMD + // Characters in variable names have the following meanings: + // u: unsigned char + // s: signed int + // + // [row][column] + // m: offset -1 + // n: offset 0 + // p: offset 1 + // Example: umn is offset -1 in row and offset 0 in column + for ( i = 0; i < H - 1; i += 2 ) { - uchar *m_src; - short *n_dx, *n_dy; + uchar *p_src = src.ptr(i == 0 ? i_top : i - 1); + uchar *c_src = src.ptr(i); + uchar *n_src = src.ptr(i+1); + uchar *m_src = src.ptr(i == H - 2 ? i_bottom : i + 2); - // Characters in variable names have the following meanings: - // u: unsigned char - // s: signed int - // - // [row][column] - // m: offset -1 - // n: offset 0 - // p: offset 1 - // Example: umn is offset -1 in row and offset 0 in column - for ( i = 0; i < H - 1; i += 2 ) + short *c_dx = dx.ptr(i); + short *c_dy = dy.ptr(i); + short *n_dx = dx.ptr(i+1); + short *n_dy = dy.ptr(i+1); + + // Process rest of columns 16-column chunks at a time + for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes) { - if ( i == 0 ) p_src = src.ptr(i_top); - else p_src = src.ptr(i-1); + // Load top row for 3x3 Sobel filter + v_uint8 v_um = vx_load(&p_src[j-1]); + v_uint8 v_un = vx_load(&p_src[j]); + v_uint8 v_up = vx_load(&p_src[j+1]); + v_uint16 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s1m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s1m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s1n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s1n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s1p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s1p2 = v_reinterpret_as_s16(v_up2); - c_src = src.ptr(i); - n_src = src.ptr(i+1); + // Load second row for 3x3 Sobel filter + v_um = vx_load(&c_src[j-1]); + v_un = vx_load(&c_src[j]); + v_up = vx_load(&c_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s2m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s2m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s2n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s2n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s2p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s2p2 = v_reinterpret_as_s16(v_up2); - if ( i == H - 2 ) m_src = src.ptr(i_bottom); - else m_src = src.ptr(i+2); + // Load third row for 3x3 Sobel filter + v_um = vx_load(&n_src[j-1]); + v_un = vx_load(&n_src[j]); + v_up = vx_load(&n_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s3m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s3m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s3n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s3n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s3p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s3p2 = v_reinterpret_as_s16(v_up2); - c_dx = dx.ptr(i); - c_dy = dy.ptr(i); - n_dx = dx.ptr(i+1); - n_dy = dy.ptr(i+1); + // dx & dy for rows 1, 2, 3 + v_int16 v_sdx1, v_sdy1; + spatialGradientKernel( v_sdx1, v_sdy1, + v_s1m1, v_s1n1, v_s1p1, + v_s2m1, v_s2p1, + v_s3m1, v_s3n1, v_s3p1 ); - // Process rest of columns 16-column chunks at a time - for ( j = 1; j < W - 16; j += 16 ) - { - // Load top row for 3x3 Sobel filter - v_uint8x16 v_um = v_load(&p_src[j-1]); - v_uint8x16 v_un = v_load(&p_src[j]); - v_uint8x16 v_up = v_load(&p_src[j+1]); - v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2; - v_expand(v_um, v_um1, v_um2); - 
v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2); + v_int16 v_sdx2, v_sdy2; + spatialGradientKernel( v_sdx2, v_sdy2, + v_s1m2, v_s1n2, v_s1p2, + v_s2m2, v_s2p2, + v_s3m2, v_s3n2, v_s3p2 ); - // Load second row for 3x3 Sobel filter - v_um = v_load(&c_src[j-1]); - v_un = v_load(&c_src[j]); - v_up = v_load(&c_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2); + // Store + v_store(&c_dx[j], v_sdx1); + v_store(&c_dx[j+v_int16::nlanes], v_sdx2); + v_store(&c_dy[j], v_sdy1); + v_store(&c_dy[j+v_int16::nlanes], v_sdy2); - // Load third row for 3x3 Sobel filter - v_um = v_load(&n_src[j-1]); - v_un = v_load(&n_src[j]); - v_up = v_load(&n_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2); + // Load fourth row for 3x3 Sobel filter + v_um = vx_load(&m_src[j-1]); + v_un = vx_load(&m_src[j]); + v_up = vx_load(&m_src[j+1]); + v_expand(v_um, v_um1, v_um2); + v_expand(v_un, v_un1, v_un2); + v_expand(v_up, v_up1, v_up2); + v_int16 v_s4m1 = v_reinterpret_as_s16(v_um1); + v_int16 v_s4m2 = v_reinterpret_as_s16(v_um2); + v_int16 v_s4n1 = v_reinterpret_as_s16(v_un1); + v_int16 v_s4n2 = v_reinterpret_as_s16(v_un2); + v_int16 v_s4p1 = v_reinterpret_as_s16(v_up1); + v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2); - // dx & dy for rows 1, 2, 3 - v_int16x8 v_sdx1, v_sdy1; - spatialGradientKernel( v_sdx1, v_sdy1, - v_s1m1, v_s1n1, v_s1p1, - v_s2m1, v_s2p1, - v_s3m1, v_s3n1, v_s3p1 ); + // dx & dy for rows 2, 3, 4 + spatialGradientKernel( v_sdx1, v_sdy1, + v_s2m1, v_s2n1, v_s2p1, + v_s3m1, v_s3p1, + v_s4m1, v_s4n1, v_s4p1 ); - v_int16x8 v_sdx2, v_sdy2; - spatialGradientKernel( v_sdx2, v_sdy2, - v_s1m2, v_s1n2, v_s1p2, - v_s2m2, v_s2p2, - v_s3m2, v_s3n2, v_s3p2 ); + spatialGradientKernel( v_sdx2, v_sdy2, + v_s2m2, v_s2n2, v_s2p2, + v_s3m2, v_s3p2, + v_s4m2, v_s4n2, v_s4p2 ); - // Store - v_store(&c_dx[j], v_sdx1); - v_store(&c_dx[j+8], v_sdx2); - v_store(&c_dy[j], v_sdy1); - v_store(&c_dy[j+8], v_sdy2); - - // Load fourth row for 3x3 Sobel filter - v_um = v_load(&m_src[j-1]); - v_un = v_load(&m_src[j]); - v_up = v_load(&m_src[j+1]); - v_expand(v_um, v_um1, v_um2); - v_expand(v_un, v_un1, v_un2); - v_expand(v_up, v_up1, v_up2); - v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1); - v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); - v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); - v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); - v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); - v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); - - // dx & dy for rows 2, 3, 4 - spatialGradientKernel( v_sdx1, v_sdy1, - v_s2m1, v_s2n1, v_s2p1, 
-                                   v_s3m1, v_s3p1,
-                                   v_s4m1, v_s4n1, v_s4p1 );
-
-            spatialGradientKernel( v_sdx2, v_sdy2,
-                                   v_s2m2, v_s2n2, v_s2p2,
-                                   v_s3m2, v_s3p2,
-                                   v_s4m2, v_s4n2, v_s4p2 );
-
-            // Store
-            v_store(&n_dx[j], v_sdx1);
-            v_store(&n_dx[j+8], v_sdx2);
-            v_store(&n_dy[j], v_sdy1);
-            v_store(&n_dy[j+8], v_sdy2);
-        }
+            // Store
+            v_store(&n_dx[j], v_sdx1);
+            v_store(&n_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&n_dy[j], v_sdy1);
+            v_store(&n_dy[j+v_int16::nlanes], v_sdy2);
         }
     }
     i_start = i;
@@ -265,16 +251,12 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
     uchar v00, v01, v02, v10, v11, v12, v20, v21, v22;
     for ( i = 0; i < H; i++ )
     {
-        if ( i == 0 ) p_src = src.ptr(i_top);
-        else          p_src = src.ptr(i-1);
+        uchar *p_src = src.ptr(i == 0 ? i_top : i - 1);
+        uchar *c_src = src.ptr(i);
+        uchar *n_src = src.ptr(i == H - 1 ? i_bottom : i + 1);
 
-        c_src = src.ptr(i);
-
-        if ( i == H - 1 ) n_src = src.ptr(i_bottom);
-        else              n_src = src.ptr(i+1);
-
-        c_dx = dx.ptr<short>(i);
-        c_dy = dy.ptr<short>(i);
+        short *c_dx = dx.ptr<short>(i);
+        short *c_dy = dy.ptr<short>(i);
 
         // Process left-most column
         j = 0;
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index 7749e4a59a..6329a47afa 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -2235,4 +2235,13 @@ TEST(Imgproc_Sobel, s16_regression_13506)
     Sobel(src, dst, CV_16S, 0, 1, 5);
     ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
 }
+
+TEST(Imgproc_Pyrdown, issue_12961)
+{
+    Mat src(9, 9, CV_8UC1, Scalar::all(0));
+    Mat dst;
+    cv::pyrDown(src, dst);
+    ASSERT_EQ(0.0, cv::norm(dst));
+}
+
 }} // namespace
diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp
index 554f95aa83..72efd6350a 100644
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@@ -341,6 +341,9 @@ EMSCRIPTEN_BINDINGS(binding_utils)
     register_vector<cv::Mat>("MatVector");
     register_vector<cv::Rect>("RectVector");
     register_vector<cv::KeyPoint>("KeyPointVector");
+    register_vector<cv::DMatch>("DMatchVector");
+    register_vector<std::vector<cv::DMatch>>("DMatchVectorVector");
+
 
     emscripten::class_<cv::Mat>("Mat")
         .constructor<>()
@@ -494,6 +497,12 @@ EMSCRIPTEN_BINDINGS(binding_utils)
         .field("response", &cv::KeyPoint::response)
         .field("size", &cv::KeyPoint::size);
 
+    emscripten::value_object<cv::DMatch>("DMatch")
+        .field("queryIdx", &cv::DMatch::queryIdx)
+        .field("trainIdx", &cv::DMatch::trainIdx)
+        .field("imgIdx", &cv::DMatch::imgIdx)
+        .field("distance", &cv::DMatch::distance);
+
     emscripten::value_array<cv::Scalar_<double>> ("Scalar")
         .element(index<0>())
         .element(index<1>())
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 02043ac929..0a0b84a364 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -200,20 +200,19 @@ public:
     {
         int j;
         calc_non_rbf_base( vcount, var_count, vecs, another, results,
-                           -2*params.gamma, -2*params.coef0 );
+                           2*params.gamma, 2*params.coef0 );
         // TODO: speedup this
         for( j = 0; j < vcount; j++ )
         {
             Qfloat t = results[j];
-            Qfloat e = std::exp(-std::abs(t));
+            Qfloat e = std::exp(std::abs(t));
             if( t > 0 )
-                results[j] = (Qfloat)((1. - e)/(1. + e));
-            else
                 results[j] = (Qfloat)((e - 1.)/(e + 1.));
+            else
+                results[j] = (Qfloat)((1. - e)/(1. + e));
         }
     }
-
     void calc_rbf( int vcount, int var_count, const float* vecs,
                    const float* another, Qfloat* results )
     {
@@ -1310,8 +1309,6 @@ public:
         if( kernelType != SIGMOID && kernelType != POLY )
             params.coef0 = 0;
-        else if( params.coef0 < 0 )
-            CV_Error( CV_StsOutOfRange, "The kernel parameter <coef0> must be positive or zero" );
 
         if( kernelType != POLY )
             params.degree = 0;
diff --git a/modules/ml/test/test_svmtrainauto.cpp b/modules/ml/test/test_svmtrainauto.cpp
index 6d7a73eaef..fcd83d3533 100644
--- a/modules/ml/test/test_svmtrainauto.cpp
+++ b/modules/ml/test/test_svmtrainauto.cpp
@@ -88,6 +88,51 @@ void CV_SVMTrainAutoTest::run( int /*start_from*/ )
 
 TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); }
 
+TEST(ML_SVM, trainauto_sigmoid)
+{
+    const int datasize = 100;
+    cv::Mat samples = cv::Mat::zeros( datasize, 2, CV_32FC1 );
+    cv::Mat responses = cv::Mat::zeros( datasize, 1, CV_32S );
+
+    const float scale_factor = 0.5;
+    const float radius = 2.0;
+
+    // Populate samples with data that can be split into two concentric circles
+    for (int i = 0; i < datasize; i+=2)
+    {
+        const float pi = 3.14159f;
+        const float angle_rads = (i/datasize) * pi;
+        const float x = radius * cos(angle_rads);
+        const float y = radius * cos(angle_rads);
+
+        // Larger circle
+        samples.at<float>( i, 0 ) = x;
+        samples.at<float>( i, 1 ) = y;
+        responses.at<int>( i, 0 ) = 0;
+
+        // Smaller circle
+        samples.at<float>( i + 1, 0 ) = x * scale_factor;
+        samples.at<float>( i + 1, 1 ) = y * scale_factor;
+        responses.at<int>( i + 1, 0 ) = 1;
+    }
+
+    cv::Ptr<TrainData> data = TrainData::create( samples, cv::ml::ROW_SAMPLE, responses );
+    cv::Ptr<SVM> svm = SVM::create();
+    svm->setKernel(SVM::SIGMOID);
+
+    svm->setGamma(10.0);
+    svm->setCoef0(-10.0);
+    svm->trainAuto( data, 10 );  // 2-fold cross validation.
+
+    float test_data0[2] = {radius, radius};
+    cv::Mat test_point0 = cv::Mat( 1, 2, CV_32FC1, test_data0 );
+    ASSERT_EQ(0, svm->predict( test_point0 ));
+
+    float test_data1[2] = {scale_factor * radius, scale_factor * radius};
+    cv::Mat test_point1 = cv::Mat( 1, 2, CV_32FC1, test_data1 );
+    ASSERT_EQ(1, svm->predict( test_point1 ));
+}
+
 TEST(ML_SVM, trainAuto_regression_5369)
 {
diff --git a/samples/dnn/tf_text_graph_common.py b/samples/dnn/tf_text_graph_common.py
index a644420780..5a8e62495d 100644
--- a/samples/dnn/tf_text_graph_common.py
+++ b/samples/dnn/tf_text_graph_common.py
@@ -323,7 +323,7 @@ def writeTextGraph(modelPath, outputPath, outNodes):
 
     for node in graph_def.node:
         if node.op == 'Const':
-            if 'value' in node.attr:
-                del node.attr['value']
+            if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
+                node.attr['value'].tensor.tensor_content = ''
 
     tf.train.write_graph(graph_def, "", outputPath, as_text=True)
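
Note on the imgproc/src/blend.cpp hunks above: they only widen the code from the fixed 128-bit v_float32x4/v_uint8x16 types to the width-agnostic v_float32/v_uint8 universal intrinsics so the same loops can run on wider registers where available; the per-element arithmetic is unchanged. For reference, a minimal scalar sketch of what blendLinear computes for every channel value is shown below (the helper name is illustrative, but the 1e-5f epsilon is the same constant used by the vectorized blend() helper and by the scalar tail loop):

    // Scalar reference for cv::blendLinear:
    //   dst = (src1*w1 + src2*w2) / (w1 + w2 + eps)
    // The epsilon keeps the quotient finite where both weights are zero.
    static inline float blendLinearScalar(float src1, float src2, float w1, float w2)
    {
        const float eps = 1e-5f;
        return (src1 * w1 + src2 * w2) / (w1 + w2 + eps);
    }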
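
Note on the ml/src/svm.cpp hunks above: the sigmoid kernel is K(x, y) = tanh(gamma*<x,y> + coef0). calc_non_rbf_base is now called with +2*gamma and +2*coef0, so each intermediate value t equals 2*(gamma*<x,y> + coef0), and the corrected branches evaluate tanh(t/2) from e = exp(|t|); the previous code negated the arguments and used the branches the other way around, which flipped the sign of the kernel. That fix is presumably also why the coef0 >= 0 check is dropped and why the new trainauto_sigmoid test deliberately trains with a negative coef0. A small sketch of the identity being relied on (the function name is illustrative, not part of OpenCV):

    #include <cmath>

    // tanh(t/2) evaluated the same way as the patched calc_sigmoid loop:
    //   t > 0 :  (e - 1)/(e + 1)   with e = exp(|t|)
    //   t <= 0:  (1 - e)/(1 + e)
    static double tanh_half(double t)
    {
        const double e = std::exp(std::abs(t));
        return t > 0 ? (e - 1.) / (e + 1.) : (1. - e) / (1. + e);
    }
    // With t = 2*(gamma*dot + coef0), tanh_half(t) == tanh(gamma*dot + coef0),
    // i.e. the value the sigmoid kernel should return.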