From 53f6198f27d54878336b42eece054167d8bcab4b Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Fri, 7 Dec 2018 12:40:34 +0300 Subject: [PATCH 1/7] Minor fixes in IE backend tests --- modules/dnn/src/op_inf_engine.cpp | 6 +- modules/dnn/src/op_inf_engine.hpp | 3 + modules/dnn/test/test_caffe_importer.cpp | 1 + modules/dnn/test/test_darknet_importer.cpp | 2 +- modules/dnn/test/test_halide_layers.cpp | 2 +- modules/dnn/test/test_layers.cpp | 94 ++++++++++++++++------ modules/dnn/test/test_torch_importer.cpp | 4 +- 7 files changed, 83 insertions(+), 29 deletions(-) diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index de8447d2be..bbe712c133 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -152,6 +152,7 @@ InfEngineBackendNet::InfEngineBackendNet() { targetDevice = InferenceEngine::TargetDevice::eCPU; precision = InferenceEngine::Precision::FP32; + hasNetOwner = false; } InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) @@ -162,6 +163,7 @@ InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) outputs = net.getOutputsInfo(); layers.resize(net.layerCount()); // A hack to execute InfEngineBackendNet::layerCount correctly. netOwner = net; + hasNetOwner = true; } void InfEngineBackendNet::Release() noexcept @@ -178,12 +180,12 @@ void InfEngineBackendNet::setPrecision(InferenceEngine::Precision p) noexcept InferenceEngine::Precision InfEngineBackendNet::getPrecision() noexcept { - return precision; + return hasNetOwner ? netOwner.getPrecision() : precision; } InferenceEngine::Precision InfEngineBackendNet::getPrecision() const noexcept { - return precision; + return hasNetOwner ? netOwner.getPrecision() : precision; } // Assume that outputs of network is unconnected blobs. diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index ffaeb288b2..e5542b3296 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -134,6 +134,9 @@ private: InferenceEngine::InferRequest infRequest; // In case of models from Model Optimizer we need to manage their lifetime. InferenceEngine::CNNNetwork netOwner; + // There is no way to check if netOwner is initialized or not so we use + // a separate flag to determine if the model has been loaded from IR. + bool hasNetOwner; std::string name; diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 2ed07b301f..a914101d51 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -471,6 +471,7 @@ TEST(Test_Caffe, shared_weights) net.setInput(blob_1, "input_1"); net.setInput(blob_2, "input_2"); + net.setPreferableBackend(DNN_BACKEND_OPENCV); Mat sum = net.forward(); diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index d7c14f2714..5d41b4b916 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -306,7 +306,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc) // batch size 1 testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_MYRIAD) #endif // batch size 2 diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index ea5eafd71b..468953fe7e 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -166,7 +166,7 @@ TEST_P(Deconvolution, Accuracy) if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU && dilation.width == 2 && dilation.height == 2) throw SkipTestException(""); -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_CPU && hasBias && group != 1) throw SkipTestException("Test is disabled for OpenVINO 2018R4"); diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 1d41daa025..62e625f03c 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -137,7 +137,7 @@ TEST_P(Test_Caffe_layers, Convolution) TEST_P(Test_Caffe_layers, DeConvolution) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU) throw SkipTestException("Test is disabled for OpenVINO 2018R4"); #endif @@ -918,8 +918,11 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_DWconv_Prelu, Combine(Values(3, 6), Val // Using Intel's Model Optimizer generate .xml and .bin files: // ./ModelOptimizer -w /path/to/caffemodel -d /path/to/prototxt \ // -p FP32 -i -b ${batch_size} -o /path/to/output/folder -TEST(Layer_Test_Convolution_DLDT, Accuracy) +typedef testing::TestWithParam Layer_Test_Convolution_DLDT; +TEST_P(Layer_Test_Convolution_DLDT, Accuracy) { + Target targetId = GetParam(); + Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt")); Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); @@ -930,17 +933,29 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy) Mat outDefault = netDefault.forward(); net.setInput(inp); - Mat out = net.forward(); + net.setPreferableTarget(targetId); - normAssert(outDefault, out); + if (targetId != DNN_TARGET_MYRIAD) + { + Mat out = net.forward(); - std::vector outLayers = net.getUnconnectedOutLayers(); - ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge"); - ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat"); + normAssert(outDefault, out); + + std::vector outLayers = net.getUnconnectedOutLayers(); + ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge"); + ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat"); + } + else + { + // An assertion is expected because the model is in FP32 format but + // Myriad plugin supports only FP16 models. + ASSERT_ANY_THROW(net.forward()); + } } -TEST(Layer_Test_Convolution_DLDT, setInput_uint8) +TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8) { + Target targetId = GetParam(); Mat inp = blobFromNPY(_tf("blob.npy")); Mat inputs[] = {Mat(inp.dims, inp.size, CV_8U), Mat()}; @@ -951,12 +966,25 @@ TEST(Layer_Test_Convolution_DLDT, setInput_uint8) for (int i = 0; i < 2; ++i) { Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + net.setPreferableTarget(targetId); net.setInput(inputs[i]); - outs[i] = net.forward(); - ASSERT_EQ(outs[i].type(), CV_32F); + if (targetId != DNN_TARGET_MYRIAD) + { + outs[i] = net.forward(); + ASSERT_EQ(outs[i].type(), CV_32F); + } + else + { + // An assertion is expected because the model is in FP32 format but + // Myriad plugin supports only FP16 models. + ASSERT_ANY_THROW(net.forward()); + } } - normAssert(outs[0], outs[1]); + if (targetId != DNN_TARGET_MYRIAD) + normAssert(outs[0], outs[1]); } +INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT, + testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))); // 1. Create a .prototxt file with the following network: // layer { @@ -980,14 +1008,17 @@ TEST(Layer_Test_Convolution_DLDT, setInput_uint8) // net.save('/path/to/caffemodel') // // 3. Convert using ModelOptimizer. -typedef testing::TestWithParam > Test_DLDT_two_inputs; +typedef testing::TestWithParam > Test_DLDT_two_inputs; TEST_P(Test_DLDT_two_inputs, as_IR) { int firstInpType = get<0>(GetParam()); int secondInpType = get<1>(GetParam()); - // TODO: It looks like a bug in Inference Engine. + Target targetId = get<2>(GetParam()); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE < 2018040000 if (secondInpType == CV_8U) - throw SkipTestException(""); + throw SkipTestException("Test is enabled starts from OpenVINO 2018R4"); +#endif Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); int inpSize[] = {1, 2, 3}; @@ -998,11 +1029,21 @@ TEST_P(Test_DLDT_two_inputs, as_IR) net.setInput(firstInp, "data"); net.setInput(secondInp, "second_input"); - Mat out = net.forward(); + net.setPreferableTarget(targetId); + if (targetId != DNN_TARGET_MYRIAD) + { + Mat out = net.forward(); - Mat ref; - cv::add(firstInp, secondInp, ref, Mat(), CV_32F); - normAssert(out, ref); + Mat ref; + cv::add(firstInp, secondInp, ref, Mat(), CV_32F); + normAssert(out, ref); + } + else + { + // An assertion is expected because the model is in FP32 format but + // Myriad plugin supports only FP16 models. + ASSERT_ANY_THROW(net.forward()); + } } TEST_P(Test_DLDT_two_inputs, as_backend) @@ -1010,6 +1051,8 @@ TEST_P(Test_DLDT_two_inputs, as_backend) static const float kScale = 0.5f; static const float kScaleInv = 1.0f / kScale; + Target targetId = get<2>(GetParam()); + Net net; LayerParams lp; lp.type = "Eltwise"; @@ -1018,9 +1061,9 @@ TEST_P(Test_DLDT_two_inputs, as_backend) int eltwiseId = net.addLayerToPrev(lp.name, lp.type, lp); // connect to a first input net.connect(0, 1, eltwiseId, 1); // connect to a second input - int inpSize[] = {1, 2, 3}; - Mat firstInp(3, &inpSize[0], get<0>(GetParam())); - Mat secondInp(3, &inpSize[0], get<1>(GetParam())); + int inpSize[] = {1, 2, 3, 4}; + Mat firstInp(4, &inpSize[0], get<0>(GetParam())); + Mat secondInp(4, &inpSize[0], get<1>(GetParam())); randu(firstInp, 0, 255); randu(secondInp, 0, 255); @@ -1028,15 +1071,20 @@ TEST_P(Test_DLDT_two_inputs, as_backend) net.setInput(firstInp, "data", kScale); net.setInput(secondInp, "second_input", kScaleInv); net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE); + net.setPreferableTarget(targetId); Mat out = net.forward(); Mat ref; addWeighted(firstInp, kScale, secondInp, kScaleInv, 0, ref, CV_32F); - normAssert(out, ref); + // Output values are in range [0, 637.5]. + double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 0.06 : 1e-6; + double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 0.3 : 1e-5; + normAssert(out, ref, "", l1, lInf); } INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs, Combine( - Values(CV_8U, CV_32F), Values(CV_8U, CV_32F) + Values(CV_8U, CV_32F), Values(CV_8U, CV_32F), + testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)) )); class UnsupportedLayer : public Layer diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index c640c90ed3..7fa0dc47ef 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -136,7 +136,7 @@ TEST_P(Test_Torch_layers, run_reshape_change_batch_size) TEST_P(Test_Torch_layers, run_reshape) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException("Test is disabled for OpenVINO 2018R4"); #endif @@ -172,7 +172,7 @@ TEST_P(Test_Torch_layers, run_depth_concat) TEST_P(Test_Torch_layers, run_deconv) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018040000 +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018040000 if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException("Test is disabled for OpenVINO 2018R4"); #endif From 857fba0878f91388637f849839bd420fe87234ef Mon Sep 17 00:00:00 2001 From: Thad House Date: Tue, 11 Dec 2018 17:06:01 -0800 Subject: [PATCH 2/7] Remove MinCore_Downlevel, replace with Shlwapi On windows 7, MinCore_Downlevel does not work correctly. However, the only API used was QISearch, which can be found in Shlwapi. Closes #12010 --- modules/videoio/src/cap_msmf.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp index 9fa84a156c..3eca95bfa6 100644 --- a/modules/videoio/src/cap_msmf.cpp +++ b/modules/videoio/src/cap_msmf.cpp @@ -99,9 +99,7 @@ static void init_MFCreateDXGIDeviceManager() pMFCreateDXGIDeviceManager_initialized = true; } #endif -#if (WINVER >= 0x0602) // Available since Win 8 -#pragma comment(lib, "MinCore_Downlevel") -#endif +#pragma comment(lib, "Shlwapi.lib") #endif #include From 6fa23f330fb0c6466804776227f9e9c6f3acc774 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 12 Dec 2018 13:35:43 +0300 Subject: [PATCH 3/7] cmake: fix compiler flags filtering --- cmake/OpenCVUtils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index 7596b5009a..17e691b838 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -508,7 +508,7 @@ macro(ocv_warnings_disable) foreach(var ${_flag_vars}) foreach(warning ${_gxx_warnings}) if(NOT warning MATCHES "^-Wno-") - string(REGEX REPLACE "${warning}(=[^ ]*)?" "" ${var} "${${var}}") + string(REGEX REPLACE "(^|[ ]+)${warning}(=[^ ]*)?([ ]+|$)" " " ${var} "${${var}}") string(REPLACE "-W" "-Wno-" warning "${warning}") endif() ocv_check_flag_support(${var} "${warning}" _varname "") From 00285a5e88dbbfeb2af990b002b3ef892a6cd6fd Mon Sep 17 00:00:00 2001 From: Adrian Kashivskyy Date: Wed, 12 Dec 2018 15:32:19 +0100 Subject: [PATCH 4/7] Merge pull request #13424 from akashivskyy:pr/ios-nonfree Add ability to build iOS and macOS frameworks with nonfree modules (#13424) * Allow building ios framework with nonfree * Allow building osx framework with nonfree --- platforms/ios/build_framework.py | 10 +++++++--- platforms/osx/build_framework.py | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py index 806f3462b2..f797d9b964 100755 --- a/platforms/ios/build_framework.py +++ b/platforms/ios/build_framework.py @@ -49,7 +49,7 @@ def getXCodeMajor(): raise Exception("Failed to parse Xcode version") class Builder: - def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, targets): + def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, enablenonfree, targets): self.opencv = os.path.abspath(opencv) self.contrib = None if contrib: @@ -61,6 +61,7 @@ class Builder: self.dynamic = dynamic self.bitcodedisabled = bitcodedisabled self.exclude = exclude + self.enablenonfree = enablenonfree self.targets = targets def getBD(self, parent, t): @@ -136,7 +137,9 @@ class Builder: "-DBUILD_SHARED_LIBS=ON", "-DCMAKE_MACOSX_BUNDLE=ON", "-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO", - ] if self.dynamic else []) + ] if self.dynamic else []) + ([ + "-DOPENCV_ENABLE_NONFREE=ON" + ] if self.enablenonfree else []) if len(self.exclude) > 0: args += ["-DBUILD_opencv_world=OFF"] if not self.dynamic else [] @@ -284,6 +287,7 @@ if __name__ == "__main__": parser.add_argument('--iphoneos_deployment_target', default=os.environ.get('IPHONEOS_DEPLOYMENT_TARGET', IPHONEOS_DEPLOYMENT_TARGET), help='specify IPHONEOS_DEPLOYMENT_TARGET') parser.add_argument('--iphoneos_archs', default='armv7,armv7s,arm64', help='select iPhoneOS target ARCHS') parser.add_argument('--iphonesimulator_archs', default='i386,x86_64', help='select iPhoneSimulator target ARCHS') + parser.add_argument('--enable_nonfree', default=False, dest='enablenonfree', action='store_true', help='enable non-free modules (disabled by default)') args = parser.parse_args() os.environ['IPHONEOS_DEPLOYMENT_TARGET'] = args.iphoneos_deployment_target @@ -293,7 +297,7 @@ if __name__ == "__main__": iphonesimulator_archs = args.iphonesimulator_archs.split(',') print('Using iPhoneSimulator ARCHS=' + str(iphonesimulator_archs)) - b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, + b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, args.enablenonfree, [ (iphoneos_archs, "iPhoneOS"), ] if os.environ.get('BUILD_PRECOMMIT', None) else diff --git a/platforms/osx/build_framework.py b/platforms/osx/build_framework.py index 2db5cd2564..2425fa158a 100644 --- a/platforms/osx/build_framework.py +++ b/platforms/osx/build_framework.py @@ -38,9 +38,10 @@ if __name__ == "__main__": parser.add_argument('--opencv', metavar='DIR', default=folder, help='folder with opencv repository (default is "../.." relative to script location)') parser.add_argument('--contrib', metavar='DIR', default=None, help='folder with opencv_contrib repository (default is "None" - build only main framework)') parser.add_argument('--without', metavar='MODULE', default=[], action='append', help='OpenCV modules to exclude from the framework') + parser.add_argument('--enable_nonfree', default=False, dest='enablenonfree', action='store_true', help='enable non-free modules (disabled by default)') args = parser.parse_args() - b = OSXBuilder(args.opencv, args.contrib, False, False, args.without, + b = OSXBuilder(args.opencv, args.contrib, False, False, args.without, args.enablenonfree, [ (["x86_64"], "MacOSX") ]) From e71758cfdf217590c613c371aa03c5b600e926ce Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 12 Dec 2018 17:36:17 +0300 Subject: [PATCH 5/7] Operate with shapes in ONNX models --- modules/dnn/src/onnx/onnx_importer.cpp | 126 +++++++++++++++++++++++- modules/dnn/test/test_onnx_importer.cpp | 4 + 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 22eda5046c..18e26f1b86 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -6,6 +6,7 @@ // Third party copyrights are property of their respective owners. #include "../precomp.hpp" +#include #ifdef HAVE_PROTOBUF @@ -134,9 +135,38 @@ Mat getMatFromTensor(opencv_onnx::TensorProto& tensor_proto) else CV_Error(Error::StsUnsupportedFormat, "Unsupported data type: " + opencv_onnx::TensorProto_DataType_Name(datatype)); + if (tensor_proto.dims_size() == 0) + blob.dims = 1; // To force 1-dimensional cv::Mat for scalars. return blob; } +void runLayer(Ptr layer, const std::vector& inputs, + std::vector& outputs) +{ + std::vector inpShapes(inputs.size()); + int ddepth = CV_32F; + for (size_t i = 0; i < inputs.size(); ++i) + { + inpShapes[i] = shape(inputs[i]); + if (i > 0 && ddepth != inputs[i].depth()) + CV_Error(Error::StsNotImplemented, "Mixed input data types."); + ddepth = inputs[i].depth(); + } + + std::vector outShapes, internalShapes; + layer->getMemoryShapes(inpShapes, 0, outShapes, internalShapes); + + std::vector internals(internalShapes.size()); + outputs.resize(outShapes.size()); + for (size_t i = 0; i < outShapes.size(); ++i) + outputs[i].create(outShapes[i], ddepth); + for (size_t i = 0; i < internalShapes.size(); ++i) + internals[i].create(internalShapes[i], ddepth); + + layer->finalize(inputs, outputs); + layer->forward(inputs, outputs, internals); +} + std::map ONNXImporter::getGraphTensors( const opencv_onnx::GraphProto& graph_proto) { @@ -292,6 +322,26 @@ void ONNXImporter::populateNet(Net dstNet) CV_Assert(model_proto.has_graph()); opencv_onnx::GraphProto graph_proto = model_proto.graph(); std::map constBlobs = getGraphTensors(graph_proto); + // List of internal blobs shapes. + std::map outShapes; + // Add all the inputs shapes. It includes as constant blobs as network's inputs shapes. + for (int i = 0; i < graph_proto.input_size(); ++i) + { + opencv_onnx::ValueInfoProto valueInfoProto = graph_proto.input(i); + CV_Assert(valueInfoProto.has_type()); + opencv_onnx::TypeProto typeProto = valueInfoProto.type(); + CV_Assert(typeProto.has_tensor_type()); + opencv_onnx::TypeProto::Tensor tensor = typeProto.tensor_type(); + CV_Assert(tensor.has_shape()); + opencv_onnx::TensorShapeProto tensorShape = tensor.shape(); + + MatShape inpShape(tensorShape.dim_size()); + for (int j = 0; j < inpShape.size(); ++j) + { + inpShape[j] = tensorShape.dim(j).dim_value(); + } + outShapes[valueInfoProto.name()] = inpShape; + } std::string framework_name; if (model_proto.has_producer_name()) { @@ -301,6 +351,7 @@ void ONNXImporter::populateNet(Net dstNet) // create map with network inputs (without const blobs) std::map layer_id; std::map::iterator layerId; + std::map::iterator shapeIt; // fill map: push layer name, layer id and output id std::vector netInputs; for (int j = 0; j < graph_proto.input_size(); j++) @@ -317,9 +368,9 @@ void ONNXImporter::populateNet(Net dstNet) LayerParams layerParams; opencv_onnx::NodeProto node_proto; - for(int i = 0; i < layersSize; i++) + for(int li = 0; li < layersSize; li++) { - node_proto = graph_proto.node(i); + node_proto = graph_proto.node(li); layerParams = getLayerParams(node_proto); CV_Assert(node_proto.output_size() >= 1); layerParams.name = node_proto.output(0); @@ -598,6 +649,65 @@ void ONNXImporter::populateNet(Net dstNet) { layerParams.type = "Padding"; } + else if (layer_type == "Shape") + { + CV_Assert(node_proto.input_size() == 1); + shapeIt = outShapes.find(node_proto.input(0)); + CV_Assert(shapeIt != outShapes.end()); + MatShape inpShape = shapeIt->second; + + Mat shapeMat(inpShape.size(), 1, CV_32S); + for (int j = 0; j < inpShape.size(); ++j) + shapeMat.at(j) = inpShape[j]; + shapeMat.dims = 1; + + constBlobs.insert(std::make_pair(layerParams.name, shapeMat)); + continue; + } + else if (layer_type == "Gather") + { + CV_Assert(node_proto.input_size() == 2); + CV_Assert(layerParams.has("axis")); + Mat input = getBlob(node_proto, constBlobs, 0); + Mat indexMat = getBlob(node_proto, constBlobs, 1); + CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1); + int index = indexMat.at(0); + int axis = layerParams.get("axis"); + + std::vector ranges(input.dims, Range::all()); + ranges[axis] = Range(index, index + 1); + + Mat out = input(ranges); + constBlobs.insert(std::make_pair(layerParams.name, out)); + continue; + } + else if (layer_type == "Concat") + { + bool hasVariableInps = false; + for (int i = 0; i < node_proto.input_size(); ++i) + { + if (layer_id.find(node_proto.input(i)) != layer_id.end()) + { + hasVariableInps = true; + break; + } + } + + if (!hasVariableInps) + { + std::vector inputs(node_proto.input_size()), concatenated; + for (size_t i = 0; i < inputs.size(); ++i) + { + inputs[i] = getBlob(node_proto, constBlobs, i); + } + Ptr concat = ConcatLayer::create(layerParams); + runLayer(concat, inputs, concatenated); + + CV_Assert(concatenated.size() == 1); + constBlobs.insert(std::make_pair(layerParams.name, concatenated[0])); + continue; + } + } else { for (int j = 0; j < node_proto.input_size(); j++) { @@ -609,12 +719,24 @@ void ONNXImporter::populateNet(Net dstNet) int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams); layer_id.insert(std::make_pair(layerParams.name, LayerInfo(id, 0))); + + std::vector layerInpShapes, layerOutShapes, layerInternalShapes; for (int j = 0; j < node_proto.input_size(); j++) { layerId = layer_id.find(node_proto.input(j)); if (layerId != layer_id.end()) { dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, j); + // Collect input shapes. + shapeIt = outShapes.find(node_proto.input(j)); + CV_Assert(shapeIt != outShapes.end()); + layerInpShapes.push_back(shapeIt->second); } } + + // Compute shape of output blob for this layer. + Ptr layer = dstNet.getLayer(id); + layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes); + CV_Assert(!layerOutShapes.empty()); + outShapes[layerParams.name] = layerOutShapes[0]; } } diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 61b06cc7cf..36e7450892 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -162,6 +162,10 @@ TEST_P(Test_ONNX_layers, MultyInputs) normAssert(ref, out, "", default_l1, default_lInf); } +TEST_P(Test_ONNX_layers, DynamicReshape) +{ + testONNXModels("dynamic_reshape"); +} INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets()); From 3e710d8eeca79a787bf607af3b7419b5267843f7 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Thu, 13 Dec 2018 13:35:19 +0900 Subject: [PATCH 6/7] use correct CC value for Jetson Xavier --- cmake/OpenCVDetectCUDA.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index eb7e79a800..2d94a2c86b 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -107,7 +107,7 @@ if(CUDA_FOUND) ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT _nvcc_res EQUAL 0) message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.") - set(__cuda_arch_bin "5.3 6.2 7.0 7.5") + set(__cuda_arch_bin "5.3 6.2 7.2") else() set(__cuda_arch_bin "${_nvcc_out}") string(REPLACE "2.1" "2.1(2.0)" __cuda_arch_bin "${__cuda_arch_bin}") From 3903174f7c9475b51c4bc81063f744eb405a7d90 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 13 Dec 2018 14:20:22 +0300 Subject: [PATCH 7/7] Merge pull request #13334 from terfendail:histogram_wintr * added performance test for compareHist * compareHist reworked to use wide universal intrinsics * Disabled vectorization for CV_COMP_CORREL and CV_COMP_BHATTACHARYYA if f64 is unsupported --- .../include/opencv2/core/hal/intrin_avx.hpp | 6 + .../include/opencv2/core/hal/intrin_neon.hpp | 7 + .../include/opencv2/core/hal/intrin_sse.hpp | 7 + .../include/opencv2/core/hal/intrin_vsx.hpp | 5 + modules/imgproc/perf/perf_histogram.cpp | 25 +++ modules/imgproc/src/histogram.cpp | 199 +++++++++--------- 6 files changed, 155 insertions(+), 94 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index af4efa238c..19de221005 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1125,6 +1125,12 @@ inline float v_reduce_sum(const v_float32x8& a) return _mm_cvtss_f32(s1); } +inline double v_reduce_sum(const v_float64x4& a) +{ + __m256d s0 = _mm256_hadd_pd(a.val, a.val); + return _mm_cvtsd_f64(_mm_add_pd(_v256_extract_low(s0), _v256_extract_high(s0))); +} + inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, const v_float32x8& c, const v_float32x8& d) { diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 1b35896009..608dc97a87 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -984,6 +984,13 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32) OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32) +#if CV_SIMD128_64F +inline double v_reduce_sum(const v_float64x2& a) +{ + return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1); +} +#endif + inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c, const v_float32x4& d) { diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 24a34a3921..f7a67da1a5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -1456,6 +1456,13 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) +inline double v_reduce_sum(const v_float64x2& a) +{ + double CV_DECL_ALIGNED(32) idx[2]; + v_store_aligned(idx, a); + return idx[0] + idx[1]; +} + inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c, const v_float32x4& d) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index efea72c281..9506adfe7e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -716,6 +716,11 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add) OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max) OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min) +inline double v_reduce_sum(const v_float64x2& a) +{ + return vec_extract(vec_add(a.val, vec_sld(a.val, a.val, 8)), 0); +} + #define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \ inline scalartype v_reduce_##suffix(const _Tpvec& a) \ { \ diff --git a/modules/imgproc/perf/perf_histogram.cpp b/modules/imgproc/perf/perf_histogram.cpp index eca97e3c54..4f54e948bb 100644 --- a/modules/imgproc/perf/perf_histogram.cpp +++ b/modules/imgproc/perf/perf_histogram.cpp @@ -116,6 +116,31 @@ PERF_TEST_P(MatSize, equalizeHist, } #undef MatSize +typedef TestBaseWithParam< tuple > Dim_Cmpmethod; +PERF_TEST_P(Dim_Cmpmethod, compareHist, + testing::Combine(testing::Values(1, 3), + testing::Values(HISTCMP_CORREL, HISTCMP_CHISQR, HISTCMP_INTERSECT, HISTCMP_BHATTACHARYYA, HISTCMP_CHISQR_ALT, HISTCMP_KL_DIV)) + ) +{ + int dims = get<0>(GetParam()); + int method = get<1>(GetParam()); + int histSize[] = { 2048, 128, 64 }; + + Mat hist1(dims, histSize, CV_32FC1); + Mat hist2(dims, histSize, CV_32FC1); + randu(hist1, 0, 256); + randu(hist2, 0, 256); + + declare.in(hist1.reshape(1, 256), hist2.reshape(1, 256)); + + TEST_CYCLE() + { + compareHist(hist1, hist2, method); + } + + SANITY_CHECK_NOTHING(); +} + typedef tuple Sz_ClipLimit_t; typedef TestBaseWithParam Sz_ClipLimit; diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 60cb3630ce..a53a45eb25 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -41,6 +41,7 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" @@ -1938,10 +1939,6 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) CV_Assert( it.planes[0].isContinuous() && it.planes[1].isContinuous() ); -#if CV_SSE2 - bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2); -#endif - for( size_t i = 0; i < it.nplanes; i++, ++it ) { const float* h1 = it.planes[0].ptr(); @@ -1961,50 +1958,63 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_CORREL ) { - #if CV_SSE2 - if (haveSIMD) +#if CV_SIMD_64F + v_float64 v_s1 = vx_setzero_f64(); + v_float64 v_s2 = vx_setzero_f64(); + v_float64 v_s11 = vx_setzero_f64(); + v_float64 v_s12 = vx_setzero_f64(); + v_float64 v_s22 = vx_setzero_f64(); + for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) { - __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1; - __m128d v_s11 = v_s1, v_s22 = v_s1, v_s12 = v_s1; + v_float32 v_a = vx_load(h1 + j); + v_float32 v_b = vx_load(h2 + j); - for ( ; j <= len - 4; j += 4) - { - __m128 v_a = _mm_loadu_ps(h1 + j); - __m128 v_b = _mm_loadu_ps(h2 + j); + // 0-1 + v_float64 v_ad = v_cvt_f64(v_a); + v_float64 v_bd = v_cvt_f64(v_b); + v_s12 = v_muladd(v_ad, v_bd, v_s12); + v_s11 = v_muladd(v_ad, v_ad, v_s11); + v_s22 = v_muladd(v_bd, v_bd, v_s22); + v_s1 += v_ad; + v_s2 += v_bd; - // 0-1 - __m128d v_ad = _mm_cvtps_pd(v_a); - __m128d v_bd = _mm_cvtps_pd(v_b); - v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); - v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); - v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); - v_s1 = _mm_add_pd(v_s1, v_ad); - v_s2 = _mm_add_pd(v_s2, v_bd); - - // 2-3 - v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); - v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); - v_s12 = _mm_add_pd(v_s12, _mm_mul_pd(v_ad, v_bd)); - v_s11 = _mm_add_pd(v_s11, _mm_mul_pd(v_ad, v_ad)); - v_s22 = _mm_add_pd(v_s22, _mm_mul_pd(v_bd, v_bd)); - v_s1 = _mm_add_pd(v_s1, v_ad); - v_s2 = _mm_add_pd(v_s2, v_bd); - } - - double CV_DECL_ALIGNED(16) ar[10]; - _mm_store_pd(ar, v_s12); - _mm_store_pd(ar + 2, v_s11); - _mm_store_pd(ar + 4, v_s22); - _mm_store_pd(ar + 6, v_s1); - _mm_store_pd(ar + 8, v_s2); - - s12 += ar[0] + ar[1]; - s11 += ar[2] + ar[3]; - s22 += ar[4] + ar[5]; - s1 += ar[6] + ar[7]; - s2 += ar[8] + ar[9]; + // 2-3 + v_ad = v_cvt_f64_high(v_a); + v_bd = v_cvt_f64_high(v_b); + v_s12 = v_muladd(v_ad, v_bd, v_s12); + v_s11 = v_muladd(v_ad, v_ad, v_s11); + v_s22 = v_muladd(v_bd, v_bd, v_s22); + v_s1 += v_ad; + v_s2 += v_bd; } - #endif + s12 += v_reduce_sum(v_s12); + s11 += v_reduce_sum(v_s11); + s22 += v_reduce_sum(v_s22); + s1 += v_reduce_sum(v_s1); + s2 += v_reduce_sum(v_s2); +#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_CORREL if f64 is unsupported due to low precision + v_float32 v_s1 = vx_setzero_f32(); + v_float32 v_s2 = vx_setzero_f32(); + v_float32 v_s11 = vx_setzero_f32(); + v_float32 v_s12 = vx_setzero_f32(); + v_float32 v_s22 = vx_setzero_f32(); + for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + { + v_float32 v_a = vx_load(h1 + j); + v_float32 v_b = vx_load(h2 + j); + + v_s12 = v_muladd(v_a, v_b, v_s12); + v_s11 = v_muladd(v_a, v_a, v_s11); + v_s22 = v_muladd(v_b, v_b, v_s22); + v_s1 += v_a; + v_s2 += v_b; + } + s12 += v_reduce_sum(v_s12); + s11 += v_reduce_sum(v_s11); + s22 += v_reduce_sum(v_s22); + s1 += v_reduce_sum(v_s1); + s2 += v_reduce_sum(v_s2); +#endif for( ; j < len; j++ ) { double a = h1[j]; @@ -2019,67 +2029,68 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) } else if( method == CV_COMP_INTERSECT ) { - #if CV_NEON - float32x4_t v_result = vdupq_n_f32(0.0f); - for( ; j <= len - 4; j += 4 ) - v_result = vaddq_f32(v_result, vminq_f32(vld1q_f32(h1 + j), vld1q_f32(h2 + j))); - float CV_DECL_ALIGNED(16) ar[4]; - vst1q_f32(ar, v_result); - result += ar[0] + ar[1] + ar[2] + ar[3]; - #elif CV_SSE2 - if (haveSIMD) +#if CV_SIMD_64F + v_float64 v_result = vx_setzero_f64(); + for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) { - __m128d v_result = _mm_setzero_pd(); - for ( ; j <= len - 4; j += 4) - { - __m128 v_src = _mm_min_ps(_mm_loadu_ps(h1 + j), - _mm_loadu_ps(h2 + j)); - v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); - v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)); - v_result = _mm_add_pd(v_result, _mm_cvtps_pd(v_src)); - } - - double CV_DECL_ALIGNED(16) ar[2]; - _mm_store_pd(ar, v_result); - result += ar[0] + ar[1]; + v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); + v_result += v_cvt_f64(v_src) + v_cvt_f64_high(v_src); } - #endif + result += v_reduce_sum(v_result); +#elif CV_SIMD + v_float32 v_result = vx_setzero_f32(); + for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + { + v_float32 v_src = v_min(vx_load(h1 + j), vx_load(h2 + j)); + v_result += v_src; + } + result += v_reduce_sum(v_result); +#endif for( ; j < len; j++ ) result += std::min(h1[j], h2[j]); } else if( method == CV_COMP_BHATTACHARYYA ) { - #if CV_SSE2 - if (haveSIMD) +#if CV_SIMD_64F + v_float64 v_s1 = vx_setzero_f64(); + v_float64 v_s2 = vx_setzero_f64(); + v_float64 v_result = vx_setzero_f64(); + for ( ; j <= len - v_float32::nlanes; j += v_float32::nlanes) { - __m128d v_s1 = _mm_setzero_pd(), v_s2 = v_s1, v_result = v_s1; - for ( ; j <= len - 4; j += 4) - { - __m128 v_a = _mm_loadu_ps(h1 + j); - __m128 v_b = _mm_loadu_ps(h2 + j); + v_float32 v_a = vx_load(h1 + j); + v_float32 v_b = vx_load(h2 + j); - __m128d v_ad = _mm_cvtps_pd(v_a); - __m128d v_bd = _mm_cvtps_pd(v_b); - v_s1 = _mm_add_pd(v_s1, v_ad); - v_s2 = _mm_add_pd(v_s2, v_bd); - v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); + v_float64 v_ad = v_cvt_f64(v_a); + v_float64 v_bd = v_cvt_f64(v_b); + v_s1 += v_ad; + v_s2 += v_bd; + v_result += v_sqrt(v_ad * v_bd); - v_ad = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_a), 8))); - v_bd = _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_b), 8))); - v_s1 = _mm_add_pd(v_s1, v_ad); - v_s2 = _mm_add_pd(v_s2, v_bd); - v_result = _mm_add_pd(v_result, _mm_sqrt_pd(_mm_mul_pd(v_ad, v_bd))); - } - - double CV_DECL_ALIGNED(16) ar[6]; - _mm_store_pd(ar, v_s1); - _mm_store_pd(ar + 2, v_s2); - _mm_store_pd(ar + 4, v_result); - s1 += ar[0] + ar[1]; - s2 += ar[2] + ar[3]; - result += ar[4] + ar[5]; + v_ad = v_cvt_f64_high(v_a); + v_bd = v_cvt_f64_high(v_b); + v_s1 += v_ad; + v_s2 += v_bd; + v_result += v_sqrt(v_ad * v_bd); } - #endif + s1 += v_reduce_sum(v_s1); + s2 += v_reduce_sum(v_s2); + result += v_reduce_sum(v_result); +#elif CV_SIMD && 0 //Disable vectorization for CV_COMP_BHATTACHARYYA if f64 is unsupported due to low precision + v_float32 v_s1 = vx_setzero_f32(); + v_float32 v_s2 = vx_setzero_f32(); + v_float32 v_result = vx_setzero_f32(); + for (; j <= len - v_float32::nlanes; j += v_float32::nlanes) + { + v_float32 v_a = vx_load(h1 + j); + v_float32 v_b = vx_load(h2 + j); + v_s1 += v_a; + v_s2 += v_b; + v_result += v_sqrt(v_a * v_b); + } + s1 += v_reduce_sum(v_s1); + s2 += v_reduce_sum(v_s2); + result += v_reduce_sum(v_result); +#endif for( ; j < len; j++ ) { double a = h1[j];