diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp index b535a00802..9dfb3ea876 100644 --- a/modules/calib3d/test/test_fisheye.cpp +++ b/modules/calib3d/test/test_fisheye.cpp @@ -101,7 +101,7 @@ TEST_F(fisheyeTest, projectPoints) EXPECT_MAT_NEAR(distorted0, distorted2, 1e-10); } -TEST_F(fisheyeTest, DISABLED_undistortImage) +TEST_F(fisheyeTest, undistortImage) { cv::Matx33d theK = this->K; cv::Mat theD = cv::Mat(this->D); diff --git a/modules/dnn/src/darknet/darknet_io.cpp b/modules/dnn/src/darknet/darknet_io.cpp index ff322bc188..405209c458 100644 --- a/modules/dnn/src/darknet/darknet_io.cpp +++ b/modules/dnn/src/darknet/darknet_io.cpp @@ -229,6 +229,10 @@ namespace cv { { activation_param.type = "Swish"; } + else if (type == "mish") + { + activation_param.type = "Mish"; + } else if (type == "logistic") { activation_param.type = "Sigmoid"; @@ -436,7 +440,7 @@ namespace cv { fused_layer_names.push_back(last_layer); } - void setYolo(int classes, const std::vector& mask, const std::vector& anchors, float thresh, float nms_threshold) + void setYolo(int classes, const std::vector& mask, const std::vector& anchors, float thresh, float nms_threshold, float scale_x_y) { cv::dnn::LayerParams region_param; region_param.name = "Region-name"; @@ -449,6 +453,7 @@ namespace cv { region_param.set("logistic", true); region_param.set("thresh", thresh); region_param.set("nms_threshold", nms_threshold); + region_param.set("scale_x_y", scale_x_y); std::vector usedAnchors(numAnchors * 2); for (int i = 0; i < numAnchors; ++i) @@ -786,6 +791,7 @@ namespace cv { int num_of_anchors = getParam(layer_params, "num", -1); float thresh = getParam(layer_params, "thresh", 0.2); float nms_threshold = getParam(layer_params, "nms_threshold", 0.4); + float scale_x_y = getParam(layer_params, "scale_x_y", 1.0); std::string anchors_values = getParam(layer_params, "anchors", std::string()); CV_Assert(!anchors_values.empty()); @@ -798,7 +804,7 @@ namespace cv { CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size()); setParams.setPermute(false); - setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold); + setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold, scale_x_y); } else { CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type); @@ -813,6 +819,10 @@ namespace cv { { setParams.setActivation("swish"); } + else if (activation == "mish") + { + setParams.setActivation("mish"); + } else if (activation == "logistic") { setParams.setActivation("logistic"); @@ -935,8 +945,8 @@ namespace cv { } std::string activation = getParam(layer_params, "activation", "linear"); - if(activation == "leaky" || activation == "swish" || activation == "logistic") - ++cv_layers_counter; // For ReLU, Swish, Sigmoid + if(activation == "leaky" || activation == "swish" || activation == "mish" || activation == "logistic") + ++cv_layers_counter; // For ReLU, Swish, Mish, Sigmoid if(!darknet_layers_counter) tensor_shape.resize(1); diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index 1e3efd06f4..ef30c25d7b 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -69,7 +69,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer { public: int coords, classes, anchors, classfix; - float thresh, nmsThreshold; + float thresh, nmsThreshold, scale_x_y; bool useSoftmax, useLogistic; #ifdef HAVE_OPENCL UMat blob_umat; @@ -88,6 +88,7 @@ public: useSoftmax = params.get("softmax", false); useLogistic = params.get("logistic", false); nmsThreshold = params.get("nms_threshold", 0.4); + scale_x_y = params.get("scale_x_y", 1.0); // Yolov4 CV_Assert(nmsThreshold >= 0.); CV_Assert(coords == 4); @@ -302,8 +303,10 @@ public: if (classfix == -1 && scale < .5) scale = 0; // if(t0 < 0.5) t0 = 0; int box_index = index_sample_offset + index * cell_size; - dstData[box_index + 0] = (x + logistic_activate(srcData[box_index + 0])) / cols; - dstData[box_index + 1] = (y + logistic_activate(srcData[box_index + 1])) / rows; + float x_tmp = (logistic_activate(srcData[box_index + 0]) - 0.5f) * scale_x_y + 0.5f; + float y_tmp = (logistic_activate(srcData[box_index + 1]) - 0.5f) * scale_x_y + 0.5f; + dstData[box_index + 0] = (x + x_tmp) / cols; + dstData[box_index + 1] = (y + y_tmp) / rows; dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm; dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm; @@ -471,6 +474,8 @@ public: auto shape_3d = std::make_shared(ngraph::element::i64, ngraph::Shape{boxes_shape.size()}, boxes_shape.data()); ngraph::Shape box_broad_shape{1, (size_t)anchors, (size_t)h, (size_t)w}; + auto scale_x_y_node = std::make_shared(ngraph::element::f32, ngraph::Shape{1}, &scale_x_y); + auto shift_node = std::make_shared(ngraph::element::f32, ngraph::Shape{1}, std::vector{0.5}); std::shared_ptr box_x; { @@ -478,6 +483,9 @@ public: auto upper_bounds = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, std::vector{1, cols}); box_x = std::make_shared(input2d, lower_bounds, upper_bounds, strides, std::vector{}, std::vector{}); box_x = std::make_shared(box_x); + box_x = std::make_shared(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY); + box_x = std::make_shared(box_x, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY); + box_x = std::make_shared(box_x, shift_node, ngraph::op::AutoBroadcastType::NUMPY); box_x = std::make_shared(box_x, shape_3d, true); std::vector x_indices(w * h * anchors); @@ -504,6 +512,9 @@ public: auto upper_bounds = std::make_shared(ngraph::element::i64, ngraph::Shape{2}, std::vector{2, cols}); box_y = std::make_shared(input2d, lower_bounds, upper_bounds, strides, std::vector{}, std::vector{}); box_y = std::make_shared(box_y); + box_y = std::make_shared(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY); + box_y = std::make_shared(box_y, scale_x_y_node, ngraph::op::AutoBroadcastType::NUMPY); + box_y = std::make_shared(box_y, shift_node, ngraph::op::AutoBroadcastType::NUMPY); box_y = std::make_shared(box_y, shape_3d, true); std::vector y_indices(h * anchors); diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 701e64f953..937526a60e 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -172,18 +172,19 @@ public: CV_Assert(inputs.size() == 1); const MatSize& inpShape = inputs[0].size; + finalSliceRanges = sliceRanges; if (sliceRanges.empty()) { // Divide input blob on equal parts by axis. int outAxisSize = inpShape[axis] / outputs.size(); - sliceRanges.resize(outputs.size(), - std::vector(axis + 1, Range::all())); + finalSliceRanges.resize(outputs.size(), + std::vector(axis + 1, Range::all())); int prevSlice = 0; for (int i = 0; i < outputs.size(); ++i) { - sliceRanges[i][axis].start = prevSlice; - sliceRanges[i][axis].end = sliceRanges[i][axis].start + outAxisSize; - prevSlice = sliceRanges[i][axis].end; + finalSliceRanges[i][axis].start = prevSlice; + finalSliceRanges[i][axis].end = finalSliceRanges[i][axis].start + outAxisSize; + prevSlice = finalSliceRanges[i][axis].end; } } else @@ -191,16 +192,16 @@ public: for (int i = 0; i < outputs.size(); ++i) { - CV_Assert(sliceRanges[i].size() <= inpShape.dims()); + CV_Assert(finalSliceRanges[i].size() <= inpShape.dims()); // Fill the rest of ranges. - for (int j = sliceRanges[i].size(); j < inpShape.dims(); ++j) + for (int j = finalSliceRanges[i].size(); j < inpShape.dims(); ++j) { - sliceRanges[i].push_back(Range::all()); + finalSliceRanges[i].push_back(Range::all()); } // Clamp. - for (int j = 0; j < sliceRanges[i].size(); ++j) + for (int j = 0; j < finalSliceRanges[i].size(); ++j) { - sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]); + finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]); } } } @@ -241,8 +242,8 @@ public: kernel.set(idx++, (int)(rows * cols)); kernel.set(idx++, (int)inpMat.size[3]); kernel.set(idx++, (int)cols); - kernel.set(idx++, (int)sliceRanges[i][2].start); - kernel.set(idx++, (int)sliceRanges[i][3].start); + kernel.set(idx++, (int)finalSliceRanges[i][2].start); + kernel.set(idx++, (int)finalSliceRanges[i][3].start); kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i])); bool ret = kernel.run(1, global, local, false); if (!ret) @@ -266,10 +267,10 @@ public: outputs_arr.getMatVector(outputs); const Mat& inpMat = inputs[0]; - CV_Assert(outputs.size() == sliceRanges.size()); + CV_Assert(outputs.size() == finalSliceRanges.size()); for (size_t i = 0; i < outputs.size(); i++) { - inpMat(sliceRanges[i]).copyTo(outputs[i]); + inpMat(finalSliceRanges[i]).copyTo(outputs[i]); } } @@ -278,11 +279,11 @@ public: #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1) virtual Ptr initInfEngine(const std::vector >& inputs) CV_OVERRIDE { - CV_Assert_N(sliceRanges.size() == 1, inputs.size() <= 2); + CV_Assert_N(finalSliceRanges.size() == 1, inputs.size() <= 2); std::vector axes, offsets, dims; int from, to, step; - int numDims = sliceRanges[0].size(); + int numDims = finalSliceRanges[0].size(); if (preferableTarget == DNN_TARGET_MYRIAD) { from = axis; @@ -298,8 +299,8 @@ public: for (int i = from; i != to; i += step) { axes.push_back(i); - offsets.push_back(sliceRanges[0][i].start); - dims.push_back(sliceRanges[0][i].size()); + offsets.push_back(finalSliceRanges[0][i].start); + dims.push_back(finalSliceRanges[0][i].size()); } InferenceEngine::Builder::Layer ieLayer(name); @@ -315,7 +316,7 @@ public: { std::vector outShape(numDims); for (int i = 0; i < numDims; ++i) - outShape[i] = sliceRanges[0][i].size(); + outShape[i] = finalSliceRanges[0][i].size(); ieLayer.getInputPorts()[1].setParameter("type", "weights"); @@ -338,13 +339,13 @@ public: { CV_Assert_N(nodes.size() <= 2); auto& ieInpNode = nodes[0].dynamicCast()->node; - CV_Assert(sliceRanges[0].size() == ieInpNode->get_shape().size()); + CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size()); std::vector offsets, dims; - for (int i = 0; i < sliceRanges[0].size(); ++i) + for (int i = 0; i < finalSliceRanges[0].size(); ++i) { - offsets.push_back(sliceRanges[0][i].start); - dims.push_back(sliceRanges[0][i].end); + offsets.push_back(finalSliceRanges[0][i].start); + dims.push_back(finalSliceRanges[0][i].end); } auto lower_bounds = std::make_shared(ngraph::element::i64, @@ -384,6 +385,10 @@ public: } #endif + +protected: + // The actual non-negative values determined from @p sliceRanges depends on input size. + std::vector > finalSliceRanges; }; class CropLayerImpl CV_FINAL : public SliceLayerImpl @@ -447,18 +452,18 @@ public: offset_final[i] = offset[i - start_axis]; } - sliceRanges.resize(1); - sliceRanges[0].resize(dims); + finalSliceRanges.resize(1); + finalSliceRanges[0].resize(dims); for (int i = 0; i < start_axis; i++) { - sliceRanges[0][i] = Range(0, inpBlob.size[i]); + finalSliceRanges[0][i] = Range(0, inpBlob.size[i]); } for (int i = start_axis; i < dims; i++) { if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size[i] > inpBlob.size[i]) CV_Error(Error::StsBadArg, "invalid crop parameters or blob sizes"); - sliceRanges[0][i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]); + finalSliceRanges[0][i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]); } } diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 7ac6d39360..653d0acbb8 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -549,6 +549,11 @@ TEST_P(Test_Darknet_layers, upsample) testDarknetLayer("upsample"); } +TEST_P(Test_Darknet_layers, mish) +{ + testDarknetLayer("mish", true); +} + TEST_P(Test_Darknet_layers, avgpool_softmax) { testDarknetLayer("avgpool_softmax"); diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 584e7bce36..f9aa63d56b 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -1791,4 +1791,38 @@ TEST_P(Layer_Test_Resize, change_input) INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Resize, dnnBackendsAndTargets()); +typedef testing::TestWithParam > Layer_Test_Slice; +TEST_P(Layer_Test_Slice, variable_input_shape) +{ + int backendId = get<0>(GetParam()); + int targetId = get<1>(GetParam()); + + int begin[] = {0, 0, 0, 0}; + int end[] = {-1, -1, -1, -1}; + + Net net; + LayerParams lp; + lp.type = "Slice"; + lp.name = "testLayer"; + lp.set("begin", DictValue::arrayInt(&begin[0], 4)); + lp.set("end", DictValue::arrayInt(&end[0], 4)); + net.addLayerToPrev(lp.name, lp.type, lp); + + for (int i = 0; i < 2; ++i) + { + Mat inp(4 + i, 5 + i, CV_8UC1); + randu(inp, 0, 255); + inp = blobFromImage(inp); + + net.setInput(inp); + net.setPreferableBackend(backendId); + net.setPreferableTarget(targetId); + Mat out = net.forward(); + + normAssert(out, inp); + } +} + +INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Slice, dnnBackendsAndTargets()); + }} // namespace diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index fadc19763a..928691b4b3 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -559,7 +559,7 @@ public: TEST_P(Test_ONNX_nets, Alexnet) { -#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) applyTestTag(CV_TEST_TAG_MEMORY_2GB); #else applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); @@ -623,7 +623,7 @@ TEST_P(Test_ONNX_nets, Googlenet) TEST_P(Test_ONNX_nets, CaffeNet) { -#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) applyTestTag(CV_TEST_TAG_MEMORY_2GB); #else applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); @@ -639,7 +639,7 @@ TEST_P(Test_ONNX_nets, CaffeNet) TEST_P(Test_ONNX_nets, RCNN_ILSVRC13) { -#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) applyTestTag(CV_TEST_TAG_MEMORY_2GB); #else applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index acceca5314..29307053f3 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -55,19 +55,19 @@ namespace { -static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) +static void calcScharrDeriv(const cv::Mat& src, cv::Mat& dst) { using namespace cv; using cv::detail::deriv_type; int rows = src.rows, cols = src.cols, cn = src.channels(), depth = src.depth(); CV_Assert(depth == CV_8U); dst.create(rows, cols, CV_MAKETYPE(DataType::depth, cn*2)); - parallel_for_(Range(0, rows), cv::detail::SharrDerivInvoker(src, dst), cv::getNumThreads()); + parallel_for_(Range(0, rows), cv::detail::ScharrDerivInvoker(src, dst), cv::getNumThreads()); } }//namespace -void cv::detail::SharrDerivInvoker::operator()(const Range& range) const +void cv::detail::ScharrDerivInvoker::operator()(const Range& range) const { using cv::detail::deriv_type; int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn; @@ -801,7 +801,7 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType); Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height)); - calcSharrDeriv(thisLevel, derivI); + calcScharrDeriv(thisLevel, derivI); if(derivBorder != BORDER_TRANSPARENT) copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED); @@ -1382,7 +1382,7 @@ void SparsePyrLKOpticalFlowImpl::calc( InputArray _prevImg, InputArray _nextImg, Mat _derivI( imgSize.height + winSize.height*2, imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() ); derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height)); - calcSharrDeriv(prevPyr[level * lvlStep1], derivI); + calcScharrDeriv(prevPyr[level * lvlStep1], derivI); copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED); } else diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp index 16b0da189e..62fab0564b 100644 --- a/modules/video/src/lkpyramid.hpp +++ b/modules/video/src/lkpyramid.hpp @@ -7,9 +7,9 @@ namespace detail typedef short deriv_type; - struct SharrDerivInvoker : ParallelLoopBody + struct ScharrDerivInvoker : ParallelLoopBody { - SharrDerivInvoker(const Mat& _src, const Mat& _dst) + ScharrDerivInvoker(const Mat& _src, const Mat& _dst) : src(_src), dst(_dst) { } diff --git a/samples/dnn/text_detection.cpp b/samples/dnn/text_detection.cpp index e7b0f237d3..706e2fe58b 100644 --- a/samples/dnn/text_detection.cpp +++ b/samples/dnn/text_detection.cpp @@ -1,3 +1,20 @@ +/* + Text detection model: https://github.com/argman/EAST + Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1 + + Text recognition model taken from here: https://github.com/meijieru/crnn.pytorch + How to convert from pb to onnx: + Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py + + import torch + import models.crnn as crnn + + model = CRNN(32, 1, 37, 256) + model.load_state_dict(torch.load('crnn.pth')) + dummy_input = torch.randn(1, 1, 32, 100) + torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True) +*/ + #include #include #include @@ -8,21 +25,26 @@ using namespace cv::dnn; const char* keys = "{ help h | | Print help message. }" "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}" - "{ model m | | Path to a binary .pb file contains trained network.}" + "{ model m | | Path to a binary .pb file contains trained detector network.}" + "{ ocr | | Path to a binary .pb or .onnx file contains trained recognition network.}" "{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }" "{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }" "{ thr | 0.5 | Confidence threshold. }" "{ nms | 0.4 | Non-maximum suppression threshold. }"; -void decode(const Mat& scores, const Mat& geometry, float scoreThresh, - std::vector& detections, std::vector& confidences); +void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, std::vector& confidences); + +void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result); + +void decodeText(const Mat& scores, std::string& text); int main(int argc, char** argv) { // Parse command line arguments. CommandLineParser parser(argc, argv, keys); parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of " - "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)"); + "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)"); if (argc == 1 || parser.has("help")) { parser.printMessage(); @@ -33,7 +55,8 @@ int main(int argc, char** argv) float nmsThreshold = parser.get("nms"); int inpWidth = parser.get("width"); int inpHeight = parser.get("height"); - String model = parser.get("model"); + String modelDecoder = parser.get("model"); + String modelRecognition = parser.get("ocr"); if (!parser.check()) { @@ -41,17 +64,19 @@ int main(int argc, char** argv) return 1; } - CV_Assert(!model.empty()); + CV_Assert(!modelDecoder.empty()); - // Load network. - Net net = readNet(model); + // Load networks. + Net detector = readNet(modelDecoder); + Net recognizer; + + if (!modelRecognition.empty()) + recognizer = readNet(modelRecognition); // Open a video file or an image file or a camera stream. VideoCapture cap; - if (parser.has("input")) - cap.open(parser.get("input")); - else - cap.open(0); + bool openSuccess = parser.has("input") ? cap.open(parser.get("input")) : cap.open(0); + CV_Assert(openSuccess); static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector"; namedWindow(kWinName, WINDOW_NORMAL); @@ -62,6 +87,7 @@ int main(int argc, char** argv) outNames[1] = "feature_fusion/concat_3"; Mat frame, blob; + TickMeter tickMeter; while (waitKey(1) < 0) { cap >> frame; @@ -72,8 +98,10 @@ int main(int argc, char** argv) } blobFromImage(frame, blob, 1.0, Size(inpWidth, inpHeight), Scalar(123.68, 116.78, 103.94), true, false); - net.setInput(blob); - net.forward(outs, outNames); + detector.setInput(blob); + tickMeter.start(); + detector.forward(outs, outNames); + tickMeter.stop(); Mat scores = outs[0]; Mat geometry = outs[1]; @@ -81,43 +109,64 @@ int main(int argc, char** argv) // Decode predicted bounding boxes. std::vector boxes; std::vector confidences; - decode(scores, geometry, confThreshold, boxes, confidences); + decodeBoundingBoxes(scores, geometry, confThreshold, boxes, confidences); // Apply non-maximum suppression procedure. std::vector indices; NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); - // Render detections. Point2f ratio((float)frame.cols / inpWidth, (float)frame.rows / inpHeight); + + // Render text. for (size_t i = 0; i < indices.size(); ++i) { RotatedRect& box = boxes[indices[i]]; Point2f vertices[4]; box.points(vertices); + for (int j = 0; j < 4; ++j) { vertices[j].x *= ratio.x; vertices[j].y *= ratio.y; } + + if (!modelRecognition.empty()) + { + Mat cropped; + fourPointsTransform(frame, vertices, cropped); + + cvtColor(cropped, cropped, cv::COLOR_BGR2GRAY); + + Mat blobCrop = blobFromImage(cropped, 1.0/127.5, Size(), Scalar::all(127.5)); + recognizer.setInput(blobCrop); + + tickMeter.start(); + Mat result = recognizer.forward(); + tickMeter.stop(); + + std::string wordRecognized = ""; + decodeText(result, wordRecognized); + putText(frame, wordRecognized, vertices[1], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255)); + } + for (int j = 0; j < 4; ++j) line(frame, vertices[j], vertices[(j + 1) % 4], Scalar(0, 255, 0), 1); } // Put efficiency information. - std::vector layersTimes; - double freq = getTickFrequency() / 1000; - double t = net.getPerfProfile(layersTimes) / freq; - std::string label = format("Inference time: %.2f ms", t); + std::string label = format("Inference time: %.2f ms", tickMeter.getTimeMilli()); putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0)); imshow(kWinName, frame); + + tickMeter.reset(); } return 0; } -void decode(const Mat& scores, const Mat& geometry, float scoreThresh, - std::vector& detections, std::vector& confidences) +void decodeBoundingBoxes(const Mat& scores, const Mat& geometry, float scoreThresh, + std::vector& detections, std::vector& confidences) { detections.clear(); CV_Assert(scores.dims == 4); CV_Assert(geometry.dims == 4); CV_Assert(scores.size[0] == 1); @@ -159,3 +208,51 @@ void decode(const Mat& scores, const Mat& geometry, float scoreThresh, } } } + +void fourPointsTransform(const Mat& frame, Point2f vertices[4], Mat& result) +{ + const Size outputSize = Size(100, 32); + + Point2f targetVertices[4] = {Point(0, outputSize.height - 1), + Point(0, 0), Point(outputSize.width - 1, 0), + Point(outputSize.width - 1, outputSize.height - 1), + }; + Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices); + + warpPerspective(frame, result, rotationMatrix, outputSize); +} + +void decodeText(const Mat& scores, std::string& text) +{ + static const std::string alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"; + Mat scoresMat = scores.reshape(1, scores.size[0]); + + std::vector elements; + elements.reserve(scores.size[0]); + + for (int rowIndex = 0; rowIndex < scoresMat.rows; ++rowIndex) + { + Point p; + minMaxLoc(scoresMat.row(rowIndex), 0, 0, 0, &p); + if (p.x > 0 && static_cast(p.x) <= alphabet.size()) + { + elements.push_back(alphabet[p.x - 1]); + } + else + { + elements.push_back('-'); + } + } + + if (elements.size() > 0 && elements[0] != '-') + text += elements[0]; + + for (size_t elementIndex = 1; elementIndex < elements.size(); ++elementIndex) + { + if (elementIndex > 0 && elements[elementIndex] != '-' && + elements[elementIndex - 1] != elements[elementIndex]) + { + text += elements[elementIndex]; + } + } +} \ No newline at end of file