diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp
index 2c1ff04421..5564983534 100644
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -4614,11 +4614,15 @@ protected:
 class CV_EXPORTS ParallelLoopBody
 {
 public:
-    virtual void operator() (const Range& range) const = 0;
     virtual ~ParallelLoopBody();
+    virtual void operator() (const Range& range) const = 0;
 };
 
-CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);
+// Runs body over range, split into stripes that the parallel backend may execute
+// concurrently. nstripes < 0 (the default) requests one stripe per index of range;
+// any other value is clamped to [1, length of range] and rounded to a whole number
+// of stripes. An empty range is a no-op: body is not invoked.
+CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);
 
 /////////////////////////// Synchronization Primitives ///////////////////////////////
 
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 7f6cda99cc..85ac58b2ec 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -80,87 +80,128 @@
 
 namespace cv
 {
-    ParallelLoopBody::~ParallelLoopBody() { }
-
-#ifdef HAVE_TBB
-    class TbbProxyLoopBody
+    // Adapts a ParallelLoopBody defined over an arbitrary index range so that it can be
+    // driven by stripe indices [0, nstripes): each stripe sub-range is mapped back to the
+    // corresponding sub-range of the original range before the user body is invoked.
+    class ParallelLoopBodyWrapper
     {
     public:
-        TbbProxyLoopBody(const ParallelLoopBody& _body) :
-            body(&_body)
-        { }
+        ParallelLoopBodyWrapper(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        {
+            body = &_body;
+            wholeRange = _r;
+            double len = wholeRange.end - wholeRange.start;
+            // a negative request means "one stripe per index"; otherwise clamp to [1, len]
+            nstripes = cvRound(_nstripes < 0 ? len : MIN(MAX(_nstripes, 1.), len));
+        }
+        // Converts the stripe sub-range sr into a sub-range of wholeRange (boundaries are
+        // rounded to the nearest index; the last stripe always ends at wholeRange.end)
+        // and runs the wrapped body on it. Requires nstripes > 0.
+        void operator()(const Range& sr) const
+        {
+            Range r;
+            r.start = (int)(wholeRange.start +
+                            ((size_t)sr.start*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
+            r.end = sr.end >= nstripes ? wholeRange.end : (int)(wholeRange.start +
+                            ((size_t)sr.end*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
+            (*body)(r);
+        }
+        Range stripeRange() const { return Range(0, nstripes); }
+
+    protected:
+        const ParallelLoopBody* body;
+        Range wholeRange;
+        int nstripes;
+    };
+
+    ParallelLoopBody::~ParallelLoopBody() {}
+
+#if defined HAVE_TBB
+    class ProxyLoopBody : public ParallelLoopBodyWrapper
+    {
+    public:
+        ProxyLoopBody(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        : ParallelLoopBodyWrapper(_body, _r, _nstripes)
+        {}
 
         void operator ()(const tbb::blocked_range<int>& range) const
         {
-            body->operator()(Range(range.begin(), range.end()));
+            // qualified call: this overload hides the base-class operator()(const Range&)
+            this->ParallelLoopBodyWrapper::operator()(Range(range.begin(), range.end()));
         }
-
-    private:
-        const ParallelLoopBody* body;
     };
-#endif // end HAVE_TBB
+#elif defined HAVE_GCD
 
-#ifdef HAVE_GCD
+    typedef ParallelLoopBodyWrapper ProxyLoopBody;
     static
     void block_function(void* context, size_t index)
     {
-        ParallelLoopBody* ptr_body = static_cast<ParallelLoopBody*>(context);
-        ptr_body->operator()(Range(index, index + 1));
+        ProxyLoopBody* ptr_body = static_cast<ProxyLoopBody*>(context);
+        (*ptr_body)(Range(index, index + 1));
     }
-#endif // HAVE_GCD
-
-    void parallel_for_(const Range& range, const ParallelLoopBody& body)
+#elif defined HAVE_CONCURRENCY
+    class ProxyLoopBody : public ParallelLoopBodyWrapper
     {
-#ifdef HAVE_TBB
+    public:
+        ProxyLoopBody(const ParallelLoopBody& _body, const Range& _r, double _nstripes)
+        : ParallelLoopBodyWrapper(_body, _r, _nstripes)
+        {}
+
+        void operator ()(int i) const
+        {
+            // qualified call: this overload hides the base-class operator()(const Range&)
+            this->ParallelLoopBodyWrapper::operator()(Range(i, i + 1));
+        }
+    };
+#else
+    typedef ParallelLoopBodyWrapper ProxyLoopBody;
+#endif
 
-        tbb::parallel_for(tbb::blocked_range<int>(range.start, range.end), TbbProxyLoopBody(body));
+    void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes)
+    {
+        // An empty (or inverted) range has no work; returning here also guarantees
+        // that the wrapper below is never built with nstripes == 0 (division by zero).
+        if( range.end <= range.start )
+            return;
+
+        ProxyLoopBody pbody(body, range, nstripes);
+        Range stripeRange = pbody.stripeRange();
+
+#if defined HAVE_TBB
+
+        tbb::parallel_for(tbb::blocked_range<int>(stripeRange.start, stripeRange.end), pbody);
 
 #elif defined HAVE_CONCURRENCY
 
-        class ConcurrencyProxyLoopBody
-        {
-        public:
-            ConcurrencyProxyLoopBody(const ParallelLoopBody& body) : _body(body) {}
-
-            void operator ()(int i) const
-            {
-                _body(Range(i, i + 1));
-            }
-
-        private:
-            const ParallelLoopBody& _body;
-            ConcurrencyProxyLoopBody& operator=(const ConcurrencyProxyLoopBody&) {return *this;}
-        } proxy(body);
-
-        Concurrency::parallel_for(range.start, range.end, proxy);
+        Concurrency::parallel_for(stripeRange.start, stripeRange.end, pbody);
 
 #elif defined HAVE_OPENMP
 
 #pragma omp parallel for schedule(dynamic)
-        for (int i = range.start; i < range.end; ++i)
-            body(Range(i, i + 1));
+        for (int i = stripeRange.start; i < stripeRange.end; ++i)
+            pbody(Range(i, i + 1));
 
 #elif defined HAVE_GCD
 
         dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-        dispatch_apply_f(range.end - range.start, concurrent_queue, &const_cast<ParallelLoopBody&>(body), block_function);
+        dispatch_apply_f(stripeRange.end - stripeRange.start, concurrent_queue, &pbody, block_function);
 
 #elif defined HAVE_CSTRIPES
 
         parallel()
         {
-            int offset = range.start;
-            int len = range.end - offset;
+            int offset = stripeRange.start;
+            int len = stripeRange.end - offset;
             Range r(offset + CPX_RANGE_START(len), offset + CPX_RANGE_END(len));
-            body(r);
+            pbody(r);
             barrier();
         }
 
 #else
 
-        body(range);
+        pbody(stripeRange);
 
-#endif // end HAVE_TBB
+#endif
 
     }
 
 } // namespace cv
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index fd068d07f6..69be461a42 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -187,7 +187,7 @@ private:
 template <typename Cvt>
 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
 {
-    parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt));
+    parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
 }
 
 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
 
diff --git
a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index e3374f0878..6fb728ac1b 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -357,7 +357,7 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy ) Range range(0, dsize.height); resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); - parallel_for_(range, invoker); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); } @@ -1222,7 +1222,7 @@ static void resizeGeneric_( const Mat& src, Mat& dst, Range range(0, dsize.height); resizeGeneric_Invoker invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, ssize, dsize, ksize, xmin, xmax); - parallel_for_(range, invoker); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); } template @@ -1381,7 +1381,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int Range range(0, dst.rows); resizeAreaFast_Invoker invoker(src, dst, scale_x, scale_y, ofs, xofs); - parallel_for_(range, invoker); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); } struct DecimateAlpha @@ -2680,14 +2680,14 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, const Mat& _fxy, const void* _wtab, int borderType, const Scalar& _borderValue); -class remapInvoker : +class RemapInvoker : public ParallelLoopBody { public: - remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1, + RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1, const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue, int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) : - ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2), + ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2), interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue), planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab) { @@ -2697,9 +2697,9 @@ public: { int x, 
y, x1, y1; const int buf_size = 1 << 14; - int brows0 = std::min(128, dst.rows), map_depth = map1.depth(); - int bcols0 = std::min(buf_size/brows0, dst.cols); - brows0 = std::min(buf_size/bcols0, dst.rows); + int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); + int bcols0 = std::min(buf_size/brows0, dst->cols); + brows0 = std::min(buf_size/bcols0, dst->rows); #if CV_SSE2 bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif @@ -2710,17 +2710,17 @@ public: for( y = range.start; y < range.end; y += brows0 ) { - for( x = 0; x < dst.cols; x += bcols0 ) + for( x = 0; x < dst->cols; x += bcols0 ) { int brows = std::min(brows0, range.end - y); - int bcols = std::min(bcols0, dst.cols - x); - Mat dpart(dst, Rect(x, y, bcols, brows)); + int bcols = std::min(bcols0, dst->cols - x); + Mat dpart(*dst, Rect(x, y, bcols, brows)); Mat bufxy(_bufxy, Rect(0, 0, bcols, brows)); if( nnfunc ) { - if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format - bufxy = map1(Rect(x, y, bcols, brows)); + if( m1->type() == CV_16SC2 && !m2->data ) // the data is already in the right format + bufxy = (*m1)(Rect(x, y, bcols, brows)); else if( map_depth != CV_32F ) { for( y1 = 0; y1 < brows; y1++ ) @@ -2738,14 +2738,14 @@ public: } } else if( !planar_input ) - map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth()); + (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth()); else { for( y1 = 0; y1 < brows; y1++ ) { short* XY = (short*)(bufxy.data + bufxy.step*y1); - const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x; - const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x; + const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x; + const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x; x1 = 0; #if CV_SSE2 @@ -2778,7 +2778,7 @@ public: } } } - nnfunc( src, dpart, bufxy, borderType, borderValue ); + nnfunc( *src, dpart, bufxy, borderType, borderValue ); continue; } @@ -2788,16 +2788,15 @@ 
public: short* XY = (short*)(bufxy.data + bufxy.step*y1); ushort* A = (ushort*)(bufa.data + bufa.step*y1); - if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) || - (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) ) + if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) { - bufxy = m1->operator()(Rect(x, y, bcols, brows)); - bufa = m2->operator()(Rect(x, y, bcols, brows)); + bufxy = (*m1)(Rect(x, y, bcols, brows)); + bufa = (*m2)(Rect(x, y, bcols, brows)); } else if( planar_input ) { - const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x; - const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x; + const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x; + const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x; x1 = 0; #if CV_SSE2 @@ -2850,7 +2849,7 @@ public: } else { - const float* sXY = (const float*)(map1.data + map1.step*(y+y1)) + x*2; + const float* sXY = (const float*)(m1->data + m1->step*(y+y1)) + x*2; for( x1 = 0; x1 < bcols; x1++ ) { @@ -2863,15 +2862,14 @@ public: } } } - ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue); + ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue); } } } private: - Mat src; - Mat dst; - Mat map1, map2; + const Mat* src; + Mat* dst; const Mat *m1, *m2; int interpolation, borderType; Scalar borderValue; @@ -2961,8 +2959,8 @@ void cv::remap( InputArray _src, OutputArray _dst, const Mat *m1 = &map1, *m2 = &map2; - if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) || - (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) ) + if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || !map2.data)) || + (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || !map1.data)) ) { if( map1.type() != CV_16SC2 ) std::swap(m1, m2); @@ -2974,11 +2972,10 @@ void 
cv::remap( InputArray _src, OutputArray _dst, planar_input = map1.channels() == 1; } - Range range(0, dst.rows); - remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation, + RemapInvoker invoker(src, dst, m1, m2, interpolation, borderType, borderValue, planar_input, nnfunc, ifunc, ctab); - parallel_for_(range, invoker); + parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16)); } @@ -3300,7 +3297,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst, Range range(0, dst.rows); warpAffineInvoker invoker(src, dst, interpolation, borderType, borderValue, adelta, bdelta, M); - parallel_for_(range, invoker); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); } @@ -3430,7 +3427,7 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0, Range range(0, dst.rows); warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue); - parallel_for_(range, invoker); + parallel_for_(range, invoker, dst.total()/(double)(1<<16)); } diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 58264e2a45..92d40f5855 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1919,7 +1919,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, } BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body); + parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); } @@ -2189,7 +2189,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, // parallel_for usage BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body); + parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); } } diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index c4b25aadcb..e2ef8fe86c 100644 --- a/modules/imgproc/src/thresh.cpp +++ 
b/modules/imgproc/src/thresh.cpp @@ -664,13 +664,11 @@ getThreshVal_Otsu_8u( const Mat& _src ) class ThresholdRunner : public ParallelLoopBody { public: - ThresholdRunner(Mat _src, Mat _dst, int _nStripes, double _thresh, double _maxval, int _thresholdType) + ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType) { src = _src; dst = _dst; - nStripes = _nStripes; - thresh = _thresh; maxval = _maxval; thresholdType = _thresholdType; @@ -678,13 +676,8 @@ public: void operator () ( const Range& range ) const { - int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows); - int row1 = range.end >= nStripes ? src.rows : - std::min(cvRound(range.end * src.rows / nStripes), src.rows); - - /*if(0) - printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n", - src.rows, src.cols, range.begin(), range.end(), row0, row1);*/ + int row0 = range.start; + int row1 = range.end; Mat srcStripe = src.rowRange(row0, row1); Mat dstStripe = dst.rowRange(row0, row1); @@ -789,10 +782,9 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m else CV_Error( CV_StsUnsupportedFormat, "" ); - size_t nStripes = (src.total() + (1<<15)) >> 16; - nStripes = MAX(MIN(nStripes, (size_t)4), (size_t)1); - parallel_for_(Range(0, (int)nStripes), - ThresholdRunner(src, dst, nStripes, thresh, maxval, type)); + parallel_for_(Range(0, dst.rows), + ThresholdRunner(src, dst, thresh, maxval, type), + dst.total()/(double)(1<<16)); return thresh; } diff --git a/modules/video/src/bgfg_gmg.cpp b/modules/video/src/bgfg_gmg.cpp index e3574b78f7..42d6fb1691 100644 --- a/modules/video/src/bgfg_gmg.cpp +++ b/modules/video/src/bgfg_gmg.cpp @@ -298,7 +298,7 @@ namespace void cv::BackgroundSubtractorGMG::operator ()(InputArray _frame, OutputArray _fgmask, double newLearningRate) { - cv::Mat frame = _frame.getMat(); + Mat frame = _frame.getMat(); CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F); 
CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4); @@ -313,16 +313,16 @@ void cv::BackgroundSubtractorGMG::operator ()(InputArray _frame, OutputArray _fg initialize(frame.size(), 0.0, frame.depth() == CV_8U ? 255.0 : frame.depth() == CV_16U ? std::numeric_limits::max() : 1.0); _fgmask.create(frameSize_, CV_8UC1); - cv::Mat fgmask = _fgmask.getMat(); + Mat fgmask = _fgmask.getMat(); GMG_LoopBody body(frame, fgmask, nfeatures_, colors_, weights_, maxFeatures, learningRate, numInitializationFrames, quantizationLevels, backgroundPrior, decisionThreshold, maxVal_, minVal_, frameNum_, updateBackgroundModel); - cv::parallel_for_(cv::Range(0, frame.rows), body); + parallel_for_(Range(0, frame.rows), body, frame.total()/(double)(1<<16)); if (smoothingRadius > 0) { - cv::medianBlur(fgmask, buf_, smoothingRadius); + medianBlur(fgmask, buf_, smoothingRadius); cv::swap(fgmask, buf_); }