diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake index eac12ff67b..ccfc4b93b3 100644 --- a/cmake/FindCUDA.cmake +++ b/cmake/FindCUDA.cmake @@ -1098,7 +1098,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files) set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}") endif() - if(CUDA_TARGET_OS_VARIANT) + if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0") set(nvcc_flags ${nvcc_flags} "-target-os-variant=${CUDA_TARGET_OS_VARIANT}") endif() diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp index a97388bd05..9fff4ee281 100644 --- a/modules/core/include/opencv2/core/private.cuda.hpp +++ b/modules/core/include/opencv2/core/private.cuda.hpp @@ -130,6 +130,12 @@ namespace cv { namespace cuda class NppStreamHandler { public: + inline explicit NppStreamHandler(Stream& newStream) + { + oldStream = nppGetStream(); + nppSetStream(StreamAccessor::getStream(newStream)); + } + inline explicit NppStreamHandler(cudaStream_t newStream) { oldStream = nppGetStream(); diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 5a51230cef..1fb980d922 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -56,7 +56,7 @@ #include #if defined ANDROID #include - #else + #elif defined __APPLE__ #include #endif #endif diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index d9a20873f6..1f9d686723 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -163,8 +163,6 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix) #include #if defined ANDROID #include -#else -#include #endif #endif diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp index a5fbfb81fa..d84ae24534 100644 --- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp +++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp @@ -205,19 +205,11 @@ CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stre @param src Source image with CV_8UC1 type. @param dst Destination image. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. @sa equalizeHist */ -CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null()); - -/** @overload */ -static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::equalizeHist(src, dst, buf, stream); -} +CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); /** @brief Base class for Contrast Limited Adaptive Histogram Equalization. : */ @@ -248,8 +240,9 @@ CV_EXPORTS Ptr createCLAHE(double clipLimit = 40.0, Size tileGridSi @param nLevels Number of computed levels. nLevels must be at least 2. @param lowerLevel Lower boundary value of the lowest level. @param upperLevel Upper boundary value of the greatest level. +@param stream Stream for the asynchronous version. */ -CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel); +CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); /** @brief Calculates a histogram with evenly distributed bins. @@ -259,27 +252,11 @@ a four-channel image, all channels are processed separately. @param histSize Size of the histogram. @param lowerLevel Lower boundary of lowest-level bin. @param upperLevel Upper boundary of highest-level bin. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. */ -CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); - +CV_EXPORTS void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); /** @overload */ -static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream); -} - -/** @overload */ -CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); - -/** @overload */ -static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream); -} +CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); /** @brief Calculates a histogram with bins determined by the levels array. @@ -287,27 +264,11 @@ static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int For a four-channel image, all channels are processed separately. @param hist Destination histogram with one row, (levels.cols-1) columns, and the CV_32SC1 type. @param levels Number of levels in the histogram. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. */ -CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null()); - +CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null()); /** @overload */ -static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histRange(src, hist, levels, buf, stream); -} - -/** @overload */ -CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null()); - -/** @overload */ -static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histRange(src, hist, levels, buf, stream); -} +CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()); //! @} cudaimgproc_hist @@ -321,15 +282,17 @@ public: /** @brief Finds edges in an image using the @cite Canny86 algorithm. @param image Single-channel 8-bit input image. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray edges) = 0; + virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0; /** @overload @param dx First derivative of image in the vertical direction. Support only CV_32S type. @param dy First derivative of image in the horizontal direction. Support only CV_32S type. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0; + virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0; virtual void setLowThreshold(double low_thresh) = 0; virtual double getLowThreshold() const = 0; @@ -376,18 +339,20 @@ public: \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of the image). \f$\theta\f$ is the line rotation angle in radians ( \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ). + @param stream Stream for the asynchronous version. @sa HoughLines */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory. @param d_lines Result of cuda::HoughLinesDetector::detect . @param h_lines Output host array. @param h_votes Optional output array for line's votes. + @param stream Stream for the asynchronous version. */ - virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0; + virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -431,10 +396,11 @@ public: @param lines Output vector of lines. Each line is represented by a 4-element vector \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected line segment. + @param stream Stream for the asynchronous version. @sa HoughLinesP */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -475,10 +441,11 @@ public: @param src 8-bit, single-channel grayscale input image. @param circles Output vector of found circles. Each vector is encoded as a 3-element floating-point vector \f$(x, y, radius)\f$ . + @param stream Stream for the asynchronous version. @sa HoughCircles */ - virtual void detect(InputArray src, OutputArray circles) = 0; + virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0; virtual void setDp(float dp) = 0; virtual float getDp() const = 0; @@ -593,8 +560,9 @@ public: positions). @param mask Optional region of interest. If the image is not empty (it needs to have the type CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0; + virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0; }; /** @brief Creates implementation for cuda::CornersDetector . @@ -630,7 +598,7 @@ as src . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. It maps each point of the source image into another point. As a result, you have a new color and new position of each point. @@ -650,7 +618,7 @@ src size. The type is CV_16SC2 . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. @sa cuda::meanShiftFiltering */ @@ -666,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts @param sr Color window radius. @param minsize Minimum segment size. Smaller segments are merged. @param criteria Termination criteria. See TermCriteria. +@param stream Stream for the asynchronous version. */ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), + Stream& stream = Stream::Null()); /////////////////////////// Match Template //////////////////////////// diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp index 0e020394a1..c638ce0ce5 100644 --- a/modules/cudaimgproc/perf/perf_histogram.cpp +++ b/modules/cudaimgproc/perf/perf_histogram.cpp @@ -63,9 +63,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC1, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::histEven(d_src, dst, d_buf, 30, 0, 180); + TEST_CYCLE() cv::cuda::histEven(d_src, dst, 30, 0, 180); CUDA_SANITY_CHECK(dst); } @@ -106,9 +105,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC4, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat d_hist[4]; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel); + TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, histSize, lowerLevel, upperLevel); cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3; d_hist[0].download(cpu_hist0); @@ -167,9 +165,8 @@ PERF_TEST_P(Sz, EqualizeHist, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst, d_buf); + TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst); CUDA_SANITY_CHECK(dst); } diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp index eed4a284e5..1e52bd295a 100644 --- a/modules/cudaimgproc/src/canny.cpp +++ b/modules/cudaimgproc/src/canny.cpp @@ -53,16 +53,16 @@ Ptr cv::cuda::createCannyEdgeDetector(double, double, int, bo namespace canny { - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream); - void edgesHysteresisLocal(PtrStepSzi map, short2* st1); + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream); - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2); + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream); - void getEdges(PtrStepSzi map, PtrStepSzb dst); + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream); } namespace @@ -76,8 +76,8 @@ namespace old_apperture_size_ = -1; } - void detect(InputArray image, OutputArray edges); - void detect(InputArray dx, InputArray dy, OutputArray edges); + void detect(InputArray image, OutputArray edges, Stream& stream); + void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream); void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; } double getLowThreshold() const { return low_thresh_; } @@ -111,7 +111,7 @@ namespace private: void createBuf(Size image_size); - void CannyCaller(GpuMat& edges); + void CannyCaller(GpuMat& edges, Stream& stream); double low_thresh_; double high_thresh_; @@ -128,7 +128,7 @@ namespace int old_apperture_size_; }; - void CannyImpl::detect(InputArray _image, OutputArray _edges) + void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream) { GpuMat image = _image.getGpuMat(); @@ -150,24 +150,24 @@ namespace image.locateROI(wholeSize, ofs); GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step); - canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); } else { #ifndef HAVE_OPENCV_CUDAFILTERS throw_no_cuda(); #else - filterDX_->apply(image, dx_); - filterDY_->apply(image, dy_); + filterDX_->apply(image, dx_, stream); + filterDY_->apply(image, dy_, stream); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); #endif } - CannyCaller(edges); + CannyCaller(edges, stream); } - void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges) + void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream) { GpuMat dx = _dx.getGpuMat(); GpuMat dy = _dy.getGpuMat(); @@ -176,8 +176,8 @@ namespace CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() ); CV_Assert( deviceSupports(SHARED_ATOMICS) ); - dx.copyTo(dx_); - dy.copyTo(dy_); + dx.copyTo(dx_, stream); + dy.copyTo(dy_, stream); if (low_thresh_ > high_thresh_) std::swap(low_thresh_, high_thresh_); @@ -187,9 +187,9 @@ namespace _edges.create(dx.size(), CV_8UC1); GpuMat edges = _edges.getGpuMat(); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); - CannyCaller(edges); + CannyCaller(edges, stream); } void CannyImpl::createBuf(Size image_size) @@ -215,16 +215,16 @@ namespace ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_); } - void CannyImpl::CannyCaller(GpuMat& edges) + void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream) { map_.setTo(Scalar::all(0)); - canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_)); + canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_), StreamAccessor::getStream(stream)); - canny::edgesHysteresisLocal(map_, st1_.ptr()); + canny::edgesHysteresisLocal(map_, st1_.ptr(), StreamAccessor::getStream(stream)); - canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr()); + canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr(), StreamAccessor::getStream(stream)); - canny::getEdges(map_, edges); + canny::getEdges(map_, edges, StreamAccessor::getStream(stream)); } } diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index 3d770e179d..e0ba515693 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -120,7 +120,7 @@ namespace canny mag(y, x) = norm(dxVal, dyVal); } - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); @@ -131,30 +131,31 @@ namespace canny if (L2Grad) { L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } else { L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { if (L2Grad) { L2 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } else { L1 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } } } @@ -217,17 +218,18 @@ namespace canny map(y, x) = edge_type; } - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); bindTexture(&tex_mag, mag); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -328,20 +330,21 @@ namespace canny } } - void edgesHysteresisLocal(PtrStepSzi map, short2* st1) + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(16, 16); const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - edgesHysteresisLocalKernel<<>>(map, st1); + edgesHysteresisLocalKernel<<>>(map, st1); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -441,27 +444,30 @@ namespace canny } } - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2) + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); while (count > 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(128); const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); + edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); count = min(count, map.cols * map.rows); @@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device namespace canny { - void getEdges(PtrStepSzi map, PtrStepSzb dst) + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream) { - transform(map, dst, GetEdges(), WithOutMask(), 0); + transform(map, dst, GetEdges(), WithOutMask(), stream); } } diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 243665083a..162ee469ce 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -68,7 +68,7 @@ namespace GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance, int blockSize, bool useHarrisDetector, double harrisK); - void detect(InputArray image, OutputArray corners, InputArray mask = noArray()); + void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream); private: int maxCorners_; @@ -96,8 +96,11 @@ namespace cuda::createMinEigenValCorner(srcType, blockSize, 3); } - void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask) + void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::gfft; GpuMat image = _image.getGpuMat(); diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index d63e57de31..e942e9eb86 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -49,11 +49,11 @@ using namespace cv::cuda; void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); } -void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); } +void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); } cv::Ptr cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr(); } -void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); } +void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); } @@ -93,7 +93,7 @@ namespace hist void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); } -void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream) +void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) { GpuMat src = _src.getGpuMat(); @@ -107,8 +107,8 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray size_t bufSize = intBufSize + 2 * 256 * sizeof(int); - ensureSizeIsEnough(1, static_cast(bufSize), CV_8UC1, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(_stream); + GpuMat buf = pool.getBuffer(1, static_cast(bufSize), CV_8UC1); GpuMat hist(1, 256, CV_32SC1, buf.data); GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int)); @@ -288,7 +288,7 @@ namespace { typedef typename NppHistogramEvenFuncC1::src_t src_t; - static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream) + static void hist(const GpuMat& src, OutputArray _hist, int histSize, int lowerLevel, int upperLevel, Stream& stream) { const int levels = histSize + 1; @@ -302,15 +302,15 @@ namespace int buf_size; get_buf_size(sz, levels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels, lowerLevel, upperLevel, buf.ptr()) ); - if (stream == 0) + if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -319,7 +319,7 @@ namespace { typedef typename NppHistogramEvenFuncC4::src_t src_t; - static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream) + static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) { int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1}; hist[0].create(1, histSize[0], CV_32S); @@ -336,14 +336,14 @@ namespace int buf_size; get_buf_size(sz, levels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr()) ); - if (stream == 0) + if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -392,7 +392,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream) + static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, Stream& stream) { CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 ); @@ -406,8 +406,8 @@ namespace int buf_size; get_buf_size(sz, levels.cols, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); @@ -424,7 +424,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream) + static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream) { CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 ); CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 ); @@ -447,8 +447,8 @@ namespace int buf_size; get_buf_size(sz, nLevels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); @@ -460,7 +460,7 @@ namespace }; } -void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel) +void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream) { const int kind = _levels.kind(); @@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr(), nLevels, lowerLevel, upperLevel) ); if (kind == _InputArray::CUDA_GPU_MAT) - _levels.getGpuMatRef().upload(host_levels); + _levels.getGpuMatRef().upload(host_levels, stream); } namespace hist @@ -493,9 +493,9 @@ namespace } } -void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream) +void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream); static const hist_t hist_callers[] = { NppHistogramEvenC1::hist, @@ -514,12 +514,12 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 ); - hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream); } -void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) +void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream); static const hist_t hist_callers[] = { NppHistogramEvenC4::hist, @@ -532,12 +532,12 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, i CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 ); - hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream); } -void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream) +void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream); static const hist_t hist_callers[] = { NppHistogramRangeC1::hist, @@ -553,12 +553,12 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 ); - hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, levels, stream); } -void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream) +void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream); static const hist_t hist_callers[] = { NppHistogramRangeC4::hist, @@ -573,7 +573,7 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4] CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 ); - hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, levels, stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp index 3f9b9334c5..6bdaf16a2d 100644 --- a/modules/cudaimgproc/src/hough_circles.cpp +++ b/modules/cudaimgproc/src/hough_circles.cpp @@ -74,7 +74,7 @@ namespace public: HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles); - void detect(InputArray src, OutputArray circles); + void detect(InputArray src, OutputArray circles, Stream& stream); void setDp(float dp) { dp_ = dp; } float getDp() const { return dp_; } @@ -154,8 +154,11 @@ namespace filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1); } - void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles) + void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_circles; diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp index b9f159a9cb..7b9c082942 100644 --- a/modules/cudaimgproc/src/hough_lines.cpp +++ b/modules/cudaimgproc/src/hough_lines.cpp @@ -75,8 +75,8 @@ namespace { } - void detect(InputArray src, OutputArray lines); - void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); + void detect(InputArray src, OutputArray lines, Stream& stream); + void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -125,8 +125,11 @@ namespace GpuMat result_; }; - void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; @@ -170,7 +173,7 @@ namespace result_.copyTo(lines); } - void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes) + void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream) { GpuMat d_lines = _d_lines.getGpuMat(); @@ -184,12 +187,18 @@ namespace CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 ); - d_lines.row(0).download(h_lines); + if (stream) + d_lines.row(0).download(h_lines, stream); + else + d_lines.row(0).download(h_lines); if (h_votes.needed()) { GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr(1)); - d_votes.download(h_votes); + if (stream) + d_votes.download(h_votes, stream); + else + d_votes.download(h_votes); } } } diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp index 2434f6d26d..e3e34ec3d0 100644 --- a/modules/cudaimgproc/src/hough_segments.cpp +++ b/modules/cudaimgproc/src/hough_segments.cpp @@ -79,7 +79,7 @@ namespace { } - void detect(InputArray src, OutputArray lines); + void detect(InputArray src, OutputArray lines, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -128,8 +128,11 @@ namespace GpuMat result_; }; - void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; using namespace cv::cuda::device::hough_segments; diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp index ad5819800e..54926f3777 100644 --- a/modules/cudaimgproc/src/mssegmentation.cpp +++ b/modules/cudaimgproc/src/mssegmentation.cpp @@ -43,7 +43,7 @@ #if !defined HAVE_CUDA || defined(CUDA_DISABLER) -void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); } +void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); } #else @@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs) } // anonymous namespace -void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria) +void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream) { GpuMat src = _src.getGpuMat(); @@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, // Perform mean shift procedure and obtain region and spatial maps GpuMat d_rmap, d_spmap; - cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria); + cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream); + + stream.waitForCompletion(); + Mat rmap(d_rmap); Mat spmap(d_spmap); diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp index 3e7d44e87c..08b5c5f959 100644 --- a/modules/ml/src/ann_mlp.cpp +++ b/modules/ml/src/ann_mlp.cpp @@ -103,6 +103,7 @@ public: ANN_MLPImpl( const Params& p ) { + clear(); setParams(p); } @@ -126,6 +127,7 @@ public: rng = RNG((uint64)-1); weights.clear(); trained = false; + max_buf_sz = 1 << 12; } int layer_count() const { return (int)layer_sizes.size(); } @@ -1241,7 +1243,7 @@ public: clear(); vector _layer_sizes; - fn["layer_sizes"] >> _layer_sizes; + readVectorOrMat(fn["layer_sizes"], _layer_sizes); create( _layer_sizes ); int i, l_count = layer_count(); diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp index 5e0b307338..236cd97a2d 100644 --- a/modules/ml/src/boost.cpp +++ b/modules/ml/src/boost.cpp @@ -434,13 +434,17 @@ public: bparams.priors = params0.priors; FileNode tparams_node = fn["training_params"]; - String bts = (String)tparams_node["boosting_type"]; + // check for old layout + String bts = (String)(fn["boosting_type"].empty() ? + tparams_node["boosting_type"] : fn["boosting_type"]); bparams.boostType = (bts == "DiscreteAdaboost" ? Boost::DISCRETE : bts == "RealAdaboost" ? Boost::REAL : bts == "LogitBoost" ? Boost::LOGIT : bts == "GentleAdaboost" ? Boost::GENTLE : -1); _isClassifier = bparams.boostType == Boost::DISCRETE; - bparams.weightTrimRate = (double)tparams_node["weight_trimming_rate"]; + // check for old layout + bparams.weightTrimRate = (double)(fn["weight_trimming_rate"].empty() ? + tparams_node["weight_trimming_rate"] : fn["weight_trimming_rate"]); } void read( const FileNode& fn ) diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp index 6b5ceb4881..d2ac18ff01 100644 --- a/modules/ml/src/data.cpp +++ b/modules/ml/src/data.cpp @@ -898,7 +898,7 @@ public: CV_Assert( m > 0 ); // if m==0, vi is an ordered variable const int* cmap = &catMap.at(ofs[0]); - bool fastMap = (m == cmap[m] - cmap[0]); + bool fastMap = (m == cmap[m - 1] - cmap[0] + 1); if( fastMap ) { diff --git a/modules/ml/src/inner_functions.cpp b/modules/ml/src/inner_functions.cpp index dbc21ff092..561abbaeb8 100644 --- a/modules/ml/src/inner_functions.cpp +++ b/modules/ml/src/inner_functions.cpp @@ -115,6 +115,7 @@ void StatModel::save(const String& filename) const { FileStorage fs(filename, FileStorage::WRITE); fs << getDefaultModelName() << "{"; + fs << "format" << (int)3; write(fs); fs << "}"; } diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp index d308ae98ec..69ff03047e 100644 --- a/modules/ml/src/precomp.hpp +++ b/modules/ml/src/precomp.hpp @@ -263,11 +263,27 @@ namespace ml vector subsets; vector classLabels; vector missingSubst; + vector varMapping; bool _isClassifier; Ptr w; }; + template + static inline void readVectorOrMat(const FileNode & node, std::vector & v) + { + if (node.type() == FileNode::MAP) + { + Mat m; + node >> m; + m.copyTo(v); + } + else if (node.type() == FileNode::SEQ) + { + node >> v; + } + } + }} #endif /* __OPENCV_ML_PRECOMP_HPP__ */ diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp index 7c9cbaf268..7441faac17 100644 --- a/modules/ml/src/rtrees.cpp +++ b/modules/ml/src/rtrees.cpp @@ -346,7 +346,7 @@ public: oobError = (double)fn["oob_error"]; int ntrees = (int)fn["ntrees"]; - fn["var_importance"] >> varImportance; + readVectorOrMat(fn["var_importance"], varImportance); readParams(fn); diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index c7c32f0be5..a0df44f78b 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -2038,7 +2038,8 @@ public: { Params _params; - String svm_type_str = (String)fn["svmType"]; + // check for old naming + String svm_type_str = (String)(fn["svm_type"].empty() ? fn["svmType"] : fn["svm_type"]); int svmType = svm_type_str == "C_SVC" ? C_SVC : svm_type_str == "NU_SVC" ? NU_SVC : diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp index 416abd9364..64f66169b0 100644 --- a/modules/ml/src/tree.cpp +++ b/modules/ml/src/tree.cpp @@ -1597,7 +1597,10 @@ void DTreesImpl::writeParams(FileStorage& fs) const fs << "}"; if( !varIdx.empty() ) + { + fs << "global_var_idx" << 1; fs << "var_idx" << varIdx; + } fs << "var_type" << varType; @@ -1726,9 +1729,8 @@ void DTreesImpl::readParams( const FileNode& fn ) if( !tparams_node.empty() ) // training parameters are not necessary { params0.useSurrogates = (int)tparams_node["use_surrogates"] != 0; - params0.maxCategories = (int)tparams_node["max_categories"]; + params0.maxCategories = (int)(tparams_node["max_categories"].empty() ? 16 : tparams_node["max_categories"]); params0.regressionAccuracy = (float)tparams_node["regression_accuracy"]; - params0.maxDepth = (int)tparams_node["max_depth"]; params0.minSampleCount = (int)tparams_node["min_sample_count"]; params0.CVFolds = (int)tparams_node["cross_validation_folds"]; @@ -1741,13 +1743,83 @@ void DTreesImpl::readParams( const FileNode& fn ) tparams_node["priors"] >> params0.priors; } - fn["var_idx"] >> varIdx; + readVectorOrMat(fn["var_idx"], varIdx); fn["var_type"] >> varType; - fn["cat_ofs"] >> catOfs; - fn["cat_map"] >> catMap; - fn["missing_subst"] >> missingSubst; - fn["class_labels"] >> classLabels; + int format = 0; + fn["format"] >> format; + bool isLegacy = format < 3; + + int varAll = (int)fn["var_all"]; + if (isLegacy && (int)varType.size() <= varAll) + { + std::vector extendedTypes(varAll + 1, 0); + + int i = 0, n; + if (!varIdx.empty()) + { + n = (int)varIdx.size(); + for (; i < n; ++i) + { + int var = varIdx[i]; + extendedTypes[var] = varType[i]; + } + } + else + { + n = (int)varType.size(); + for (; i < n; ++i) + { + extendedTypes[i] = varType[i]; + } + } + extendedTypes[varAll] = (uchar)(_isClassifier ? VAR_CATEGORICAL : VAR_ORDERED); + extendedTypes.swap(varType); + } + + readVectorOrMat(fn["cat_map"], catMap); + + if (isLegacy) + { + // generating "catOfs" from "cat_count" + catOfs.clear(); + classLabels.clear(); + std::vector counts; + readVectorOrMat(fn["cat_count"], counts); + unsigned int i = 0, j = 0, curShift = 0, size = (int)varType.size() - 1; + for (; i < size; ++i) + { + Vec2i newOffsets(0, 0); + if (varType[i] == VAR_CATEGORICAL) // only categorical vars are represented in catMap + { + newOffsets[0] = curShift; + curShift += counts[j]; + newOffsets[1] = curShift; + ++j; + } + catOfs.push_back(newOffsets); + } + // other elements in "catMap" are "classLabels" + if (curShift < catMap.size()) + { + classLabels.insert(classLabels.end(), catMap.begin() + curShift, catMap.end()); + catMap.erase(catMap.begin() + curShift, catMap.end()); + } + } + else + { + fn["cat_ofs"] >> catOfs; + fn["missing_subst"] >> missingSubst; + fn["class_labels"] >> classLabels; + } + + // init var mapping for node reading (var indexes or varIdx indexes) + bool globalVarIdx = false; + fn["global_var_idx"] >> globalVarIdx; + if (globalVarIdx || varIdx.empty()) + setRangeVector(varMapping, (int)varType.size()); + else + varMapping = varIdx; initCompVarIdx(); setDParams(params0); @@ -1759,6 +1831,7 @@ int DTreesImpl::readSplit( const FileNode& fn ) int vi = (int)fn["var"]; CV_Assert( 0 <= vi && vi <= (int)varType.size() ); + vi = varMapping[vi]; // convert to varIdx if needed split.varIdx = vi; if( varType[vi] == VAR_CATEGORICAL ) // split on categorical var diff --git a/modules/ml/test/test_save_load.cpp b/modules/ml/test/test_save_load.cpp index bef2fd0e1c..74e8eef0df 100644 --- a/modules/ml/test/test_save_load.cpp +++ b/modules/ml/test/test_save_load.cpp @@ -158,6 +158,109 @@ TEST(ML_Boost, save_load) { CV_SLMLTest test( CV_BOOST ); test.safe_run(); } TEST(ML_RTrees, save_load) { CV_SLMLTest test( CV_RTREES ); test.safe_run(); } TEST(DISABLED_ML_ERTrees, save_load) { CV_SLMLTest test( CV_ERTREES ); test.safe_run(); } +class CV_LegacyTest : public cvtest::BaseTest +{ +public: + CV_LegacyTest(const std::string &_modelName, const std::string &_suffixes = std::string()) + : cvtest::BaseTest(), modelName(_modelName), suffixes(_suffixes) + { + } + virtual ~CV_LegacyTest() {} +protected: + void run(int) + { + unsigned int idx = 0; + for (;;) + { + if (idx >= suffixes.size()) + break; + int found = (int)suffixes.find(';', idx); + string piece = suffixes.substr(idx, found - idx); + if (piece.empty()) + break; + oneTest(piece); + idx += (unsigned int)piece.size() + 1; + } + } + void oneTest(const string & suffix) + { + using namespace cv::ml; + + int code = cvtest::TS::OK; + string filename = ts->get_data_path() + "legacy/" + modelName + suffix; + bool isTree = modelName == CV_BOOST || modelName == CV_DTREE || modelName == CV_RTREES; + Ptr model; + if (modelName == CV_BOOST) + model = StatModel::load(filename); + else if (modelName == CV_ANN) + model = StatModel::load(filename); + else if (modelName == CV_DTREE) + model = StatModel::load(filename); + else if (modelName == CV_NBAYES) + model = StatModel::load(filename); + else if (modelName == CV_SVM) + model = StatModel::load(filename); + else if (modelName == CV_RTREES) + model = StatModel::load(filename); + if (!model) + { + code = cvtest::TS::FAIL_INVALID_TEST_DATA; + } + else + { + Mat input = Mat(isTree ? 10 : 1, model->getVarCount(), CV_32F); + ts->get_rng().fill(input, RNG::UNIFORM, 0, 40); + + if (isTree) + randomFillCategories(filename, input); + + Mat output; + model->predict(input, output, StatModel::RAW_OUTPUT | (isTree ? DTrees::PREDICT_SUM : 0)); + // just check if no internal assertions or errors thrown + } + ts->set_failed_test_info(code); + } + void randomFillCategories(const string & filename, Mat & input) + { + Mat catMap; + Mat catCount; + std::vector varTypes; + + FileStorage fs(filename, FileStorage::READ); + FileNode root = fs.getFirstTopLevelNode(); + root["cat_map"] >> catMap; + root["cat_count"] >> catCount; + root["var_type"] >> varTypes; + + int offset = 0; + int countOffset = 0; + uint var = 0, varCount = (uint)varTypes.size(); + for (; var < varCount; ++var) + { + if (varTypes[var] == ml::VAR_CATEGORICAL) + { + int size = catCount.at(0, countOffset); + for (int row = 0; row < input.rows; ++row) + { + int randomChosenIndex = offset + ((uint)ts->get_rng()) % size; + int value = catMap.at(0, randomChosenIndex); + input.at(row, var) = (float)value; + } + offset += size; + ++countOffset; + } + } + } + string modelName; + string suffixes; +}; + +TEST(ML_ANN, legacy_load) { CV_LegacyTest test(CV_ANN, "_waveform.xml"); test.safe_run(); } +TEST(ML_Boost, legacy_load) { CV_LegacyTest test(CV_BOOST, "_adult.xml;_1.xml;_2.xml;_3.xml"); test.safe_run(); } +TEST(ML_DTree, legacy_load) { CV_LegacyTest test(CV_DTREE, "_abalone.xml;_mushroom.xml"); test.safe_run(); } +TEST(ML_NBayes, legacy_load) { CV_LegacyTest test(CV_NBAYES, "_waveform.xml"); test.safe_run(); } +TEST(ML_SVM, legacy_load) { CV_LegacyTest test(CV_SVM, "_poletelecomm.xml;_waveform.xml"); test.safe_run(); } +TEST(ML_RTrees, legacy_load) { CV_LegacyTest test(CV_RTREES, "_waveform.xml"); test.safe_run(); } /*TEST(ML_SVM, throw_exception_when_save_untrained_model) { diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl index 704dec4447..4fae320d02 100644 --- a/modules/objdetect/src/opencl/objdetect_hog.cl +++ b/modules/objdetect/src/opencl/objdetect_hog.cl @@ -48,7 +48,7 @@ #define CELLS_PER_BLOCK_X 2 #define CELLS_PER_BLOCK_Y 2 #define NTHREADS 256 -#define CV_PI_F 3.1415926535897932384626433832795f +#define CV_PI_F M_PI_F #ifdef INTEL_DEVICE #define QANGLE_TYPE int @@ -606,23 +606,23 @@ __kernel void compute_gradients_8UC4_kernel( barrier(CLK_LOCAL_MEM_FENCE); if (x < width) { - float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], - sh_row[tid + 2 * (NTHREADS + 2)]); - float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], - sh_row[tid + 2 + 2 * (NTHREADS + 2)]); + float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], + sh_row[tid + 2 * (NTHREADS + 2)], 0); + float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], + sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0); - float3 dx; + float4 dx; if (correct_gamma == 1) dx = sqrt(b) - sqrt(a); else dx = b - a; - float3 dy = (float3) 0.f; + float4 dy = (float4) 0.f; if (gidY > 0 && gidY < height - 1) { - a = convert_float3(img[(gidY - 1) * img_step + x].xyz); - b = convert_float3(img[(gidY + 1) * img_step + x].xyz); + a = convert_float4(img[(gidY - 1) * img_step + x].xyzw); + b = convert_float4(img[(gidY + 1) * img_step + x].xyzw); if (correct_gamma == 1) dy = sqrt(b) - sqrt(a); @@ -630,28 +630,25 @@ __kernel void compute_gradients_8UC4_kernel( dy = b - a; } + float4 mag = hypot(dx, dy); float best_dx = dx.x; float best_dy = dy.x; - float mag0 = dx.x * dx.x + dy.x * dy.x; - float mag1 = dx.y * dx.y + dy.y * dy.y; - if (mag0 < mag1) + float mag0 = mag.x; + if (mag0 < mag.y) { best_dx = dx.y; best_dy = dy.y; - mag0 = mag1; + mag0 = mag.y; } - mag1 = dx.z * dx.z + dy.z * dy.z; - if (mag0 < mag1) + if (mag0 < mag.z) { best_dx = dx.z; best_dy = dy.z; - mag0 = mag1; + mag0 = mag.z; } - mag0 = sqrt(mag0); - float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f; int hidx = (int)floor(ang); ang -= hidx; @@ -710,7 +707,7 @@ __kernel void compute_gradients_8UC1_kernel( else dy = a - b; } - float mag = sqrt(dx * dx + dy * dy); + float mag = hypot(dx, dy); float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f; int hidx = (int)floor(ang); diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp index bd967f963d..5870f4bc71 100644 --- a/modules/videoio/src/cap_ffmpeg_impl.hpp +++ b/modules/videoio/src/cap_ffmpeg_impl.hpp @@ -139,8 +139,10 @@ extern "C" { #include #include #include +#if defined __APPLE__ #include #endif +#endif #ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index af3f874e1f..2e7faa3341 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -1053,12 +1053,11 @@ TEST(equalizeHist) cuda::GpuMat d_src(src); cuda::GpuMat d_dst; - cuda::GpuMat d_buf; - cuda::equalizeHist(d_src, d_dst, d_buf); + cuda::equalizeHist(d_src, d_dst); CUDA_ON; - cuda::equalizeHist(d_src, d_dst, d_buf); + cuda::equalizeHist(d_src, d_dst); CUDA_OFF; } }