diff --git a/doc/gpu_features2d.tex b/doc/gpu_features2d.tex index 82b6396e97..d92694dee5 100644 --- a/doc/gpu_features2d.tex +++ b/doc/gpu_features2d.tex @@ -1,7 +1,55 @@ \section{Feature Detection and Description} -\cvclass{gpu::SURF\_GPU} +\cvclass{gpu::SURFParams\_GPU}\label{class.gpu.SURFParams} +Various SURF algorithm parameters. + +\begin{lstlisting} +struct SURFParams_GPU +{ + SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), + initialScale(2.f), l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), + l4(1.f/1.5f), edgeScale(0.81f), initialStep(1), extended(true), + featuresRatio(0.01f) {} + + //! The interest operator threshold + float threshold; + //! The number of octaves to process + int nOctaves; + //! The number of intervals in each octave + int nIntervals; + //! The scale associated with the first interval of the first octave + float initialScale; + + //! mask parameter l_1 + float l1; + //! mask parameter l_2 + float l2; + //! mask parameter l_3 + float l3; + //! mask parameter l_4 + float l4; + //! The amount to scale the edge rejection mask + float edgeScale; + //! The initial sampling step in pixels. + int initialStep; + + //! True, if generate 128-len descriptors, false - 64-len descriptors + bool extended; + + //! max features = featuresRatio * img.size().area() + float featuresRatio; +}; +\end{lstlisting} + +In contrast to \hyperref[cv.class.SURF]{cv::SURF} \texttt{SURF\_GPU} works with float sources (with range [0..1]). It performs conversion after calculation of the integral by division result by 255. Please take it into consideration when change some parameters (like hessian threshold). + +Current \texttt{SURF\_GPU} implementation supports the number of intervals in each octave in range [3..21]. + +See also: \hyperref[class.gpu.SURF]{cv::gpu::SURF\_GPU}. + + +\cvclass{gpu::SURF\_GPU}\label{class.gpu.SURF} Class for extracting Speeded Up Robust Features from an image. \begin{lstlisting} @@ -62,7 +110,7 @@ The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provide The class \texttt{SURF\_GPU} uses some buffers and provides access to it. All buffers can be safely released between function calls. -See also: \hyperref[cv.class.SURF]{cv::SURF}. +See also: \hyperref[cv.class.SURF]{cv::SURF}, \hyperref[class.gpu.SURFParams]{cv::gpu::SURFParams\_GPU}. \cvclass{gpu::BruteForceMatcher\_GPU} @@ -269,7 +317,7 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector >\& matches, \par float maxDistance, \par const std::vector\& masks = std::vector(), \par bool compactResult = false); } -This function works only on devices with Compute Capability $>=$ 1.1. +\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1. See also: \cvCppCross{DescriptorMatcher::radiusMatch}. @@ -293,7 +341,8 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par In contrast to \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatch]{cv::gpu::BruteForceMather\_GPU::radiusMatch} results are not sorted by distance increasing order. -This function works only on devices with Compute Capability $>=$ 1.1. +\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1. + \cvfunc{cv::gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload} Downloads \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained via \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to CPU vector with \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true \texttt{matches} vector will not contain matches for fully masked out query descriptors. diff --git a/doc/gpu_image_processing.tex b/doc/gpu_image_processing.tex index c967093198..87ccea2c05 100644 --- a/doc/gpu_image_processing.tex +++ b/doc/gpu_image_processing.tex @@ -17,6 +17,8 @@ Performs mean-shift filtering for each point of the source image. It maps each p \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.} \end{description} +\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2. + \cvCppFunc{gpu::meanShiftProc} Performs mean-shift procedure and stores information about processed points (i.e. their colors and positions) into two images. @@ -35,6 +37,8 @@ Performs mean-shift procedure and stores information about processed points (i.e \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.} \end{description} +\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2. + See also: \cvCppCross{gpu::meanShiftFiltering}. @@ -55,6 +59,8 @@ Performs mean-shift segmentation of the source image and eleminates small segmen \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.} \end{description} +\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2. + \cvCppFunc{gpu::integral} Computes integral image and squared integral image. @@ -319,7 +325,7 @@ double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double ma } \begin{description} -\cvarg{src}{Source array (single-channel, \texttt{CV\_64F} depth isn't supported).} +\cvarg{src}{Source array (single-channel).} \cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.} \cvarg{thresh}{Threshold value.} \cvarg{maxVal}{Maximum value to use with \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.} diff --git a/doc/opencv.pdf b/doc/opencv.pdf index cb646f5992..82c4ff4bbe 100644 Binary files a/doc/opencv.pdf and b/doc/opencv.pdf differ diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index 6ebf5a4cc9..0e9752e347 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -582,10 +582,10 @@ namespace cv { namespace gpu { namespace bfmatcher } /////////////////////////////////////////////////////////////////////////////// - // Match kernel chooser + // Match caller template - void match_chooser(const DevMem2D_& queryDescs, const Train& train, + void matchDispatcher(const DevMem2D_& queryDescs, const Train& train, const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12) { @@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { SingleMask m(mask); - match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -640,11 +640,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (mask.data) { SingleMask m(mask); - match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -664,11 +664,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (maskCollection.data) { MaskCollection mask(maskCollection.data); - match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -688,11 +688,11 @@ namespace cv { namespace gpu { namespace bfmatcher if (maskCollection.data) { MaskCollection mask(maskCollection.data); - match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); + matchDispatcher((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } @@ -942,22 +942,35 @@ namespace cv { namespace gpu { namespace bfmatcher /////////////////////////////////////////////////////////////////////////////// // knn match caller + template + void calcDistanceDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, + const Mask& mask, const DevMem2Df& allDist) + { + calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist); + } + + void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, + const DevMem2Df& allDist) + { + findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); + } + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist) { if (mask.data) { - calcDistance_caller<16, 16, L1Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist); } else { - calcDistance_caller<16, 16, L1Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist); } - findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); + findKnnMatchDispatcher(knn, trainIdx, distance, allDist); } template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); @@ -973,16 +986,16 @@ namespace cv { namespace gpu { namespace bfmatcher { if (mask.data) { - calcDistance_caller<16, 16, L2Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, SingleMask(mask), allDist); } else { - calcDistance_caller<16, 16, L2Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + calcDistanceDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, WithOutMask(), allDist); } - findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); + findKnnMatchDispatcher(knn, trainIdx, distance, allDist); } template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); @@ -1061,7 +1074,16 @@ namespace cv { namespace gpu { namespace bfmatcher } /////////////////////////////////////////////////////////////////////////////// - // Radius Match kernel chooser + // Radius Match caller + + template + void radiusMatchDispatcher(const DevMem2D_& queryDescs, const DevMem2D_& trainDescs, + float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, + const DevMem2Df& distance) + { + radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, + trainIdx, nMatches, distance); + } template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, @@ -1069,12 +1091,12 @@ namespace cv { namespace gpu { namespace bfmatcher { if (mask.data) { - radiusMatch_caller<16, 16, L1Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, SingleMask(mask), trainIdx, nMatches, distance); } else { - radiusMatch_caller<16, 16, L1Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, WithOutMask(), trainIdx, nMatches, distance); } } @@ -1092,12 +1114,12 @@ namespace cv { namespace gpu { namespace bfmatcher { if (mask.data) { - radiusMatch_caller<16, 16, L2Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, SingleMask(mask), trainIdx, nMatches, distance); } else { - radiusMatch_caller<16, 16, L2Dist>((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, + radiusMatchDispatcher((DevMem2D_)queryDescs, (DevMem2D_)trainDescs, maxDistance, WithOutMask(), trainIdx, nMatches, distance); } } diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 6228fb7f8f..e3d9851689 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -44,6 +44,7 @@ #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" #include "opencv2/gpu/device/limits_gpu.hpp" +#include "opencv2/gpu/device/transform.hpp" using namespace cv::gpu; using namespace cv::gpu::device; @@ -94,46 +95,46 @@ namespace cv { namespace gpu { namespace color return vec.w; } + template + void callConvert(const DevMem2D& src, const DevMem2D& dst, const Cvt& cvt, cudaStream_t stream) + { + typedef typename Cvt::src_t src_t; + typedef typename Cvt::dst_t dst_t; + + transform((DevMem2D_)src, (DevMem2D_)dst, cvt, stream); + } + ////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// - template - __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template + struct RGB2RGB { typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2RGB(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ dst_t operator()(const src_t& src) const { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); dst_t dst; dst.x = (&src.x)[bidx]; dst.y = src.y; dst.z = (&src.x)[bidx ^ 2]; setAlpha(dst, getAlpha(src)); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + + return dst; } - } + + private: + int bidx; + }; template void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + RGB2RGB cvt(bidx); + callConvert(src, dst, cvt, stream); } void RGB2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream) @@ -174,110 +175,90 @@ namespace cv { namespace gpu { namespace color /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// - template struct RGB5x52RGBConverter {}; - template struct RGB5x52RGBConverter<5, DSTCN> + template struct RGB5x52RGBConverter; + template <> struct RGB5x52RGBConverter<5> { - typedef typename TypeVec::vec_t dst_t; - - static __device__ dst_t cvt(uint src, int bidx) - { - dst_t dst; - + template + static __device__ void cvt(uint src, D& dst, int bidx) + { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 2) & ~7); (&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7); setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0)); - - return dst; } }; - template struct RGB5x52RGBConverter<6, DSTCN> + template <> struct RGB5x52RGBConverter<6> { - typedef typename TypeVec::vec_t dst_t; - - static __device__ dst_t cvt(uint src, int bidx) - { - dst_t dst; - + template + static __device__ void cvt(uint src, D& dst, int bidx) + { (&dst.x)[bidx] = (uchar)(src << 3); dst.y = (uchar)((src >> 3) & ~3); (&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7); setAlpha(dst, (uchar)(255)); - - return dst; } }; - template - __global__ void RGB5x52RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB5x52RGB { + typedef ushort src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB5x52RGB(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ dst_t operator()(ushort src) const { - uint src = *(const ushort*)(src_ + y * src_step + (x << 1)); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN) = RGB5x52RGBConverter::cvt(src, bidx); + dst_t dst; + RGB5x52RGBConverter::cvt((uint)src, dst, bidx); + return dst; } - } - template struct RGB2RGB5x5Converter {}; - template struct RGB2RGB5x5Converter + private: + int bidx; + }; + + template struct RGB2RGB5x5Converter; + template<> struct RGB2RGB5x5Converter<6> { - static __device__ ushort cvt(const uchar* src, int bidx) + template + static __device__ ushort cvt(const T& src, int bidx) { - return (ushort)((src[bidx] >> 3) | ((src[1] & ~3) << 3) | ((src[bidx^2] & ~7) << 8)); + return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8)); } }; - template<> struct RGB2RGB5x5Converter<3, 5> + template<> struct RGB2RGB5x5Converter<5> { - static __device__ ushort cvt(const uchar* src, int bidx) + static __device__ ushort cvt(const uchar3& src, int bidx) { - return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7)); + return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7)); } - }; - template<> struct RGB2RGB5x5Converter<4, 5> - { - static __device__ ushort cvt(const uchar* src, int bidx) + static __device__ ushort cvt(const uchar4& src, int bidx) { - return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7) | (src[3] ? 0x8000 : 0)); + return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0)); } - }; + }; - template - __global__ void RGB2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB2RGB5x5 { typedef typename TypeVec::vec_t src_t; + typedef ushort dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2RGB5x5(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ ushort operator()(const src_t& src) { - src_t src = *(src_t*)(src_ + y * src_step + x * SRCCN); - - *(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter::cvt(&src.x, bidx); + return RGB2RGB5x5Converter::cvt(src, bidx); } - } + + private: + int bidx; + }; template void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB5x52RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + RGB5x52RGB cvt(bidx); + callConvert(src, dst, cvt, stream); } void RGB5x52RGB_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream) @@ -295,18 +276,8 @@ namespace cv { namespace gpu { namespace color template void RGB2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB2RGB5x5<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + RGB2RGB5x5 cvt(bidx); + callConvert(src, dst, cvt, stream); } void RGB2RGB5x5_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int green_bits, int bidx, cudaStream_t stream) @@ -323,27 +294,23 @@ namespace cv { namespace gpu { namespace color ///////////////////////////////// Grayscale to Color //////////////////////////////// - template - __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) + template struct Gray2RGB { + typedef T src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows && x < cols) + __device__ dst_t operator()(const T& src) const { - T src = *(const T*)(src_ + y * src_step + x * sizeof(T)); dst_t dst; - dst.x = src; - dst.y = src; - dst.z = src; - setAlpha(dst, ColorChannel::max()); - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; - } - } - template struct Gray2RGB5x5Converter {}; + dst.z = dst.y = dst.x = src; + setAlpha(dst, ColorChannel::max()); + + return dst; + } + }; + + template struct Gray2RGB5x5Converter; template<> struct Gray2RGB5x5Converter<6> { static __device__ ushort cvt(uint t) @@ -360,35 +327,22 @@ namespace cv { namespace gpu { namespace color } }; - template - __global__ void Gray2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) + template struct Gray2RGB5x5 { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + typedef uchar src_t; + typedef ushort dst_t; - if (y < rows && x < cols) + __device__ ushort operator()(uchar src) const { - uint src = src_[y * src_step + x]; - - *(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter::cvt(src); + return Gray2RGB5x5Converter::cvt((uint)src); } - } + }; template void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - Gray2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + Gray2RGB cvt; + callConvert(src, dst, cvt, stream); } void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream) @@ -418,18 +372,8 @@ namespace cv { namespace gpu { namespace color template void Gray2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - Gray2RGB5x5<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + Gray2RGB5x5 cvt; + callConvert(src, dst, cvt, stream); } void Gray2RGB5x5_gpu(const DevMem2D& src, const DevMem2D& dst, int green_bits, cudaStream_t stream) @@ -459,7 +403,7 @@ namespace cv { namespace gpu { namespace color BLOCK_SIZE = 256 }; - template struct RGB5x52GrayConverter {}; + template struct RGB5x52GrayConverter; template<> struct RGB5x52GrayConverter<6> { static __device__ uchar cvt(uint t) @@ -475,70 +419,52 @@ namespace cv { namespace gpu { namespace color } }; - template - __global__ void RGB5x52Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) + template struct RGB5x52Gray { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + typedef ushort src_t; + typedef uchar dst_t; - if (y < rows && x < cols) + __device__ uchar operator()(ushort src) const { - uint src = *(ushort*)(src_ + y * src_step + (x << 1)); - - dst_[y * dst_step + x] = RGB5x52GrayConverter::cvt(src); + return RGB5x52GrayConverter::cvt((uint)src); } + }; + + template + __device__ T RGB2GrayConvert(const T* src, int bidx) + { + return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); + } + __device__ float RGB2GrayConvert(const float* src, int bidx) + { + const float cr = 0.299f; + const float cg = 0.587f; + const float cb = 0.114f; + + return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr; } - template struct RGB2GrayConvertor - { - static __device__ T cvt(const T* src, int bidx) - { - return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); - } - }; - template <> struct RGB2GrayConvertor - { - static __device__ float cvt(const float* src, int bidx) - { - const float cr = 0.299f; - const float cg = 0.587f; - const float cb = 0.114f; - - return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr; - } - }; - - template - __global__ void RGB2Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB2Gray { typedef typename TypeVec::vec_t src_t; + typedef T dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2Gray(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ T operator()(const src_t& src) { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - - *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor::cvt(&src.x, bidx); + return RGB2GrayConvert(&src.x, bidx); } - } + + private: + int bidx; + }; template void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB2Gray<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + RGB2Gray cvt(bidx); + callConvert(src, dst, cvt, stream); } void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream) @@ -568,18 +494,8 @@ namespace cv { namespace gpu { namespace color template void RGB5x52Gray_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB5x52Gray<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + RGB5x52Gray cvt; + callConvert(src, dst, cvt, stream); } void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream) @@ -595,622 +511,614 @@ namespace cv { namespace gpu { namespace color ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// - __constant__ float cYCrCbCoeffs_f[5]; __constant__ int cYCrCbCoeffs_i[5]; - - template struct RGB2YCrCbConverter + __constant__ float cYCrCbCoeffs_f[5]; + + template + __device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx) { - template - static __device__ void cvt(const T* src, D& dst, int bidx) - { - const int delta = ColorChannel::half() * (1 << yuv_shift); + const int delta = ColorChannel::half() * (1 << yuv_shift); - const int Y = CV_DESCALE(src[0] * cYCrCbCoeffs_i[0] + src[1] * cYCrCbCoeffs_i[1] + src[2] * cYCrCbCoeffs_i[2], yuv_shift); - const int Cr = CV_DESCALE((src[bidx^2] - Y) * cYCrCbCoeffs_i[3] + delta, yuv_shift); - const int Cb = CV_DESCALE((src[bidx] - Y) * cYCrCbCoeffs_i[4] + delta, yuv_shift); + const int Y = CV_DESCALE(src[0] * cYCrCbCoeffs_i[0] + src[1] * cYCrCbCoeffs_i[1] + src[2] * cYCrCbCoeffs_i[2], yuv_shift); + const int Cr = CV_DESCALE((src[bidx^2] - Y) * cYCrCbCoeffs_i[3] + delta, yuv_shift); + const int Cb = CV_DESCALE((src[bidx] - Y) * cYCrCbCoeffs_i[4] + delta, yuv_shift); - dst.x = saturate_cast(Y); - dst.y = saturate_cast(Cr); - dst.z = saturate_cast(Cb); - } - }; - template<> struct RGB2YCrCbConverter + dst.x = saturate_cast(Y); + dst.y = saturate_cast(Cr); + dst.z = saturate_cast(Cb); + } + template + static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx) { - template - static __device__ void cvt(const float* src, D& dst, int bidx) - { - dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2]; - dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel::half(); - dst.z = (src[bidx] - dst.x) * cYCrCbCoeffs_f[4] + ColorChannel::half(); - } - }; - - template - __global__ void RGB2YCrCb(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) - { - typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; - - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows && x < cols) - { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - dst_t dst; - - RGB2YCrCbConverter::cvt(&src.x, dst, bidx); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; - } + dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2]; + dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel::half(); + dst.z = (src[bidx] - dst.x) * cYCrCbCoeffs_f[4] + ColorChannel::half(); } - template struct YCrCb2RGBConvertor + template struct RGB2YCrCbBase { - template - static __device__ void cvt(const T& src, D* dst, int bidx) - { - const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); - const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); - const int r = src.x + CV_DESCALE((src.y - ColorChannel::half()) * cYCrCbCoeffs_i[0], yuv_shift); + typedef int coeff_t; - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); + explicit RGB2YCrCbBase(const coeff_t coeffs[5]) + { + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); } }; - template <> struct YCrCb2RGBConvertor + template<> struct RGB2YCrCbBase { - template - static __device__ void cvt(const T& src, float* dst, int bidx) + typedef float coeff_t; + + explicit RGB2YCrCbBase(const coeff_t coeffs[5]) { - dst[bidx] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[3]; - dst[1] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[1]; - dst[bidx^2] = src.x + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[0]; + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); } }; - - template - __global__ void YCrCb2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB2YCrCb : RGB2YCrCbBase { + typedef typename RGB2YCrCbBase::coeff_t coeff_t; typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase(coeffs), bidx(bidx) {} - if (y < rows && x < cols) + __device__ dst_t operator()(const src_t& src) const + { + dst_t dst; + RGB2YCrCbConvert(&src.x, dst, bidx); + return dst; + } + + private: + int bidx; + }; + + template + __device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx) + { + const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); + const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); + const int r = src.x + CV_DESCALE((src.y - ColorChannel::half()) * cYCrCbCoeffs_i[0], yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + } + template + __device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx) + { + dst[bidx] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[3]; + dst[1] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[1]; + dst[bidx^2] = src.x + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[0]; + } + + template struct YCrCb2RGBBase + { + typedef int coeff_t; + + explicit YCrCb2RGBBase(const coeff_t coeffs[4]) + { + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); + } + }; + template<> struct YCrCb2RGBBase + { + typedef float coeff_t; + + explicit YCrCb2RGBBase(const coeff_t coeffs[4]) + { + cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); + } + }; + template struct YCrCb2RGB : YCrCb2RGBBase + { + typedef typename YCrCb2RGBBase::coeff_t coeff_t; + typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t dst_t; + + YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase(coeffs), bidx(bidx) {} + + __device__ dst_t operator()(const src_t& src) const { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); dst_t dst; - YCrCb2RGBConvertor::cvt(src, &dst.x, bidx); + YCrCb2RGBConvert(src, &dst.x, bidx); setAlpha(dst, ColorChannel::max()); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + + return dst; } - } + + private: + int bidx; + }; template - void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) + void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB2YCrCb<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + typedef typename RGB2YCrCb::coeff_t coeff_t; + RGB2YCrCb cvt(bidx, (const coeff_t*)coeffs); + callConvert(src, dst, cvt, stream); } void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { {RGB2YCrCb_caller, RGB2YCrCb_caller}, {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); - - RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { {RGB2YCrCb_caller, RGB2YCrCb_caller}, {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); - RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { {RGB2YCrCb_caller, RGB2YCrCb_caller}, {RGB2YCrCb_caller, RGB2YCrCb_caller} }; - - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); - RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } template - void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) + void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - YCrCb2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + typedef typename YCrCb2RGB::coeff_t coeff_t; + YCrCb2RGB cvt(bidx, (const coeff_t*)coeffs); + callConvert(src, dst, cvt, stream); } void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { {YCrCb2RGB_caller, YCrCb2RGB_caller}, {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); - - YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { {YCrCb2RGB_caller, YCrCb2RGB_caller}, {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); - - YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream) { - typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); + typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream); static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { {YCrCb2RGB_caller, YCrCb2RGB_caller}, {YCrCb2RGB_caller, YCrCb2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); - - YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream); } ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// - __constant__ float cXYZ_D65f[9]; __constant__ int cXYZ_D65i[9]; + __constant__ float cXYZ_D65f[9]; - template struct RGB2XYZConvertor + template + __device__ void RGB2XYZConvert(const T* src, D& dst) { - template - static __device__ void cvt(const T* src, D& dst) - { - dst.x = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift)); - dst.y = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift)); - dst.z = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift)); - } - }; - template <> struct RGB2XYZConvertor + dst.x = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift)); + dst.y = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift)); + dst.z = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift)); + } + template + __device__ void RGB2XYZConvert(const float* src, D& dst) { - template - static __device__ void cvt(const float* src, D& dst) - { - dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2]; - dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5]; - dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8]; - } - }; - - template - __global__ void RGB2XYZ(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) - { - typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; - - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows && x < cols) - { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - - dst_t dst; - RGB2XYZConvertor::cvt(&src.x, dst); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; - } + dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2]; + dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5]; + dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8]; } - template struct XYZ2RGBConvertor + template struct RGB2XYZBase { - template - static __device__ void cvt(const T& src, D* dst) - { - dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); - dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); - dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); - } - }; - template <> struct XYZ2RGBConvertor - { - template - static __device__ void cvt(const T& src, float* dst) - { - dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; - dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; - dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8]; - } - }; + typedef int coeff_t; - template - __global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) + explicit RGB2XYZBase(const coeff_t coeffs[9]) + { + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + } + }; + template <> struct RGB2XYZBase { + typedef float coeff_t; + + explicit RGB2XYZBase(const coeff_t coeffs[9]) + { + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + } + }; + template struct RGB2XYZ : RGB2XYZBase + { + typedef typename RGB2XYZBase::coeff_t coeff_t; typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase(coeffs) {} - if (y < rows && x < cols) + __device__ dst_t operator()(const src_t& src) const { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - dst_t dst; - XYZ2RGBConvertor::cvt(src, &dst.x); + RGB2XYZConvert(&src.x, dst); + return dst; + } + }; + + template + __device__ void XYZ2RGBConvert(const T& src, D* dst) + { + dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); + dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); + dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); + } + template + __device__ void XYZ2RGBConvert(const T& src, float* dst) + { + dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; + dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; + dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8]; + } + + template struct XYZ2RGBBase + { + typedef int coeff_t; + + explicit XYZ2RGBBase(const coeff_t coeffs[9]) + { + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); + } + }; + template <> struct XYZ2RGBBase + { + typedef float coeff_t; + + explicit XYZ2RGBBase(const coeff_t coeffs[9]) + { + cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); + } + }; + template struct XYZ2RGB : XYZ2RGBBase + { + typedef typename RGB2XYZBase::coeff_t coeff_t; + typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t dst_t; + + explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase(coeffs) {} + + __device__ dst_t operator()(const src_t& src) const + { + dst_t dst; + XYZ2RGBConvert(src, &dst.x); setAlpha(dst, ColorChannel::max()); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + return dst; } - } + }; template - void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) + void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - RGB2XYZ<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + typedef typename RGB2XYZ::coeff_t coeff_t; + RGB2XYZ cvt((const coeff_t*)coeffs); + callConvert(src, dst, cvt, stream); } void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = { {RGB2XYZ_caller, RGB2XYZ_caller}, {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - - RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = { {RGB2XYZ_caller, RGB2XYZ_caller}, {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - - RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = { {RGB2XYZ_caller, RGB2XYZ_caller}, {RGB2XYZ_caller, RGB2XYZ_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); - - RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } template - void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) + void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - XYZ2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + typedef typename XYZ2RGB::coeff_t coeff_t; + XYZ2RGB cvt((const coeff_t*)coeffs); + callConvert(src, dst, cvt, stream); } void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = { {XYZ2RGB_caller, XYZ2RGB_caller}, {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - - XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = { {XYZ2RGB_caller, XYZ2RGB_caller}, {XYZ2RGB_caller, XYZ2RGB_caller} }; - - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream) { - typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); + typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream); static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = { {XYZ2RGB_caller, XYZ2RGB_caller}, {XYZ2RGB_caller, XYZ2RGB_caller} }; - cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) ); - - XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream); } ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// - __constant__ int cHsvDivTable[256]; - - template struct RGB2HSVConvertor; - template struct RGB2HSVConvertor + __constant__ int cHsvDivTable[256] = { - template - static __device__ void cvt(const uchar* src, D& dst, int bidx) - { - const int hsv_shift = 12; - const int hscale = HR == 180 ? 15 : 21; - - int b = src[bidx], g = src[1], r = src[bidx^2]; - int h, s, v = b; - int vmin = b, diff; - int vr, vg; - - v = max(v, g); - v = max(v, r); - vmin = min(vmin, g); - vmin = min(vmin, r); - - diff = v - vmin; - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - s = diff * cHsvDivTable[v] >> hsv_shift; - h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * cHsvDivTable[diff] * hscale + (1 << (hsv_shift + 6))) >> (7 + hsv_shift); - h += h < 0 ? HR : 0; - - dst.x = (uchar)h; - dst.y = (uchar)s; - dst.z = (uchar)v; - } + 0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, + 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, + 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, + 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, + 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, + 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, + 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, + 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, + 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, + 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, + 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, + 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, + 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, + 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, + 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, + 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, + 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, + 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, + 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, + 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, + 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, + 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, + 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, + 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, + 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, + 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, + 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, + 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, + 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, + 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, + 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, + 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 }; - template struct RGB2HSVConvertor + + template + __device__ void RGB2HSVConvert(const uchar* src, D& dst, int bidx) { - template - static __device__ void cvt(const float* src, D& dst, int bidx) - { - const float hscale = HR * (1.f / 360.f); + const int hsv_shift = 12; + const int hscale = HR == 180 ? 15 : 21; - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h, s, v; + int b = src[bidx], g = src[1], r = src[bidx^2]; + int h, s, v = b; + int vmin = b, diff; + int vr, vg; - float vmin, diff; + v = max(v, g); + v = max(v, r); + vmin = min(vmin, g); + vmin = min(vmin, r); - v = vmin = r; - v = fmax(v, g); - v = fmax(v, b); - vmin = fmin(vmin, g); - vmin = fmin(vmin, b); + diff = v - vmin; + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; - diff = v - vmin; - s = diff / (float)(fabs(v) + numeric_limits_gpu::epsilon()); - diff = (float)(60. / (diff + numeric_limits_gpu::epsilon())); + s = diff * cHsvDivTable[v] >> hsv_shift; + h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = (h * cHsvDivTable[diff] * hscale + (1 << (hsv_shift + 6))) >> (7 + hsv_shift); + h += h < 0 ? HR : 0; - if (v == r) - h = (g - b) * diff; - else if (v == g) - h = (b - r) * diff + 120.f; - else - h = (r - g) * diff + 240.f; - - if (h < 0) h += 360.f; - - dst.x = h * hscale; - dst.y = s; - dst.z = v; - } - }; - - template - __global__ void RGB2HSV(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + dst.x = (uchar)h; + dst.y = (uchar)s; + dst.z = (uchar)v; + } + template + __device__ void RGB2HSVConvert(const float* src, D& dst, int bidx) { - typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; + const float hscale = HR * (1.f / 360.f); - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + float b = src[bidx], g = src[1], r = src[bidx^2]; + float h, s, v; - if (y < rows && x < cols) - { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); + float vmin, diff; - dst_t dst; - RGB2HSVConvertor::cvt(&src.x, dst, bidx); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; - } + v = vmin = r; + v = fmax(v, g); + v = fmax(v, b); + vmin = fmin(vmin, g); + vmin = fmin(vmin, b); + + diff = v - vmin; + s = diff / (float)(fabs(v) + numeric_limits_gpu::epsilon()); + diff = (float)(60. / (diff + numeric_limits_gpu::epsilon())); + + if (v == r) + h = (g - b) * diff; + else if (v == g) + h = (b - r) * diff + 120.f; + else + h = (r - g) * diff + 240.f; + + if (h < 0) h += 360.f; + + dst.x = h * hscale; + dst.y = s; + dst.z = v; } - __constant__ int cHsvSectorData[6][3]; - - template struct HSV2RGBConvertor; - template struct HSV2RGBConvertor - { - template - static __device__ void cvt(const T& src, float* dst, int bidx) - { - const float hscale = 6.f / HR; - - float h = src.x, s = src.y, v = src.z; - float b, g, r; - - if( s == 0 ) - b = g = r = v; - else - { - float tab[4]; - int sector; - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - sector = __float2int_rd(h); - h -= sector; - - tab[0] = v; - tab[1] = v*(1.f - s); - tab[2] = v*(1.f - s*h); - tab[3] = v*(1.f - s*(1.f - h)); - - b = tab[cHsvSectorData[sector][0]]; - g = tab[cHsvSectorData[sector][1]]; - r = tab[cHsvSectorData[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; - } - }; - template struct HSV2RGBConvertor - { - template - static __device__ void cvt(const T& src, uchar* dst, int bidx) - { - float3 buf; - - buf.x = src.x; - buf.y = src.y * (1.f/255.f); - buf.z = src.z * (1.f/255.f); - - HSV2RGBConvertor::cvt(buf, &buf.x, bidx); - - dst[0] = saturate_cast(buf.x * 255.f); - dst[1] = saturate_cast(buf.y * 255.f); - dst[2] = saturate_cast(buf.z * 255.f); - } - }; - - template - __global__ void HSV2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB2HSV { typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2HSV(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ dst_t operator()(const src_t& src) const { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - dst_t dst; - HSV2RGBConvertor::cvt(src, &dst.x, bidx); + RGB2HSVConvert
(&src.x, dst, bidx); + return dst; + } + + private: + int bidx; + }; + + __constant__ int cHsvSectorData[6][3] = + { + {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} + }; + + template + __device__ void HSV2RGBConvert(const T& src, float* dst, int bidx) + { + const float hscale = 6.f / HR; + + float h = src.x, s = src.y, v = src.z; + float b, g, r; + + if( s == 0 ) + b = g = r = v; + else + { + float tab[4]; + int sector; + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + sector = __float2int_rd(h); + h -= sector; + + tab[0] = v; + tab[1] = v*(1.f - s); + tab[2] = v*(1.f - s*h); + tab[3] = v*(1.f - s*(1.f - h)); + + b = tab[cHsvSectorData[sector][0]]; + g = tab[cHsvSectorData[sector][1]]; + r = tab[cHsvSectorData[sector][2]]; + } + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; + } + template + __device__ void HSV2RGBConvert(const T& src, uchar* dst, int bidx) + { + float3 buf; + + buf.x = src.x; + buf.y = src.y * (1.f/255.f); + buf.z = src.z * (1.f/255.f); + + HSV2RGBConvert
(buf, &buf.x, bidx); + + dst[0] = saturate_cast(buf.x * 255.f); + dst[1] = saturate_cast(buf.y * 255.f); + dst[2] = saturate_cast(buf.z * 255.f); + } + + template struct HSV2RGB + { + typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t dst_t; + + explicit HSV2RGB(int bidx) : bidx(bidx) {} + + __device__ dst_t operator()(const src_t& src) const + { + dst_t dst; + HSV2RGBConvert
(src, &dst.x, bidx); setAlpha(dst, ColorChannel::max()); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + return dst; } - } + + private: + int bidx; + }; template void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - if (hrange == 180) - RGB2HSV<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); + { + RGB2HSV cvt(bidx); + callConvert(src, dst, cvt, stream); + } else - RGB2HSV<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + { + RGB2HSV cvt(bidx); + callConvert(src, dst, cvt, stream); + } } void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream) @@ -1222,43 +1130,6 @@ namespace cv { namespace gpu { namespace color {RGB2HSV_caller, RGB2HSV_caller} }; - static const int div_table[] = - { - 0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, - 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, - 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, - 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, - 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, - 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, - 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, - 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, - 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, - 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, - 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, - 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, - 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, - 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, - 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, - 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, - 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, - 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, - 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, - 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, - 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, - 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, - 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, - 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, - 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, - 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, - 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, - 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, - 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, - 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, - 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, - 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 - }; - cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) ); - RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1273,28 +1144,20 @@ namespace cv { namespace gpu { namespace color RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } - template void HSV2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - if (hrange == 180) - HSV2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); + { + HSV2RGB cvt(bidx); + callConvert(src, dst, cvt, stream); + } else - HSV2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + { + HSV2RGB cvt(bidx); + callConvert(src, dst, cvt, stream); + } } void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream) @@ -1306,11 +1169,6 @@ namespace cv { namespace gpu { namespace color {HSV2RGB_caller, HSV2RGB_caller} }; - static const int sector_data[][3] = - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - - cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) ); - HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1323,202 +1181,177 @@ namespace cv { namespace gpu { namespace color {HSV2RGB_caller, HSV2RGB_caller} }; - static const int sector_data[][3] = - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - - cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) ); - HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } /////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// - template struct RGB2HLSConvertor; - template struct RGB2HLSConvertor + template + __device__ void RGB2HLSConvert(const float* src, D& dst, int bidx) { - template - static __device__ void cvt(const float* src, D& dst, int bidx) + const float hscale = HR * (1.f/360.f); + + float b = src[bidx], g = src[1], r = src[bidx^2]; + float h = 0.f, s = 0.f, l; + float vmin, vmax, diff; + + vmax = vmin = r; + vmax = fmax(vmax, g); + vmax = fmax(vmax, b); + vmin = fmin(vmin, g); + vmin = fmin(vmin, b); + + diff = vmax - vmin; + l = (vmax + vmin) * 0.5f; + + if (diff > numeric_limits_gpu::epsilon()) { - const float hscale = HR * (1.f/360.f); + s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin); + diff = 60.f / diff; - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h = 0.f, s = 0.f, l; - float vmin, vmax, diff; - - vmax = vmin = r; - vmax = fmax(vmax, g); - vmax = fmax(vmax, b); - vmin = fmin(vmin, g); - vmin = fmin(vmin, b); - - diff = vmax - vmin; - l = (vmax + vmin) * 0.5f; - - if (diff > numeric_limits_gpu::epsilon()) - { - s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin); - diff = 60.f / diff; - - if (vmax == r) - h = (g - b)*diff; - else if (vmax == g) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if (h < 0.f) h += 360.f; - } - - dst.x = h * hscale; - dst.y = l; - dst.z = s; - } - }; - template struct RGB2HLSConvertor - { - template - static __device__ void cvt(const uchar* src, D& dst, int bidx) - { - float3 buf; - - buf.x = src[0]*(1.f/255.f); - buf.y = src[1]*(1.f/255.f); - buf.z = src[2]*(1.f/255.f); - - RGB2HLSConvertor::cvt(&buf.x, buf, bidx); - - dst.x = saturate_cast(buf.x); - dst.y = saturate_cast(buf.y*255.f); - dst.z = saturate_cast(buf.z*255.f); - } - }; - - template - __global__ void RGB2HLS(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) - { - typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; - - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows && x < cols) - { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - - dst_t dst; - RGB2HLSConvertor::cvt(&src.x, dst, bidx); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; - } - } - - __constant__ int cHlsSectorData[6][3]; - - template struct HLS2RGBConvertor; - template struct HLS2RGBConvertor - { - template - static __device__ void cvt(const T& src, float* dst, int bidx) - { - const float hscale = 6.0f / HR; - - float h = src.x, l = src.y, s = src.z; - float b, g, r; - - if (s == 0) - b = g = r = l; + if (vmax == r) + h = (g - b)*diff; + else if (vmax == g) + h = (b - r)*diff + 120.f; else - { - float tab[4]; - int sector; + h = (r - g)*diff + 240.f; - float p2 = l <= 0.5f ? l * (1 + s) : l + s - l * s; - float p1 = 2 * l - p2; - - h *= hscale; - - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - - sector = __float2int_rd(h); - h -= sector; - - tab[0] = p2; - tab[1] = p1; - tab[2] = p1 + (p2 - p1) * (1 - h); - tab[3] = p1 + (p2 - p1) * h; - - b = tab[cHlsSectorData[sector][0]]; - g = tab[cHlsSectorData[sector][1]]; - r = tab[cHlsSectorData[sector][2]]; - } - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; + if (h < 0.f) h += 360.f; } - }; - template struct HLS2RGBConvertor + + dst.x = h * hscale; + dst.y = l; + dst.z = s; + } + template + __device__ void RGB2HLSConvert(const uchar* src, D& dst, int bidx) { - template - static __device__ void cvt(const T& src, uchar* dst, int bidx) - { - float3 buf; + float3 buf; - buf.x = src.x; - buf.y = src.y*(1.f/255.f); - buf.z = src.z*(1.f/255.f); + buf.x = src[0]*(1.f/255.f); + buf.y = src[1]*(1.f/255.f); + buf.z = src[2]*(1.f/255.f); - HLS2RGBConvertor::cvt(buf, &buf.x, bidx); + RGB2HLSConvert
(&buf.x, buf, bidx); - dst[0] = saturate_cast(buf.x*255.f); - dst[1] = saturate_cast(buf.y*255.f); - dst[2] = saturate_cast(buf.z*255.f); - } - }; + dst.x = saturate_cast(buf.x); + dst.y = saturate_cast(buf.y*255.f); + dst.z = saturate_cast(buf.z*255.f); + } - template - __global__ void HLS2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) + template struct RGB2HLS { typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + explicit RGB2HLS(int bidx) : bidx(bidx) {} - if (y < rows && x < cols) + __device__ dst_t operator()(const src_t& src) const { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - dst_t dst; - HLS2RGBConvertor::cvt(src, &dst.x, bidx); - setAlpha(dst, ColorChannel::max()); - - *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + RGB2HLSConvert
(&src.x, dst, bidx); + return dst; } + + private: + int bidx; + }; + + __constant__ int cHlsSectorData[6][3] = + { + {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} + }; + + template + __device__ void HLS2RGBConvert(const T& src, float* dst, int bidx) + { + const float hscale = 6.0f / HR; + + float h = src.x, l = src.y, s = src.z; + float b, g, r; + + if (s == 0) + b = g = r = l; + else + { + float tab[4]; + int sector; + + float p2 = l <= 0.5f ? l * (1 + s) : l + s - l * s; + float p1 = 2 * l - p2; + + h *= hscale; + + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + + sector = __float2int_rd(h); + h -= sector; + + tab[0] = p2; + tab[1] = p1; + tab[2] = p1 + (p2 - p1) * (1 - h); + tab[3] = p1 + (p2 - p1) * h; + + b = tab[cHlsSectorData[sector][0]]; + g = tab[cHlsSectorData[sector][1]]; + r = tab[cHlsSectorData[sector][2]]; + } + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; } + template + __device__ void HLS2RGBConvert(const T& src, uchar* dst, int bidx) + { + float3 buf; + + buf.x = src.x; + buf.y = src.y*(1.f/255.f); + buf.z = src.z*(1.f/255.f); + + HLS2RGBConvert
(buf, &buf.x, bidx); + + dst[0] = saturate_cast(buf.x*255.f); + dst[1] = saturate_cast(buf.y*255.f); + dst[2] = saturate_cast(buf.z*255.f); + } + + template struct HLS2RGB + { + typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t dst_t; + + explicit HLS2RGB(int bidx) : bidx(bidx) {} + + __device__ dst_t operator()(const src_t& src) const + { + dst_t dst; + HLS2RGBConvert
(src, &dst.x, bidx); + setAlpha(dst, ColorChannel::max()); + return dst; + } + + private: + int bidx; + }; template void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - if (hrange == 180) - RGB2HLS<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); + { + RGB2HLS cvt(bidx); + callConvert(src, dst, cvt, stream); + } else - RGB2HLS<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + { + RGB2HLS cvt(bidx); + callConvert(src, dst, cvt, stream); + } } void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream) @@ -1549,23 +1382,16 @@ namespace cv { namespace gpu { namespace color template void HLS2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - if (hrange == 180) - HLS2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); + { + HLS2RGB cvt(bidx); + callConvert(src, dst, cvt, stream); + } else - HLS2RGB<<>>(src.data, src.step, - dst.data, dst.step, src.rows, src.cols, bidx); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + { + HLS2RGB cvt(bidx); + callConvert(src, dst, cvt, stream); + } } void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream) @@ -1576,11 +1402,6 @@ namespace cv { namespace gpu { namespace color {HLS2RGB_caller, HLS2RGB_caller}, {HLS2RGB_caller, HLS2RGB_caller} }; - - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - - cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } @@ -1593,11 +1414,6 @@ namespace cv { namespace gpu { namespace color {HLS2RGB_caller, HLS2RGB_caller}, {HLS2RGB_caller, HLS2RGB_caller} }; - - static const int sector_data[][3]= - {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; - - cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) ); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); } diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp index 5561fe3430..d9e4aa8178 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -190,6 +190,9 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) { + CV_Assert((src.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream); static const set_caller_t set_callers[] = { @@ -201,6 +204,11 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val) void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) { + CV_Assert((src.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + + CV_Assert(mask.type() == CV_8UC1); + typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream); static const set_caller_t set_callers[] = { @@ -212,6 +220,9 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask) void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta) { + CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + bool noScale = fabs(alpha-1) < std::numeric_limits::epsilon() && fabs(beta) < std::numeric_limits::epsilon(); if( rtype < 0 ) diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 5507b4e7c6..4b2737fa1a 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -625,7 +625,11 @@ namespace } void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) -{ +{ + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -637,6 +641,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) { + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -648,6 +656,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) { + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -659,6 +670,9 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) { + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -670,6 +684,10 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) { + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -681,6 +699,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) { + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -692,6 +714,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) { + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -703,6 +728,9 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) { + CV_Assert((src1.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream); static const func_t funcs[] = { @@ -749,6 +777,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double } else { + CV_Assert((src.depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream); diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp index 42901a61c3..b4018aeebd 100644 --- a/modules/gpu/src/matrix_operations.cpp +++ b/modules/gpu/src/matrix_operations.cpp @@ -205,6 +205,9 @@ namespace void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const { + CV_Assert((depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + bool noScale = fabs(alpha-1) < std::numeric_limits::epsilon() && fabs(beta) < std::numeric_limits::epsilon(); if( rtype < 0 ) @@ -428,6 +431,9 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask) { CV_Assert(mask.type() == CV_8UC1); + CV_Assert((depth() != CV_64F) || + (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))); + CV_DbgAssert(!this->empty()); NppiSize sz; diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index b23065affe..cb581ba507 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -393,11 +393,37 @@ namespace cv } }; + template struct UseSmartUn_ + { + static const bool value = false; + }; + template struct UseSmartUn_ + { + static const bool value = device::UnReadWriteTraits::shift != 1; + }; + template struct UseSmartUn + { + static const bool value = UseSmartUn_::cn, device::VecTraits::cn>::value; + }; + + template struct UseSmartBin_ + { + static const bool value = false; + }; + template struct UseSmartBin_ + { + static const bool value = device::BinReadWriteTraits::shift != 1; + }; + template struct UseSmartBin + { + static const bool value = UseSmartBin_::cn, device::VecTraits::cn, device::VecTraits::cn>::value; + }; + template static void transform_caller(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, const Mask& mask, cudaStream_t stream = 0) { - TransformDispatcher::cn == 1 && device::VecTraits::cn == 1 && device::UnReadWriteTraits::shift != 1>::call(src, dst, op, mask, stream); + TransformDispatcher< UseSmartUn::value >::call(src, dst, op, mask, stream); } template @@ -416,7 +442,7 @@ namespace cv static void transform_caller(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, BinOp op, const Mask& mask, cudaStream_t stream = 0) { - TransformDispatcher::cn == 1 && device::VecTraits::cn == 1 && device::VecTraits::cn == 1 && device::BinReadWriteTraits::shift != 1>::call(src1, src2, dst, op, mask, stream); + TransformDispatcher< UseSmartBin::value >::call(src1, src2, dst, op, mask, stream); } template diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 8f9d89b6f2..933c2e2800 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -680,4 +680,67 @@ TEST(erode) gpu::erode(d_src, d_dst, ker); GPU_OFF; } +} + +TEST(threshold) +{ + Mat src, dst; + gpu::GpuMat d_src, d_dst; + + for (int size = 2000; size <= 4000; size += 1000) + { + SUBTEST << "size " << size << ", 8U, THRESH_TRUNC"; + + gen(src, size, size, CV_8U, 0, 100); + dst.create(size, size, CV_8U); + + CPU_ON; + threshold(src, dst, 50.0, 0.0, THRESH_TRUNC); + CPU_OFF; + + d_src = src; + d_dst.create(size, size, CV_8U); + + GPU_ON; + gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC); + GPU_OFF; + } + + for (int size = 2000; size <= 4000; size += 1000) + { + SUBTEST << "size " << size << ", 8U, THRESH_BINARY"; + + gen(src, size, size, CV_8U, 0, 100); + dst.create(size, size, CV_8U); + + CPU_ON; + threshold(src, dst, 50.0, 0.0, THRESH_BINARY); + CPU_OFF; + + d_src = src; + d_dst.create(size, size, CV_8U); + + GPU_ON; + gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY); + GPU_OFF; + } + + for (int size = 2000; size <= 4000; size += 1000) + { + SUBTEST << "size " << size << ", 32F, THRESH_TRUNC"; + + gen(src, size, size, CV_32F, 0, 100); + dst.create(size, size, CV_32F); + + CPU_ON; + threshold(src, dst, 50.0, 0.0, THRESH_TRUNC); + CPU_OFF; + + d_src = src; + d_dst.create(size, size, CV_32F); + + GPU_ON; + gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC); + GPU_OFF; + } } \ No newline at end of file diff --git a/tests/gpu/src/brute_force_matcher.cpp b/tests/gpu/src/brute_force_matcher.cpp index 341d16a68d..07c48272d3 100644 --- a/tests/gpu/src/brute_force_matcher.cpp +++ b/tests/gpu/src/brute_force_matcher.cpp @@ -384,7 +384,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train ) { - bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS); + bool atomics_ok = TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS); if (!atomics_ok) { ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)"); diff --git a/tests/gpu/src/meanshift.cpp b/tests/gpu/src/meanshift.cpp index 7720e3e092..4da291f175 100644 --- a/tests/gpu/src/meanshift.cpp +++ b/tests/gpu/src/meanshift.cpp @@ -53,7 +53,7 @@ struct CV_GpuMeanShiftTest : public CvTest void run(int) { - bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12); + bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); if (!cc12_ok) { ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required"); @@ -67,8 +67,8 @@ struct CV_GpuMeanShiftTest : public CvTest cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "meanshift/cones.png"); cv::Mat img_template; - if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) && - cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20)) + if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) && + cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20)) img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result.png"); else img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result_CC1X.png"); @@ -145,7 +145,7 @@ struct CV_GpuMeanShiftProcTest : public CvTest void run(int) { - bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12); + bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); if (!cc12_ok) { ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required"); @@ -219,8 +219,8 @@ struct CV_GpuMeanShiftProcTest : public CvTest cv::Mat spmap_template; cv::FileStorage fs; - if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) && - cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20)) + if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) && + cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20)) fs.open(std::string(ts->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ); else fs.open(std::string(ts->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ); diff --git a/tests/gpu/src/mssegmentation.cpp b/tests/gpu/src/mssegmentation.cpp index c87feb8bce..73e229cd8f 100644 --- a/tests/gpu/src/mssegmentation.cpp +++ b/tests/gpu/src/mssegmentation.cpp @@ -54,7 +54,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest { { try { - bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12); + bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12); if (!cc12_ok) { ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required"); @@ -77,7 +77,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest { { stringstream path; path << ts->get_data_path() << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize; - if (TargetArchs::builtWith(COMPUTE_20) && DeviceInfo().supports(COMPUTE_20)) + if (TargetArchs::builtWith(FEATURE_SET_COMPUTE_20) && DeviceInfo().supports(FEATURE_SET_COMPUTE_20)) path << ".png"; else path << "_CC1X.png"; diff --git a/tests/gpu/src/operator_convert_to.cpp b/tests/gpu/src/operator_convert_to.cpp index 8368f24a98..2901b3620f 100644 --- a/tests/gpu/src/operator_convert_to.cpp +++ b/tests/gpu/src/operator_convert_to.cpp @@ -66,21 +66,24 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */) { const Size img_size(67, 35); - const int types[] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F}; - const int types_num = sizeof(types) / sizeof(int); const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"}; bool passed = true; try { - for (int i = 0; i < types_num && passed; ++i) + int lastType = CV_32F; + + if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)) + lastType = CV_64F; + + for (int i = 0; i <= lastType && passed; ++i) { - for (int j = 0; j < types_num && passed; ++j) + for (int j = 0; j <= lastType && passed; ++j) { for (int c = 1; c < 5 && passed; ++c) { - const int src_type = CV_MAKETYPE(types[i], c); - const int dst_type = types[j]; + const int src_type = CV_MAKETYPE(i, c); + const int dst_type = j; cv::RNG rng(*ts->get_rng()); diff --git a/tests/gpu/src/operator_copy_to.cpp b/tests/gpu/src/operator_copy_to.cpp index 21ce7eb536..639b573470 100644 --- a/tests/gpu/src/operator_copy_to.cpp +++ b/tests/gpu/src/operator_copy_to.cpp @@ -126,7 +126,12 @@ void CV_GpuMatOpCopyToTest::run( int /* start_from */) try { - for (int i = 0 ; i < 7; i++) + int lastType = CV_32F; + + if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)) + lastType = CV_64F; + + for (int i = 0 ; i <= lastType; i++) { Mat cpumat(rows, cols, i); cpumat.setTo(Scalar::all(127)); diff --git a/tests/gpu/src/operator_set_to.cpp b/tests/gpu/src/operator_set_to.cpp index 97a281b660..ad3dffd175 100644 --- a/tests/gpu/src/operator_set_to.cpp +++ b/tests/gpu/src/operator_set_to.cpp @@ -101,7 +101,12 @@ void CV_GpuMatOpSetToTest::run( int /* start_from */) rng.fill(cpumask, RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar(1.5)); cv::gpu::GpuMat gpumask(cpumask); - for (int i = 0; i < 7; i++) + int lastType = CV_32F; + + if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)) + lastType = CV_64F; + + for (int i = 0; i <= lastType; i++) { for (int cn = 1; cn <= 4; ++cn) {