diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
index 81117d6786..f3ee64a9b6 100644
--- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+++ b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
@@ -76,7 +76,7 @@ inline GpuMat::GpuMat(int _rows, int _cols, int _type, const Scalar& _s)
         *this = _s;
     }
 }
-
+
 inline GpuMat::GpuMat(Size _size, int _type, const Scalar& _s)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
 {
@@ -85,7 +85,7 @@ inline GpuMat::GpuMat(Size _size, int _type, const Scalar& _s)
         create( _size.height, _size.width, _type );
         *this = _s;
     }
-}
+}
 
 inline GpuMat::GpuMat(const GpuMat& m)
     : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
@@ -182,8 +182,8 @@ inline GpuMat::GpuMat(const GpuMat& m, const Rect& roi)
     if( rows <= 0 || cols <= 0 )
         rows = cols = 0;
 }
-
-inline GpuMat::GpuMat(const Mat& m)
+
+inline GpuMat::GpuMat(const Mat& m)
     : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) { upload(m); }
 
 inline GpuMat::~GpuMat() { release(); }
@@ -217,7 +217,7 @@ template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2
     return m;
 }
 
-//CPP void GpuMat::download(cv::Mat& m) const;
+//CPP void GpuMat::download(cv::Mat& m) const;
 
 inline GpuMat GpuMat::row(int y) const { return GpuMat(*this, Range(y, y+1), Range::all()); }
 inline GpuMat GpuMat::col(int x) const { return GpuMat(*this, Range::all(), Range(x, x+1)); }
@@ -252,7 +252,7 @@ inline void GpuMat::create(Size _size, int _type) { create(_size.height, _size.w
 //CPP void GpuMat::create(int _rows, int _cols, int _type);
 //CPP void GpuMat::release();
 
-inline void GpuMat::swap(GpuMat& b)
+inline void GpuMat::swap(GpuMat& b)
 {
     std::swap( flags, b.flags ); std::swap( rows, b.rows ); std::swap( cols, b.cols );
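The hunks above are mostly trailing-whitespace cleanup; the functional point is that `GpuMat(const Mat&)` routes through `upload()`, so the host-to-device transfer is implicit in construction. A minimal round trip against this API, as a sketch (the `opencv2/gpu/gpu.hpp` include path and the image file name are illustrative assumptions, not taken from this patch):

```cpp
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>   // assumed public header of the gpu module

int main()
{
    cv::Mat host = cv::imread("input.png", 0);         // any 8UC1 image

    cv::gpu::GpuMat dev(host);                         // implicit upload via the ctor above
    cv::gpu::GpuMat roi(dev, cv::Rect(0, 0, 16, 16));  // header-only view, shares the refcount

    cv::Mat back;
    dev.download(back);                                // explicit device -> host copy
    return 0;
}
```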
@@ -342,27 +342,27 @@ static inline void swap( GpuMat& a, GpuMat& b ) { a.swap(b); }
 //////////////////////////////// MatPL ////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 
-MatPL::MatPL() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) {}
-MatPL::MatPL(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline MatPL::MatPL() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) {}
+inline MatPL::MatPL(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
 {
     if( _rows > 0 && _cols > 0 )
         create( _rows, _cols, _type );
 }
 
-MatPL::MatPL(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline MatPL::MatPL(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
 {
     if( _size.height > 0 && _size.width > 0 )
         create( _size.height, _size.width, _type );
 }
 
-MatPL::MatPL(const MatPL& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(0), dataend(0)
+inline MatPL::MatPL(const MatPL& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(0), dataend(0)
 {
     if( refcount )
         CV_XADD(refcount, 1);
 }
 
-MatPL::MatPL(const Mat& m) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline MatPL::MatPL(const Mat& m) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
 {
     if( m.rows > 0 && m.cols > 0 )
         create( m.size(), m.type() );
@@ -371,11 +371,11 @@ MatPL::MatPL(const Mat& m) : flags(0), rows(0), cols(0), step(0), data(0), refco
     m.copyTo(tmp);
 }
 
-MatPL::~MatPL()
+inline MatPL::~MatPL()
 {
     release();
 }
 
-MatPL& MatPL::operator = (const MatPL& m)
+inline MatPL& MatPL::operator = (const MatPL& m)
 {
     if( this != &m )
     {
@@ -384,7 +384,7 @@ MatPL& MatPL::operator = (const MatPL& m)
         release();
         flags = m.flags;
         rows = m.rows; cols = m.cols;
-        step = m.step; data = m.data;
+        step = m.step; data = m.data;
         datastart = m.datastart;
         dataend = m.dataend;
         refcount = m.refcount;
@@ -392,17 +392,17 @@ MatPL& MatPL::operator = (const MatPL& m)
     return *this;
 }
 
-MatPL MatPL::clone() const
+inline MatPL MatPL::clone() const
 {
-    MatPL m(size(), type());
+    MatPL m(size(), type());
     Mat to = m;
     Mat from = *this;
     from.copyTo(to);
     return m;
 }
 
-inline void MatPL::create(Size _size, int _type) { create(_size.height, _size.width, _type); }
-//CCP void MatPL::create(int _rows, int _cols, int _type);
+inline void MatPL::create(Size _size, int _type) { create(_size.height, _size.width, _type); }
+//CPP void MatPL::create(int _rows, int _cols, int _type);
 //CPP void MatPL::release();
 
 inline Mat MatPL::createMatHeader() const { return Mat(size(), type(), data); }
@@ -416,11 +416,11 @@ inline int MatPL::depth() const { return CV_MAT_DEPTH(flags); }
 inline int MatPL::channels() const { return CV_MAT_CN(flags); }
 inline size_t MatPL::step1() const { return step/elemSize1(); }
 inline Size MatPL::size() const { return Size(cols, rows); }
-inline bool MatPL::empty() const { return data == 0; }
+inline bool MatPL::empty() const { return data == 0; }
 
 } /* end of namespace gpu */
 
 } /* end of namespace cv */
 
-#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */
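`MatPL` is the page-locked (pinned) host matrix, backed by `cudaHostAlloc` in matrix_operations.cpp below, and `createMatHeader()` exposes the pinned buffer to the ordinary CPU API without a copy. A usage sketch under those assumptions (include path again illustrative):

```cpp
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>   // assumed public header of the gpu module

void preprocess_pinned(const cv::Mat& frame)
{
    cv::gpu::MatPL pinned(frame);                // copies 'frame' into page-locked memory

    cv::Mat header = pinned.createMatHeader();   // plain Mat header over the pinned buffer
    cv::GaussianBlur(header, header, cv::Size(3, 3), 0);  // CPU code sees a normal Mat

    // 'pinned' is now ready for an asynchronous upload (see cudastream.cpp below)
}
```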
diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index 917d4508b0..23c4c0021f 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -44,23 +44,26 @@
 #define __OPENCV_CUDA_SHARED_HPP__
 
 #include "opencv2/gpu/devmem2d.hpp"
-#include "cuda_runtime_api.h"
+#include "cuda_runtime_api.h"
 
 namespace cv
 {
     namespace gpu
-    {
+    {
         typedef unsigned char uchar;
         typedef unsigned short ushort;
-        typedef unsigned int uint;
+        typedef unsigned int uint;
 
         extern "C" void error( const char *error_string, const char *file, const int line, const char *func = "");
 
         namespace impl
-        {
+        {
             static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
 
             extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, DevMem2D& disp, int maxdisp, DevMem2D_<uint>& minSSD_buf);
+
+            extern "C" void set_to_without_mask (const DevMem2D& mat, const double * scalar, int depth, int channels);
+            extern "C" void set_to_with_mask    (const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int depth, int channels);
         }
     }
 }
@@ -68,12 +71,12 @@ namespace cv
 #if defined(__GNUC__)
     #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__);
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
 #endif
 
 static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
 {
-    if( cudaSuccess != err)
+    if( cudaSuccess != err)
         cv::gpu::error(cudaGetErrorString(err), __FILE__, __LINE__, func);
 }
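`divUp` is integer ceiling division, used to size CUDA grids so the last partial block is still launched. A branchless form is equivalent for non-negative `a` and positive `b`; a small sketch for comparison (not part of the patch):

```cpp
// Equivalent to divUp above for a >= 0, b > 0: computes ceil(a / b) without a branch.
static inline int divUpNoBranch(int a, int b) { return (a + b - 1) / b; }

// Sizing a grid for 1000 columns with 256-thread blocks:
//   divUp(1000, 256) == divUpNoBranch(1000, 256) == 4
```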
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
new file mode 100644
index 0000000000..2db555e637
--- /dev/null
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <stddef.h>
+#include "cuda_shared.hpp"
+#include "cuda_runtime.h"
+
+__constant__ float scalar_d[4];
+
+namespace mat_operators
+{
+    template <typename T, int channels, int count = channels>
+    struct unroll
+    {
+        __device__ static void unroll_set(T * mat, size_t i)
+        {
+            mat[i] = static_cast<T>(scalar_d[i % channels]);
+            unroll<T, channels, count - 1>::unroll_set(mat, i+1);
+        }
+
+        __device__ static void unroll_set_with_mask(T * mat, float mask, size_t i)
+        {
+            mat[i] = mask * static_cast<T>(scalar_d[i % channels]);
+            unroll<T, channels, count - 1>::unroll_set_with_mask(mat, mask, i+1);
+        }
+    };
+
+    template <typename T, int channels>
+    struct unroll<T, channels, 0>
+    {
+        __device__ static void unroll_set(T * , size_t){}
+        __device__ static void unroll_set_with_mask(T * , float, size_t){}
+    };
+
+    template <typename T, int channels>
+    __global__ void kernel_set_to_without_mask(T * mat)
+    {
+        size_t i = (blockIdx.x * blockDim.x + threadIdx.x) * sizeof(T);
+        unroll<T, channels>::unroll_set(mat, i);
+    }
+
+    template <typename T, int channels>
+    __global__ void kernel_set_to_with_mask(T * mat, const float * mask)
+    {
+        size_t i = (blockIdx.x * blockDim.x + threadIdx.x) * sizeof(T);
+        unroll<T, channels>::unroll_set_with_mask(mat, mask[i], i);
+    }
+}
+
+
+extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int depth, int channels)
+{
+    // scalar_d resides in __constant__ memory, so it has to be filled with
+    // cudaMemcpyToSymbol; a plain host-side assignment would not reach the device.
+    float scalar_f[4] = {(float)scalar[0], (float)scalar[1], (float)scalar[2], (float)scalar[3]};
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar_f, sizeof(scalar_f)) );
+
+    dim3 numBlocks(mat.rows * mat.step / 256, 1, 1);
+    dim3 threadsPerBlock(256);
+
+    if (channels == 1)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char,  1><<<numBlocks,threadsPerBlock>>>(mat.ptr, (float *)mask.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (float *)mask.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_with_mask<unsigned int,   1><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr, (float *)mask.ptr);
+    }
+    if (channels == 2)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char,  2><<<numBlocks,threadsPerBlock>>>(mat.ptr, (float *)mask.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (float *)mask.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_with_mask<unsigned int,   2><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr, (float *)mask.ptr);
+    }
+    if (channels == 3)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char,  3><<<numBlocks,threadsPerBlock>>>(mat.ptr, (float *)mask.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (float *)mask.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_with_mask<unsigned int,   3><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr, (float *)mask.ptr);
+    }
+}
+
+extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const double * scalar, int depth, int channels)
+{
+    float scalar_f[4] = {(float)scalar[0], (float)scalar[1], (float)scalar[2], (float)scalar[3]};
+    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar_f, sizeof(scalar_f)) );
+
+    int numBlocks = mat.rows * mat.step / 256;
+
+    dim3 threadsPerBlock(256);
+
+    if (channels == 1)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char,  1><<<numBlocks,threadsPerBlock>>>(mat.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_without_mask<unsigned int,   1><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr);
+    }
+    if (channels == 2)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char,  2><<<numBlocks,threadsPerBlock>>>(mat.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_without_mask<unsigned int,   2><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr);
+    }
+    if (channels == 3)
+    {
+        if (depth == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char,  3><<<numBlocks,threadsPerBlock>>>(mat.ptr);
+        if (depth == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr);
+        if (depth == 4) ::mat_operators::kernel_set_to_without_mask<unsigned int,   3><<<numBlocks,threadsPerBlock>>>((unsigned int *)mat.ptr);
+    }
+}
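The `unroll` template in matrix_operations.cu recurses at compile time: each instantiation writes one element and delegates to `count - 1`, and the `count == 0` partial specialization terminates the recursion, so every thread stores `channels` values with no runtime loop. The same idiom in plain host C++, as a self-contained illustration (names here are mine, not from the patch):

```cpp
#include <cstdio>

// Compile-time recursion: repeat<3>::print() expands to three inlined calls, no loop.
template <int N>
struct repeat
{
    static void print() { std::printf("%d ", N); repeat<N - 1>::print(); }
};

template <>
struct repeat<0>   // termination, mirrors the unroll<T, channels, 0> specialization
{
    static void print() {}
};

int main() { repeat<3>::print(); return 0; }   // prints "3 2 1 "
```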
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index 77c29006d0..d17fdb0051 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -74,13 +74,13 @@ struct CudaStream::Impl
     cudaStream_t stream;
     int ref_counter;
 };
-namespace
+namespace
 {
     template <class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
     {
         dst.create(src.size(), src.type());
         size_t bwidth = src.cols * src.elemSize();
-        cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
+        cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
     };
 }
 
@@ -97,7 +97,7 @@ void cv::gpu::CudaStream::create()
 
     impl = (CudaStream::Impl*)fastMalloc(sizeof(CudaStream::Impl));
     impl->stream = stream;
-    impl->ref_counter = 1;
+    impl->ref_counter = 1;
 }
 
 void cv::gpu::CudaStream::release()
@@ -125,7 +125,7 @@ CudaStream& cv::gpu::CudaStream::operator=(const CudaStream& stream)
         CV_XADD(&stream.impl->ref_counter, 1);
 
         release();
-        impl = stream.impl;
+        impl = stream.impl;
     }
     return *this;
 }
@@ -138,20 +138,21 @@ bool cv::gpu::CudaStream::queryIfComplete()
         return err == cudaSuccess;
 
     cudaSafeCall(err);
+    return false;
 }
 
 void cv::gpu::CudaStream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( impl->stream ) ); }
 
-void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
+void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
+{
     // if not -> allocation will be done, but after that dst will not point to page locked memory
     CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
-    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
+    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }
 
 void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
 
 void cv::gpu::CudaStream::enqueueUpload(const MatPL& src, GpuMat& dst){ devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
-void cv::gpu::CudaStream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
+void cv::gpu::CudaStream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
 void cv::gpu::CudaStream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
 
 void cv::gpu::CudaStream::enqueueMemSet(const GpuMat& src, Scalar val)
@@ -170,4 +171,4 @@ void cv::gpu::CudaStream::enqueueConvert(const GpuMat& src, GpuMat& dst, int typ
 }
 
-#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
+#endif /* !defined (HAVE_CUDA) */
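All `enqueue*` members above only schedule work on the wrapped `cudaStream_t`; nothing blocks until `waitForCompletion()`. Pairing them with `MatPL` keeps `cudaMemcpy2DAsync` genuinely asynchronous, since pinned memory avoids a staged copy. A sketch, assuming a freshly constructed `CudaStream` is ready for use (if not, `create()` above is the explicit path):

```cpp
#include <opencv2/gpu/gpu.hpp>   // assumed public header of the gpu module

void roundtrip_async(const cv::gpu::MatPL& pinned_in, cv::gpu::MatPL& pinned_out)
{
    cv::gpu::CudaStream stream;              // assumption: the default ctor yields a usable stream
    cv::gpu::GpuMat dev;

    stream.enqueueUpload(pinned_in, dev);    // H2D, returns immediately
    stream.enqueueDownload(dev, pinned_out); // D2H, also asynchronous

    // ... overlap CPU work here ...

    stream.waitForCompletion();              // block until both copies have finished
}
```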
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 1d27afb731..14c85c4a2c 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -100,7 +100,7 @@ void cv::gpu::GpuMat::copyTo( GpuMat& m ) const
 }
 
 void cv::gpu::GpuMat::copyTo( GpuMat& /*m*/, const GpuMat&/* mask */) const
-{
+{
     CV_Assert(!"Not implemented");
 }
 
@@ -109,15 +109,27 @@ void cv::gpu::GpuMat::convertTo( GpuMat& /*m*/, int /*rtype*/, double /*alpha*/,
     CV_Assert(!"Not implemented");
 }
 
-GpuMat& cv::gpu::GpuMat::operator = (const Scalar& /*s*/)
+GpuMat& GpuMat::operator = (const Scalar& s)
 {
-    CV_Assert(!"Not implemented");
+    cv::gpu::impl::set_to_without_mask(*this, s.val, this->depth(), this->channels());
+
     return *this;
 }
 
-GpuMat& cv::gpu::GpuMat::setTo(const Scalar& /*s*/, const GpuMat& /*mask*/)
+GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 {
-    CV_Assert(!"Not implemented");
+    CV_Assert(mask.type() == CV_8U);
+
+    CV_DbgAssert(!this->empty());
+
+    if (mask.empty())
+    {
+        cv::gpu::impl::set_to_without_mask(*this, s.val, this->depth(), this->channels());
+    }
+    else
+    {
+        cv::gpu::impl::set_to_with_mask(*this, s.val, mask, this->depth(), this->channels());
+    }
+
     return *this;
 }
 
@@ -177,7 +189,7 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
         rows = _rows;
         cols = _cols;
 
-        size_t esz = elemSize();
+        size_t esz = elemSize();
 
         void *dev_ptr;
         cudaSafeCall( cudaMallocPitch(&dev_ptr, &step, esz * cols, rows) );
@@ -189,7 +201,7 @@ void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
         size_t nettosize = (size_t)_nettosize;
 
         datastart = data = (uchar*)dev_ptr;
-        dataend = data + nettosize;
+        dataend = data + nettosize;
 
         refcount = (int*)fastMalloc(sizeof(*refcount));
         *refcount = 1;
@@ -201,7 +213,7 @@ void cv::gpu::GpuMat::release()
     if( refcount && CV_XADD(refcount, -1) == 1 )
     {
         fastFree(refcount);
-        cudaSafeCall( cudaFree(datastart) );
+        cudaSafeCall( cudaFree(datastart) );
     }
     data = datastart = dataend = 0;
     step = rows = cols = 0;
@@ -233,12 +245,12 @@ void cv::gpu::MatPL::create(int _rows, int _cols, int _type)
         CV_Error(CV_StsNoMem, "Too big buffer is allocated");
 
     size_t datasize = alignSize(nettosize, (int)sizeof(*refcount));
-    //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount));
+    //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount));
 
     void *ptr;
     cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) );
 
-    datastart = data = (uchar*)ptr;
-    dataend = data + nettosize;
+    datastart = data = (uchar*)ptr;
+    dataend = data + nettosize;
 
     refcount = (int*)cv::fastMalloc(sizeof(*refcount));
     *refcount = 1;
@@ -257,4 +269,4 @@ void cv::gpu::MatPL::release()
     refcount = 0;
 }
 
-#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
+#endif /* !defined (HAVE_CUDA) */
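With `operator=` and `setTo` now wired to the new kernels, scalar fills on the device mirror their `cv::Mat` counterparts. A short sketch (illustrative values; the mask must be `CV_8U`, as asserted above):

```cpp
#include <opencv2/gpu/gpu.hpp>   // assumed public header of the gpu module

void fill_examples(cv::gpu::GpuMat& img, const cv::gpu::GpuMat& mask8u)
{
    img = cv::Scalar(128);               // unconditional fill -> set_to_without_mask
    img.setTo(cv::Scalar(255), mask8u);  // masked fill -> set_to_with_mask
}
```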