Merge pull request #17556 from nglee:dev_optFlowTVL1Async
commit 9755ab160d
@@ -101,6 +101,20 @@ namespace cv { namespace cuda
            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
        }

        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
        {
            cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypePitch2D;
            resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
            resDesc.res.pitch2D.height = img.rows;
            resDesc.res.pitch2D.width = img.cols;
            resDesc.res.pitch2D.pitchInBytes = img.step;
            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();

            cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
        }
    }
}}

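The new helper above wraps the boilerplate of describing pitched device memory as a cudaTextureObject_t. A minimal usage sketch for a .cu translation unit follows; the kernel, function and variable names are illustrative and not part of the patch, only createTextureObjectPitch2D and cudaSafeCall come from the header above.

    // Illustrative caller (not part of the patch): wrap a pitched float image in a
    // texture object, sample it from a kernel on the given stream, then release it.
    #include <cstring>
    #include "opencv2/core/cuda/common.hpp"

    __global__ void scaleFromTexture(cudaTextureObject_t tex, cv::cuda::PtrStepSzf dst, float s)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < dst.cols && y < dst.rows)
            dst(y, x) = s * tex2D<float>(tex, x, y);   // the object travels as a plain kernel argument
    }

    void scaleImage(cv::cuda::PtrStepSzf src, cv::cuda::PtrStepSzf dst, float s, cudaStream_t stream)
    {
        cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));           // zeroed fields keep the CUDA defaults
        texDesc.addressMode[0] = cudaAddressModeClamp;  // (point filtering, element-type reads)
        texDesc.addressMode[1] = cudaAddressModeClamp;

        cudaTextureObject_t tex = 0;
        cv::cuda::device::createTextureObjectPitch2D(&tex, src, texDesc);

        const dim3 block(32, 8);
        const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
        scaleFromTexture<<<grid, block, 0, stream>>>(tex, dst, s);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaStreamSynchronize(stream) );  // the object must outlive its readers
        cudaSafeCall( cudaDestroyTextureObject(tex) );
    }
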
@@ -90,53 +90,47 @@ namespace cv { namespace cuda { namespace device

namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        int xoff;
        int yoff;
        virtual ~SrcTex() {}

        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;

        int xoff;
        int yoff;
    };

    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTexRef : SrcTex
    {
        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const override
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    struct SrcTexObject
    struct SrcTexObj : SrcTex
    {
        int xoff;
        int yoff;
        cudaTextureObject_t tex_src_object;
        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }

        __device__ __forceinline__ int operator ()(int y, int x) const
        __device__ __forceinline__ int operator ()(int y, int x) const override
        {
            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
        }

        cudaTextureObject_t tex_src_object;
    };

    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }

    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    template <
        class T,
        class Norm,
        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
    >
    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

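The enable_if parameter above constrains calcMagnitudeKernel to functors derived from SrcTex, and because the kernel is always instantiated with the concrete type (SrcTexRef or SrcTexObj) passed by value, the device compiler can resolve the virtual operator() statically. A small host-side sketch of the same constraint, with purely illustrative names:

    // Host-side analogue (illustrative names, not from the patch): only types derived
    // from Base are accepted, and the call is resolved against the concrete type.
    #include <type_traits>

    struct Base      { virtual ~Base() {} virtual int sample() const = 0; };
    struct FromRef : Base { int sample() const override { return 1; } };
    struct Unrelated { int sample() const { return 2; } };

    template <
        class T,
        typename = std::enable_if_t<std::is_base_of<Base, T>::value>
    >
    int readOnce(const T src)
    {
        return src.sample();        // T is concrete here, so the call can be inlined
    }

    int main()
    {
        FromRef ref;
        // readOnce(Unrelated());   // would not compile: the constraint rejects it
        return readOnce(ref) == 1 ? 0 : 1;
    }
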
@@ -162,15 +156,6 @@ namespace canny

        if (cc30)
        {
            cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypePitch2D;
            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
            resDesc.res.pitch2D.height = srcWhole.rows;
            resDesc.res.pitch2D.width = srcWhole.cols;
            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();

            cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeClamp;
@@ -178,9 +163,9 @@ namespace canny
            texDesc.addressMode[2] = cudaAddressModeClamp;

            cudaTextureObject_t tex = 0;
            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
            createTextureObjectPitch2D(&tex, srcWhole, texDesc);

            SrcTexObject src(xoff, yoff, tex);
            SrcTexObj src(xoff, yoff, tex);

            if (L2Grad)
            {
@@ -205,7 +190,7 @@ namespace canny
        else
        {
            bindTexture(&tex_src, srcWhole);
            SrcTex src(xoff, yoff);
            SrcTexRef src(xoff, yoff);

            if (L2Grad)
            {

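In the texture-object branch above only the address modes are set explicitly; filterMode and readMode keep the zero values left by memset, which correspond to cudaFilterModePoint and cudaReadModeElementType and therefore match the tex_src texture reference used in the else branch. Spelled out, the descriptor setup is equivalent to the following sketch; the helper name is hypothetical and not part of the patch.

    // Illustrative only: the same descriptor with the memset defaults written out.
    #include <cstring>
    #include "opencv2/core/cuda/common.hpp"

    static void makeClampedPointTexture(cudaTextureObject_t* tex, cv::cuda::PtrStepSzb& srcWhole)
    {
        cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0] = cudaAddressModeClamp;
        texDesc.addressMode[1] = cudaAddressModeClamp;
        texDesc.addressMode[2] = cudaAddressModeClamp;
        texDesc.filterMode     = cudaFilterModePoint;     // enum value 0, already set by memset
        texDesc.readMode       = cudaReadModeElementType; // enum value 0, already set by memset

        cv::cuda::device::createTextureObjectPitch2D(tex, srcWhole, texDesc);
    }
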
@@ -116,7 +116,7 @@ protected:
    bool useL2gradient;
};

#define NUM_STREAMS 64
#define NUM_STREAMS 128

CUDA_TEST_P(Canny, Async)
{

@@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;
@@ -101,11 +102,64 @@ namespace tvl1flow
        }
    }

    struct SrcTex
    {
        virtual ~SrcTex() {}

        __device__ __forceinline__ virtual float I1(float x, float y) const = 0;
        __device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
        __device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
    };

    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTexRef : SrcTex
    {
        __device__ __forceinline__ float I1(float x, float y) const override
        {
            return tex2D(tex_I1, x, y);
        }
        __device__ __forceinline__ float I1x(float x, float y) const override
        {
            return tex2D(tex_I1x, x, y);
        }
        __device__ __forceinline__ float I1y(float x, float y) const override
        {
            return tex2D(tex_I1y, x, y);
        }
    };

    __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
    struct SrcTexObj : SrcTex
    {
        __host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
            : tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}

        __device__ __forceinline__ float I1(float x, float y) const override
        {
            return tex2D<float>(tex_obj_I1, x, y);
        }
        __device__ __forceinline__ float I1x(float x, float y) const override
        {
            return tex2D<float>(tex_obj_I1x, x, y);
        }
        __device__ __forceinline__ float I1y(float x, float y) const override
        {
            return tex2D<float>(tex_obj_I1y, x, y);
        }

        cudaTextureObject_t tex_obj_I1;
        cudaTextureObject_t tex_obj_I1x;
        cudaTextureObject_t tex_obj_I1y;
    };

    template <
        typename T,
        typename = std::enable_if_t<std::is_base_of<SrcTex, T>::value>
    >
    __global__ void warpBackwardKernel(
        const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
        PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -136,9 +190,9 @@ namespace tvl1flow
                {
                    const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                    sum += w * tex2D(tex_I1 , cx, cy);
                    sumx += w * tex2D(tex_I1x, cx, cy);
                    sumy += w * tex2D(tex_I1y, cx, cy);
                    sum += w * src.I1(cx, cy);
                    sumx += w * src.I1x(cx, cy);
                    sumy += w * src.I1y(cx, cy);

                    wsum += w;
                }
@@ -173,15 +227,46 @@ namespace tvl1flow
        const dim3 block(32, 8);
        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));

        bindTexture(&tex_I1 , I1);
        bindTexture(&tex_I1x, I1x);
        bindTexture(&tex_I1y, I1y);
        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);

        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
        cudaSafeCall( cudaGetLastError() );
        if (cc30)
        {
            cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeClamp;
            texDesc.addressMode[1] = cudaAddressModeClamp;
            texDesc.addressMode[2] = cudaAddressModeClamp;

        if (!stream)
            cudaSafeCall( cudaDeviceSynchronize() );
            cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;

            createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
            createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
            createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);

            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
            cudaSafeCall(cudaGetLastError());

            if (!stream)
                cudaSafeCall(cudaDeviceSynchronize());
            else
                cudaSafeCall(cudaStreamSynchronize(stream));

            cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
            cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
            cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
        }
        else
        {
            bindTexture(&tex_I1, I1);
            bindTexture(&tex_I1x, I1x);
            bindTexture(&tex_I1y, I1y);

            warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
            cudaSafeCall(cudaGetLastError());

            if (!stream)
                cudaSafeCall(cudaDeviceSynchronize());
        }
    }
}

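The texture-object branch above carries all per-call sampling state in SrcTexObj, which is what allows several host threads to run this wrapper concurrently on independent streams; the fallback branch still binds the process-wide tex_I1 references and is only safe when called serially. An application can check up front which path the device will take; the helper below is illustrative and not part of the patch, only cv::cuda::deviceSupports and FEATURE_SET_COMPUTE_30 are existing API.

    // Illustrative guard (not part of the patch): texture objects, and hence the
    // concurrent-stream path above, require compute capability 3.0 or newer.
    #include <opencv2/core/cuda.hpp>

    static int chooseWorkerCount(int requested)
    {
        const bool hasTexObjects = cv::cuda::deviceSupports(cv::cuda::FEATURE_SET_COMPUTE_30);
        return hasTexObjects ? requested : 1;   // serialize on devices without texture objects
    }
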
@@ -405,10 +405,71 @@ CUDA_TEST_P(OpticalFlowDual_TVL1, Accuracy)
    EXPECT_MAT_SIMILAR(flow, d_flow, 4e-3);
}

class TVL1AsyncParallelLoopBody : public cv::ParallelLoopBody
{
public:
    TVL1AsyncParallelLoopBody(const cv::cuda::GpuMat& d_img1_, const cv::cuda::GpuMat& d_img2_, cv::cuda::GpuMat* d_flow_, int iterations_, double gamma_)
        : d_img1(d_img1_), d_img2(d_img2_), d_flow(d_flow_), iterations(iterations_), gamma(gamma_) {}
    ~TVL1AsyncParallelLoopBody() {}
    void operator()(const cv::Range& r) const
    {
        for (int i = r.start; i < r.end; i++) {
            cv::cuda::Stream stream;
            cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg = cv::cuda::OpticalFlowDual_TVL1::create();
            d_alg->setNumIterations(iterations);
            d_alg->setGamma(gamma);
            d_alg->calc(d_img1, d_img2, d_flow[i], stream);
            stream.waitForCompletion();
        }
    }
protected:
    const cv::cuda::GpuMat& d_img1;
    const cv::cuda::GpuMat& d_img2;
    cv::cuda::GpuMat* d_flow;
    int iterations;
    double gamma;
};

#define NUM_STREAMS 16

CUDA_TEST_P(OpticalFlowDual_TVL1, Async)
{
    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
    {
        throw SkipTestException("CUDA device doesn't support texture objects");
    }
    else
    {
        cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
        ASSERT_FALSE(frame0.empty());

        cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
        ASSERT_FALSE(frame1.empty());

        const int iterations = 10;

        // Synchronous call
        cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> d_alg =
            cv::cuda::OpticalFlowDual_TVL1::create();
        d_alg->setNumIterations(iterations);
        d_alg->setGamma(gamma);

        cv::cuda::GpuMat d_flow_gold;
        d_alg->calc(loadMat(frame0), loadMat(frame1), d_flow_gold);

        // Asynchronous call
        cv::cuda::GpuMat d_flow[NUM_STREAMS];
        cv::parallel_for_(cv::Range(0, NUM_STREAMS), TVL1AsyncParallelLoopBody(loadMat(frame0), loadMat(frame1), d_flow, iterations, gamma));

        // Compare the results of synchronous call and asynchronous call
        for (int i = 0; i < NUM_STREAMS; i++)
            EXPECT_MAT_NEAR(d_flow_gold, d_flow[i], 0.0);
    }
}

INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
    ALL_DEVICES,
    testing::Values(Gamma(0.0), Gamma(1.0))));


}} // namespace
#endif // HAVE_CUDA

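Application code can follow the same pattern as the test above using the lambda overload of cv::parallel_for_: one cv::cuda::Stream and one OpticalFlowDual_TVL1 instance per worker, which is exactly what the texture-object path makes safe. The function and variable names in this sketch are illustrative, not part of the patch.

    // Illustrative application-side sketch (not part of the patch): compute several
    // flows concurrently, one stream and one algorithm instance per worker.
    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaoptflow.hpp>

    void computeFlowsConcurrently(const cv::cuda::GpuMat& d_frame0,
                                  const cv::cuda::GpuMat& d_frame1,
                                  std::vector<cv::cuda::GpuMat>& d_flows)
    {
        cv::parallel_for_(cv::Range(0, static_cast<int>(d_flows.size())), [&](const cv::Range& r)
        {
            for (int i = r.start; i < r.end; ++i)
            {
                cv::cuda::Stream stream;
                cv::Ptr<cv::cuda::OpticalFlowDual_TVL1> alg = cv::cuda::OpticalFlowDual_TVL1::create();
                alg->calc(d_frame0, d_frame1, d_flows[i], stream);
                stream.waitForCompletion();
            }
        });
    }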