// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
|
|
|
|
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
|
|
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
|
|
|
|
#include "stream.hpp"
|
|
#include "tensor.hpp"
|
|
#include "pointer.hpp"
|
|
#include "cublas.hpp"
|
|
#include "cudnn.hpp"
|
|
#include "workspace.hpp"
|
|
|
|
#include "cudnn/convolution.hpp"
|
|
#include "cudnn/pooling.hpp"
|
|
#include "cudnn/lrn.hpp"
|
|
#include "cudnn/softmax.hpp"
|
|
#include "cudnn/transform.hpp"
|
|
#include "cudnn/transpose_convolution.hpp"
|
|
|
|
#include <opencv2/core.hpp>
|
|
|
|
#include <cstddef>
|
|
#include <array>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
|
|
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
|
|
|
|
namespace tensor_ops {
|
|
|
|
/** @brief copies data between tensors
|
|
*
|
|
* Pre-conditions:
|
|
* - \p dest and \p src must have the same shape
|
|
*
|
|
* Exception Guarantee: Basic
|
|
*/
|
|
template <class T> inline
|
|
void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
|
|
CV_Assert(is_shape_same(dest, src));
|
|
if (dest.get() != src.get())
|
|
memcpy(dest.get(), src.get(), dest.size(), stream);
|
|
}
|
|
|
|
namespace detail {
|
|
template <class T>
|
|
void assertGEMMCompatiblity(const TensorSpan<T>& result, bool transa, const TensorView<T>& A, bool transb, const TensorView<T>& B) {
|
|
/* check dimension requirements for matrix multiplication */
|
|
if (!transa && !transb) {
|
|
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
|
|
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
|
|
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
|
|
} else if (!transa && transb) {
|
|
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
|
|
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
|
|
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
|
|
} else if (transa && !transb) {
|
|
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
|
|
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
|
|
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
|
|
} else {
|
|
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
|
|
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
|
|
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
|
|
}
|
|
}
|
|
}
|
|
|
|
/** @brief performs generalized matrix-multiplication
|
|
*
|
|
* Pre-conditions:
|
|
* - \p A and \p B must meet the mathematical requirements for matrix multiplication
|
|
* - \p result must be large enough to hold the result
|
|
*
|
|
* Exception Guarantee: Basic
|
|
*/
|
|
template <class T> inline
|
|
void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
|
|
/* matrix operations can be performed only on tensors with rank two or below */
|
|
CV_Assert(get_effective_rank(A) <= 2);
|
|
CV_Assert(get_effective_rank(B) <= 2);
|
|
CV_Assert(get_effective_rank(result) <= 2);
|
|
|
|
const auto result_nr = result.get_axis_size(-2);
|
|
const auto result_nc = result.get_axis_size(-1);
|
|
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
|
|
const auto A_nc = A.get_axis_size(-1);
|
|
const auto B_nc = B.get_axis_size(-1);
|
|
|
|
detail::assertGEMMCompatiblity(result, transa, A, transb, B);
|
|
|
|
/* tensors are stored in row-major but cublas::gemm operates on column-major matrices
|
|
* a row-major matrix when read as column-major matrix gives the transpose of the intended matrix
|
|
*
|
|
* Required: C = AB
|
|
* what cuBLAS sees: C^T = A^TB^T = (BA)^T
|
|
*
|
|
* By reversing operands, we effectively perform:
|
|
* C^T = B^TA^T = (AB)^T
|
|
*
|
|
* which gives C = AB
|
|
*/
|
|
cublas::gemm<T>(handle,
|
|
transb, transa,
|
|
result_nc, result_nr, common_dim,
|
|
alpha, B.get(), B_nc,
|
|
A.get(), A_nc,
|
|
beta, result.get(), result_nc);
|
|
}
|
|
|
|
/** @brief performs generalized matrix-multiplication for a strided batch of matrices
|
|
*
|
|
* Pre-conditions:
|
|
* - A, B and C must be rank three tensors with dimensions (batch, rows, cols)
|
|
* - the last two axes of \p A and \p B must meet the mathematical requirements for matrix multiplication
|
|
* - \p result must be large enough to hold the result and the matrices must not overlap in memory
|
|
* - batch dimension should be same in \p A, \p B and \p result
|
|
*
|
|
* Exception Guarantee: Basic
|
|
*/
|
|
template <class T> inline
|
|
void gemmStridedBatched(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
|
|
CV_Assert(A.rank() == 3);
|
|
CV_Assert(B.rank() == 3);
|
|
CV_Assert(result.rank() == 3);
|
|
|
|
const auto batch_size = result.get_axis_size(0);
|
|
CV_Assert(batch_size == A.get_axis_size(0));
|
|
CV_Assert(batch_size == B.get_axis_size(0));
|
|
|
|
detail::assertGEMMCompatiblity(result, transa, A, transb, B);
|
|
|
|
const auto result_nr = result.get_axis_size(-2);
|
|
const auto result_nc = result.get_axis_size(-1);
|
|
const auto common_dim = A.get_axis_size(transa ? -2 : -1);
|
|
const auto A_nc = A.get_axis_size(-1);
|
|
const auto B_nc = B.get_axis_size(-1);
|
|
|
|
std::size_t strideA = (A.size() / batch_size),
|
|
strideB = (B.size() / batch_size),
|
|
strideC = (result.size() / batch_size);
|
|
|
|
cublas::gemmStridedBatched<T>(handle,
|
|
transb, transa,
|
|
result_nc, result_nr, common_dim,
|
|
alpha, B.get(), B_nc, strideB,
|
|
A.get(), A_nc, strideA,
|
|
beta, result.get(), result_nc, strideC,
|
|
batch_size);
|
|
}
|
|
|
|
/** @brief performs element-wise addition with broadcasting
|
|
*
|
|
* Pre-conditions:
|
|
* - \p A and \p result must be compatible tensors
|
|
*
|
|
* Exception Guarantee: Basic
|
|
*/
|
|
template <class T> inline
|
|
void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
|
|
CV_Assert(is_shape_same(output, input));
|
|
|
|
channel_axis = clamp_axis(channel_axis, input.rank());
|
|
|
|
std::size_t outer_size = input.size_range(0, channel_axis);
|
|
auto channel_size = input.get_axis_size(channel_axis);
|
|
std::size_t inner_size = input.size_range(channel_axis + 1, input.rank());
|
|
|
|
std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size };
|
|
|
|
using cudnn::TensorDescriptor;
|
|
auto inputDesc = TensorDescriptor<T>(shape);
|
|
auto outputDesc = TensorDescriptor<T>(shape);
|
|
cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
|
|
}
|
|
}
|
|
|
|
template <class T>
|
|
class Convolution {
|
|
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
|
using FilterDescriptor = cudnn::FilterDescriptor<T>;
|
|
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
|
|
using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
|
|
using ActivationDescriptor = cudnn::ActivationDescriptor;
|
|
|
|
public:
|
|
using ActivationType = ActivationDescriptor::ActivationType;
|
|
|
|
struct params_type {
|
|
/* convolution */
|
|
std::vector<std::size_t> input_shape;
|
|
std::vector<std::size_t> filter_shape;
|
|
std::vector<std::size_t> padding;
|
|
std::vector<std::size_t> stride;
|
|
std::vector<std::size_t> dilation;
|
|
std::size_t groups;
|
|
|
|
/* bias and activation (only RELU supported) */
|
|
std::vector<std::size_t> bias_shape;
|
|
ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */
|
|
bool eltwise;
|
|
};
|
|
|
|
Convolution() = default;
|
|
Convolution(const Convolution&) = delete;
|
|
Convolution(Convolution&&) = default;
|
|
Convolution(cudnn::Handle handle, const params_type& params) {
|
|
cudnnHandle = std::move(handle);
|
|
|
|
inputTensorDesc = TensorDescriptor(params.input_shape);
|
|
filterDesc = FilterDescriptor(params.filter_shape);
|
|
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
|
|
|
|
std::vector<int> output_dims;
|
|
getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
|
|
outputTensorDesc = TensorDescriptor(output_dims);
|
|
|
|
algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
|
|
|
|
if (!params.bias_shape.empty()) {
|
|
CV_Assert(params.activation_type == ActivationType::RELU);
|
|
biasTensorDesc = TensorDescriptor(params.bias_shape);
|
|
if (params.eltwise)
|
|
eltwiseTensorDesc = TensorDescriptor(output_dims);
|
|
activationDesc = ActivationDescriptor(params.activation_type, 0.0);
|
|
} else {
|
|
CV_Assert(params.activation_type == ActivationType::IDENTITY);
|
|
}
|
|
}
|
|
|
|
Convolution& operator=(const Convolution&) = delete;
|
|
Convolution& operator=(Convolution&&) = default;
|
|
|
|
std::size_t get_workspace_size() const noexcept {
|
|
return algo.get_workspace_size();
|
|
}
|
|
|
|
void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
|
|
cudnn::convolve<T>(
|
|
cudnnHandle,
|
|
convDesc, algo, scratchpad,
|
|
filterDesc, filters.get(),
|
|
inputTensorDesc, input.get(),
|
|
1.0, 0.0, outputTensorDesc, output.get()
|
|
);
|
|
}
|
|
|
|
void convolve_with_bias_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, WorkspaceInstance scratchpad) {
|
|
cudnn::convolve_with_bias_activation<T>(
|
|
cudnnHandle,
|
|
1.0, convDesc, algo, scratchpad,
|
|
filterDesc, filters.get(),
|
|
inputTensorDesc, input.get(),
|
|
biasTensorDesc, bias.get(),
|
|
activationDesc,
|
|
outputTensorDesc, output.get()
|
|
);
|
|
}
|
|
|
|
void convolve_with_bias_eltwise_activation(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, TensorView<T> bias, TensorView<T> eltwise, WorkspaceInstance scratchpad) {
|
|
cudnn::convolve_with_bias_eltwise_activation<T>(
|
|
cudnnHandle,
|
|
1.0, convDesc, algo, scratchpad,
|
|
filterDesc, filters.get(),
|
|
inputTensorDesc, input.get(),
|
|
biasTensorDesc, bias.get(),
|
|
1.0, eltwiseTensorDesc, eltwise.get(),
|
|
activationDesc,
|
|
outputTensorDesc, output.get()
|
|
);
|
|
}
|
|
|
|
private:
|
|
cudnn::Handle cudnnHandle;
|
|
TensorDescriptor inputTensorDesc, outputTensorDesc;
|
|
FilterDescriptor filterDesc;
|
|
ConvolutionDescriptor convDesc;
|
|
ConvolutionAlgorithm algo;
|
|
TensorDescriptor biasTensorDesc;
|
|
TensorDescriptor eltwiseTensorDesc;
|
|
ActivationDescriptor activationDesc;
|
|
};
|
|
|
|
template <class T>
|
|
class TransposeConvolution {
|
|
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
|
using FilterDescriptor = cudnn::FilterDescriptor<T>;
|
|
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
|
|
using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
|
|
|
|
public:
|
|
struct params_type {
|
|
std::vector<std::size_t> input_shape;
|
|
std::vector<std::size_t> output_shape;
|
|
|
|
std::vector<std::size_t> filter_shape;
|
|
|
|
std::vector<std::size_t> padding;
|
|
std::vector<std::size_t> stride;
|
|
std::vector<std::size_t> dilation;
|
|
|
|
std::size_t groups;
|
|
};
|
|
|
|
TransposeConvolution() = default;
|
|
TransposeConvolution(const TransposeConvolution&) = delete;
|
|
TransposeConvolution(TransposeConvolution&&) = default;
|
|
TransposeConvolution(cudnn::Handle handle, const params_type& params) {
|
|
cudnnHandle = std::move(handle);
|
|
|
|
filterDesc = FilterDescriptor(params.filter_shape);
|
|
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
|
|
|
|
/* input_shape is the output shape for convolution
|
|
* output_shape is the input shape for convolution
|
|
*/
|
|
convInputTensorDesc = TensorDescriptor(params.output_shape);
|
|
|
|
std::vector<int> conv_output_dims;
|
|
getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
|
|
|
|
/* the convolution output must be identical to what cuDNN expects */
|
|
CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
|
|
|
|
convOutputTensorDesc = TensorDescriptor(params.input_shape);
|
|
|
|
algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
|
|
}
|
|
|
|
TransposeConvolution& operator=(const TransposeConvolution&) = delete;
|
|
TransposeConvolution& operator=(TransposeConvolution&&) = default;
|
|
|
|
std::size_t get_workspace_size() const noexcept {
|
|
return algo.get_workspace_size();
|
|
}
|
|
|
|
void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
|
|
cudnn::transpose_convolve<T>(
|
|
cudnnHandle,
|
|
convDesc, algo, scratchpad,
|
|
filterDesc, filters.get(),
|
|
convOutputTensorDesc, input.get(),
|
|
1.0, 0.0, convInputTensorDesc, output.get()
|
|
);
|
|
}
|
|
|
|
private:
|
|
cudnn::Handle cudnnHandle;
|
|
TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
|
|
FilterDescriptor filterDesc;
|
|
ConvolutionDescriptor convDesc;
|
|
TransposeConvolutionAlgorithm algo;
|
|
};
|
|
|
|
template <class T>
|
|
class Pooling {
|
|
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
|
using PoolingDescriptor = cudnn::PoolingDescriptor;
|
|
|
|
public:
|
|
using PoolingType = PoolingDescriptor::PoolingType;
|
|
|
|
struct params_type {
|
|
std::vector<std::size_t> input_shape;
|
|
std::vector<std::size_t> output_shape;
|
|
|
|
std::vector<std::size_t> window_size;
|
|
std::vector<std::size_t> padding;
|
|
std::vector<std::size_t> stride;
|
|
|
|
PoolingType type;
|
|
};
|
|
|
|
Pooling() = default;
|
|
Pooling(const Pooling&) = delete;
|
|
Pooling(Pooling&&) = default;
|
|
Pooling(cudnn::Handle handle, const params_type& params) {
|
|
cudnnHandle = std::move(handle);
|
|
|
|
inputTensorDesc = TensorDescriptor(params.input_shape);
|
|
poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
|
|
|
|
//std::vector<int> output_dim;
|
|
//getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
|
|
outputTensorDesc = TensorDescriptor(params.output_shape);
|
|
}
|
|
|
|
Pooling& operator=(const Pooling&) = delete;
|
|
Pooling& operator=(Pooling&&) = default;
|
|
|
|
void pool(TensorView<T> input, TensorSpan<T> output) {
|
|
cudnn::pool<T>(
|
|
cudnnHandle,
|
|
poolingDesc,
|
|
inputTensorDesc, input.get(),
|
|
1.0, 0.0, outputTensorDesc, output.get()
|
|
);
|
|
}
|
|
|
|
private:
|
|
cudnn::Handle cudnnHandle;
|
|
TensorDescriptor inputTensorDesc, outputTensorDesc;
|
|
PoolingDescriptor poolingDesc;
|
|
};
|
|
|
|
template <class T>
|
|
class LRN {
|
|
using LRNDescriptor = cudnn::LRNDescriptor;
|
|
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
|
|
|
public:
|
|
using LRNType = LRNDescriptor::LRNType;
|
|
|
|
LRN() = default;
|
|
LRN(const LRN&) = delete;
|
|
LRN(LRN&&) = default;
|
|
LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
|
|
cudnnHandle = std::move(handle);
|
|
lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
|
|
}
|
|
|
|
LRN& operator=(const LRN&) = delete;
|
|
LRN& operator=(LRN&&) = default;
|
|
|
|
void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
|
|
cudnn::LRNForward<T>(
|
|
cudnnHandle,
|
|
lrnDesc,
|
|
TensorDescriptor(input.shape_as_vector()), input.get(),
|
|
1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
|
|
workspace
|
|
);
|
|
}
|
|
|
|
private:
|
|
cudnn::Handle cudnnHandle;
|
|
LRNDescriptor lrnDesc;
|
|
};
|
|
|
|
template <class T>
|
|
class TensorTransform {
|
|
using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
|
|
using TensorDescriptor = cudnn::TensorDescriptor<T>;
|
|
|
|
public:
|
|
TensorTransform() = default;
|
|
TensorTransform(const TensorTransform&) = delete;
|
|
TensorTransform(TensorTransform&&) = default;
|
|
|
|
template <class SequenceContainer>
|
|
TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
|
|
cudnnHandle = std::move(handle);
|
|
transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
|
|
}
|
|
|
|
TensorTransform& operator=(const TensorTransform&) = delete;
|
|
TensorTransform& operator=(TensorTransform&&) = default;
|
|
|
|
void transform(TensorView<T> input, TensorSpan<T> output) {
|
|
cudnn::transform<T>(
|
|
cudnnHandle,
|
|
transDesc,
|
|
TensorDescriptor(input.shape_as_vector()), input.get(),
|
|
TensorDescriptor(output.shape_as_vector()), output.get()
|
|
);
|
|
}
|
|
|
|
private:
|
|
cudnn::Handle cudnnHandle;
|
|
TensorTransformDescriptor transDesc;
|
|
};
|
|
|
|
}}}} /* namespace cv::dnn::cuda4dnn::csl */
|
|
|
|
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */
|