From d0e6d2438c30edd244a62015aa5dde72be4d3aa4 Mon Sep 17 00:00:00 2001 From: Yashas Samaga B L Date: Thu, 9 Jul 2020 18:32:21 +0530 Subject: [PATCH] Merge pull request #17363 from YashasSamaga:cuda4dnn-eltwise-fusion2 cuda4dnn(conv): fuse eltwise with convolutions * fuse eltwise with convolutions * manually rebase to avoid bad git merge --- modules/dnn/src/cuda/activation_eltwise.cu | 121 ++++++++ modules/dnn/src/cuda/activations.cu | 124 ++++---- modules/dnn/src/cuda/bias_activation.cu | 68 ++--- .../dnn/src/cuda/bias_activation_eltwise.cu | 125 ++++++++ .../dnn/src/cuda/bias_eltwise_activation.cu | 132 ++++++++ modules/dnn/src/cuda/eltwise_activation.cu | 125 ++++++++ modules/dnn/src/cuda/eltwise_ops.cu | 36 +-- modules/dnn/src/cuda/functors.hpp | 249 +++++++++++---- modules/dnn/src/cuda/scale_shift.cu | 6 +- .../src/cuda4dnn/csl/cudnn/convolution.hpp | 95 ++++++ modules/dnn/src/cuda4dnn/csl/span.hpp | 11 +- modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp | 33 +- .../cuda4dnn/kernels/activation_eltwise.hpp | 40 +++ .../dnn/src/cuda4dnn/kernels/activations.hpp | 18 +- .../src/cuda4dnn/kernels/bias_activation.hpp | 12 +- .../kernels/bias_activation_eltwise.hpp | 42 +++ .../kernels/bias_eltwise_activation.hpp | 45 +++ .../cuda4dnn/kernels/eltwise_activation.hpp | 40 +++ .../src/cuda4dnn/primitives/convolution.hpp | 289 ++++++++++++++++-- .../dnn/src/cuda4dnn/primitives/eltwise.hpp | 29 +- modules/dnn/src/dnn.cpp | 240 +++++++++++---- modules/dnn/src/layers/convolution_layer.cpp | 56 +++- 22 files changed, 1636 insertions(+), 300 deletions(-) create mode 100644 modules/dnn/src/cuda/activation_eltwise.cu create mode 100644 modules/dnn/src/cuda/bias_activation_eltwise.cu create mode 100644 modules/dnn/src/cuda/bias_eltwise_activation.cu create mode 100644 modules/dnn/src/cuda/eltwise_activation.cu create mode 100644 modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp create mode 100644 modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp create mode 100644 
modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp create mode 100644 modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp diff --git a/modules/dnn/src/cuda/activation_eltwise.cu b/modules/dnn/src/cuda/activation_eltwise.cu new file mode 100644 index 0000000000..0ad6984124 --- /dev/null +++ b/modules/dnn/src/cuda/activation_eltwise.cu @@ -0,0 +1,121 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include +#include + +#include "functors.hpp" +#include "vector_traits.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + +namespace raw { + + template + __global__ void generic_op_eltwise_op_inplace_vec(Span inplace_output, View eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + auto eltwise_vPtr = vector_type::get_pointer(eltwise.data()); + + ActivationOp activation_op(act_params); + EltwiseOp eltwise_op(eltwise_params); + + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + vector_type output_vec, eltwise_vec; + v_load(output_vec, inplace_output_vPtr[i]); + v_load(eltwise_vec, eltwise_vPtr[i]); + for(int j = 0; j < output_vec.size(); j++) + output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]); + v_store(inplace_output_vPtr[i], output_vec); + } + } +} + +template static +void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span inplace_output, View eltwise, const typename 
ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(is_fully_aligned(eltwise, N)); + + auto kernel = raw::generic_op_eltwise_op_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params); +} + +template static +void generic_op_eltwise_op_inplace(const Stream& stream, Span inplace_output, View eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) { + CV_Assert(inplace_output.size() == eltwise.size()); + + if (is_fully_aligned(inplace_output, 4) && is_fully_aligned(eltwise, 4)) { + launch_vectorized_generic_op_eltwise_op_inplace(stream, inplace_output, eltwise, act_params, eltwise_params); + } else if (is_fully_aligned(inplace_output, 2) && is_fully_aligned(eltwise, 2)) { + launch_vectorized_generic_op_eltwise_op_inplace(stream, inplace_output, eltwise, act_params, eltwise_params); + } else { + launch_vectorized_generic_op_eltwise_op_inplace(stream, inplace_output, eltwise, act_params, eltwise_params); + } +} + +template +void relu_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise, T slope) { + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise, {slope}); +} + +template +void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise, T floor, T ceiling) { + CV_Assert(static_cast(floor) <= static_cast(ceiling)); + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise, {floor, ceiling}); +} + +template +void tanh_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise) { + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise); +} + +template +void swish_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise) { + 
generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise); +} + +template +void mish_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise) { + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise); +} + +template +void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise) { + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise); +} + +template +void power_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, View eltwise, T exp, T scale, T shift) { + generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, eltwise, {exp, scale, shift}); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half); +template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); +template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>); +template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>); +template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>); +template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>); +template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half); +#endif + +template void relu_eltwise_sum_2_inplace(const Stream&, Span, View, float); +template void clipped_relu_eltwise_sum_2_inplace(const Stream&, Span, View, float, float); +template void tanh_eltwise_sum_2_inplace(const Stream&, Span, View); +template void swish_eltwise_sum_2_inplace(const Stream&, Span, View); +template void mish_eltwise_sum_2_inplace(const Stream&, Span, View); +template void sigmoid_eltwise_sum_2_inplace(const Stream&, Span, View); +template void power_eltwise_sum_2_inplace(const Stream&, Span, View, 
float, float, float); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/activations.cu b/modules/dnn/src/cuda/activations.cu index 221516dddc..6a991baea2 100644 --- a/modules/dnn/src/cuda/activations.cu +++ b/modules/dnn/src/cuda/activations.cu @@ -26,20 +26,20 @@ using namespace cv::dnn::cuda4dnn::csl::device; namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { - template - __global__ void generic_op_vec(Span output, View input, FunctorArgs ...functorArgs) { + template + __global__ void generic_op_vec(Span output, View input, const typename ActivationOp::Params params) { using vector_type = get_vector_type_t; auto output_vPtr = vector_type::get_pointer(output.data()); auto input_vPtr = vector_type::get_pointer(input.data()); - Functor functor(functorArgs...); + ActivationOp activation_op(params); for (auto i : grid_stride_range(output.size() / vector_type::size())) { vector_type vec; v_load(vec, input_vPtr[i]); for (int j = 0; j < vector_type::size(); j++) - vec.data[j] = functor(vec.data[j]); + vec.data[j] = activation_op(vec.data[j]); v_store(output_vPtr[i], vec); } } @@ -51,9 +51,8 @@ namespace raw { auto output_vPtr = vector_type::get_pointer(output.data()); auto input_vPtr = vector_type::get_pointer(input.data()); - inner_size /= vector_type::size(); for (auto i : grid_stride_range(output.size() / vector_type::size())) { - const index_type c = (i / inner_size) % static_cast(slope.size()); + const index_type c = (i / inner_size) % slope.size(); vector_type vec; v_load(vec, input_vPtr[i]); @@ -65,73 +64,73 @@ namespace raw { } /* namespace raw */ -template class Activation, std::size_t N, class ...ActivationArgs> static -void launch_vectorized_generic_op(const Stream& stream, Span output, View input, ActivationArgs ...activationArgs) { +template static +void launch_vectorized_generic_op(const Stream& stream, Span output, View input, const typename ActivationOp::Params& params) { 
CV_Assert(is_fully_aligned(output, N)); CV_Assert(is_fully_aligned(input, N)); - auto kernel = raw::generic_op_vec, N, ActivationArgs...>; + auto kernel = raw::generic_op_vec; auto policy = make_policy(kernel, output.size() / N, 0, stream); - launch_kernel(kernel, policy, output, input, activationArgs...); + launch_kernel(kernel, policy, output, input, params); } -template class Activation, class ...ActivationArgs> static -void generic_op(const Stream& stream, Span output, View input, ActivationArgs ...activationArgs) { +template static +void generic_op(const Stream& stream, Span output, View input, const typename ActivationOp::Params& params = {}) { CV_Assert(input.size() == output.size()); if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4)) { - launch_vectorized_generic_op(stream, output, input, activationArgs...); + launch_vectorized_generic_op(stream, output, input, params); } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2)) { - launch_vectorized_generic_op(stream, output, input, activationArgs...); + launch_vectorized_generic_op(stream, output, input, params); } else { - launch_vectorized_generic_op(stream, output, input, activationArgs...); + launch_vectorized_generic_op(stream, output, input, params); } } -template -void abs(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void tanh(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void swish(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void mish(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void sigmoid(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void bnll(const Stream& stream, Span output, View input) { - generic_op(stream, output, input); -} - -template -void elu(const Stream& stream, Span output, View 
input) { - generic_op(stream, output, input); -} - template void relu(const Stream& stream, Span output, View input, T slope) { - generic_op(stream, output, input, slope); + generic_op>(stream, output, input, {slope}); } template void clipped_relu(const Stream& stream, Span output, View input, T floor, T ceiling) { CV_Assert(static_cast(floor) <= static_cast(ceiling)); - generic_op(stream, output, input, floor, ceiling); + generic_op>(stream, output, input, {floor, ceiling}); +} + +template +void tanh(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void swish(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void mish(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void sigmoid(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void elu(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void bnll(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); +} + +template +void abs(const Stream& stream, Span output, View input) { + generic_op>(stream, output, input); } template @@ -143,31 +142,32 @@ void power(const Stream& stream, Span output, View input, T exp, T scale, return; } - generic_op(stream, output, input, exp, scale, shift); + generic_op>(stream, output, input, {exp, scale, shift}); } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) -template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input); +template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half); +template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); template void tanh<__half>(const Stream&, Span<__half>, View<__half>); template void swish<__half>(const Stream&, Span<__half>, View<__half>); template void 
mish<__half>(const Stream&, Span<__half>, View<__half>); template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>); -template void bnll<__half>(const Stream&, Span<__half>, View<__half>); template void elu<__half>(const Stream&, Span<__half>, View<__half>); -template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half); -template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); +template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input); +template void bnll<__half>(const Stream&, Span<__half>, View<__half>); template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half); #endif -template void abs(const Stream& stream, Span output, View input); + +template void relu(const Stream&, Span, View, float); +template void clipped_relu(const Stream&, Span, View, float, float); template void tanh(const Stream&, Span, View); template void swish(const Stream&, Span, View); template void mish(const Stream&, Span, View); template void sigmoid(const Stream&, Span, View); -template void bnll(const Stream&, Span, View); template void elu(const Stream&, Span, View); -template void relu(const Stream&, Span, View, float); -template void clipped_relu(const Stream&, Span, View, float, float); +template void abs(const Stream& stream, Span output, View input); +template void bnll(const Stream&, Span, View); template void power(const Stream&, Span, View, float, float, float); template static @@ -178,7 +178,7 @@ void launch_vectorized_axiswise_relu(const Stream& stream, Span output, View< auto kernel = raw::axiswise_relu_vec; auto policy = make_policy(kernel, output.size() / N, 0, stream); - launch_kernel(kernel, policy, output, input, inner_size, slope); + launch_kernel(kernel, policy, output, input, inner_size / N, slope); } template diff --git a/modules/dnn/src/cuda/bias_activation.cu b/modules/dnn/src/cuda/bias_activation.cu index 0acc2ff54d..fc5bcf351d 
100644 --- a/modules/dnn/src/cuda/bias_activation.cu +++ b/modules/dnn/src/cuda/bias_activation.cu @@ -20,103 +20,101 @@ using namespace cv::dnn::cuda4dnn::csl::device; namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { - template - __global__ void biasN_generic_op_inplace_vec(Span inplace_output, size_type inner_size, View bias, FunctorArgs ...functorArgs) { + template + __global__ void biasN_generic_op_inplace_vec(Span inplace_output, size_type inner_size, View bias, const typename ActivationOp::Params params) { using vector_type = get_vector_type_t; auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); - Functor functor(functorArgs...); + ActivationOp activation_op(params); - inner_size /= vector_type::size(); for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { - const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + const index_type bias_idx = (i / inner_size) % bias.size(); vector_type vec; v_load(vec, inplace_output_vPtr[i]); for(int j = 0; j < vec.size(); j++) - vec.data[j] = functor(vec.data[j] + bias[bias_idx]); + vec.data[j] = activation_op(vec.data[j] + bias[bias_idx]); v_store(inplace_output_vPtr[i], vec); } } } /* namespace raw */ -template class Activation, std::size_t N, class ...ActivationArgs> static -void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, ActivationArgs ...activationArgs) { +template static +void launch_vectorized_biasN_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, const typename ActivationOp::Params& params) { CV_Assert(inplace_output.size() % inner_size == 0); - CV_Assert(inplace_output.size() % bias.size() == 0); CV_Assert(is_fully_aligned(inplace_output, N)); CV_Assert(inner_size % N == 0); - auto kernel = raw::biasN_generic_op_inplace_vec, N, ActivationArgs...>; + auto kernel = 
raw::biasN_generic_op_inplace_vec; auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); - launch_kernel(kernel, policy, inplace_output, inner_size, bias, activationArgs...); + launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, params); } -template class Activation, class ...ActivationArgs> static -void biasN_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, ActivationArgs ...activationArgs) { +template static +void biasN_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, const typename ActivationOp::Params& params = {}) { if (is_fully_aligned(inplace_output, 4) && inner_size % 4 == 0) { - launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, activationArgs...); + launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, params); } else if (is_fully_aligned(inplace_output, 2) && inner_size % 2 == 0) { - launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, activationArgs...); + launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, params); } else { - launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, activationArgs...); + launch_vectorized_biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, params); } } template void biasN_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T slope) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, slope); + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias, {slope}); } template void biasN_clipped_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T floor, T ceil) { CV_Assert(static_cast(floor) <= static_cast(ceil)); - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, floor, ceil); -} - -template -void 
biasN_power_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T power, T scale, T shift) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias, power, scale, shift); + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias, {floor, ceil}); } template void biasN_tanh_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias); -} - -template -void biasN_sigmoid_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias); + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias); } template void biasN_swish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias); + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias); } template void biasN_mish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { - biasN_generic_op_inplace(stream, inplace_output, inner_size, bias); + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias); +} + +template +void biasN_sigmoid_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias) { + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias); +} + +template +void biasN_power_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, T power, T scale, T shift) { + biasN_generic_op_inplace>(stream, inplace_output, inner_size, bias, {power, scale, shift}); } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half); template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half); -template 
void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half); template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); -template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>); +template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half, __half); #endif template void biasN_relu_inplace(const Stream&, Span, std::size_t, View, float); template void biasN_clipped_relu_inplace(const Stream&, Span, std::size_t, View, float, float); -template void biasN_power_inplace(const Stream&, Span, std::size_t, View, float, float, float); template void biasN_tanh_inplace(const Stream&, Span, std::size_t, View); -template void biasN_sigmoid_inplace(const Stream&, Span, std::size_t, View); template void biasN_swish_inplace(const Stream&, Span, std::size_t, View); template void biasN_mish_inplace(const Stream&, Span, std::size_t, View); +template void biasN_sigmoid_inplace(const Stream&, Span, std::size_t, View); +template void biasN_power_inplace(const Stream&, Span, std::size_t, View, float, float, float); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/bias_activation_eltwise.cu b/modules/dnn/src/cuda/bias_activation_eltwise.cu new file mode 100644 index 0000000000..cb228eb092 --- /dev/null +++ b/modules/dnn/src/cuda/bias_activation_eltwise.cu @@ -0,0 +1,125 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include +#include + +#include "functors.hpp" +#include "types.hpp" +#include "vector_traits.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + +namespace raw { + + template + __global__ void biasN_generic_op_eltwise_op_inplace_vec(Span inplace_output, size_type inner_size, View bias, View eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + auto eltwise_vPtr = vector_type::get_pointer(eltwise.data()); + + ActivationOp activation_op(act_params); + EltwiseOp eltwise_op(eltwise_params); + + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % bias.size(); + + vector_type output_vec, eltwise_vec; + v_load(output_vec, inplace_output_vPtr[i]); + v_load(eltwise_vec, eltwise_vPtr[i]); + for(int j = 0; j < output_vec.size(); j++) + output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j] + bias[bias_idx]), eltwise_vec.data[j]); + v_store(inplace_output_vPtr[i], output_vec); + } + } +} + +template static +void launch_vectorized_biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(is_fully_aligned(eltwise, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_generic_op_eltwise_op_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream); + launch_kernel(kernel, policy, 
inplace_output, inner_size / N, bias, eltwise, act_params, eltwise_params); +} + +template static +void biasN_generic_op_eltwise_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) { + CV_Assert(inplace_output.size() == eltwise.size()); + + if (is_fully_aligned(inplace_output, 4) && is_fully_aligned(eltwise, 4) && inner_size % 4 == 0) { + launch_vectorized_biasN_generic_op_eltwise_op_inplace(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params); + } else if (is_fully_aligned(inplace_output, 2) && is_fully_aligned(eltwise, 2) && inner_size % 2 == 0) { + launch_vectorized_biasN_generic_op_eltwise_op_inplace(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params); + } else { + launch_vectorized_biasN_generic_op_eltwise_op_inplace(stream, inplace_output, inner_size, bias, eltwise, act_params, eltwise_params); + } +} + +template +void biasN_relu_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T slope) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise, {slope}); +} + +template +void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T floor, T ceiling) { + CV_Assert(static_cast(floor) <= static_cast(ceiling)); + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise, {floor, ceiling}); +} + +template +void biasN_tanh_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_swish_eltwise_sum_2_inplace(const Stream& stream, Span 
inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_mish_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_power_eltwise_sum_2_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T exp, T scale, T shift) { + biasN_generic_op_eltwise_op_inplace, SumFunctor>(stream, inplace_output, inner_size, bias, eltwise, {exp, scale, shift}); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void biasN_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half); +template void biasN_clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half); +template void biasN_tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half); +#endif + 
+template void biasN_relu_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View, float); +template void biasN_clipped_relu_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View, float, float); +template void biasN_tanh_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_swish_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_mish_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_sigmoid_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_power_eltwise_sum_2_inplace(const Stream&, Span, std::size_t, View, View, float, float, float); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/bias_eltwise_activation.cu b/modules/dnn/src/cuda/bias_eltwise_activation.cu new file mode 100644 index 0000000000..2498b859be --- /dev/null +++ b/modules/dnn/src/cuda/bias_eltwise_activation.cu @@ -0,0 +1,132 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include +#include + +#include "functors.hpp" +#include "types.hpp" +#include "vector_traits.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + +namespace raw { + template + __global__ void biasN_eltwise_op_generic_op_inplace_vec(Span inplace_output, size_type inner_size, View bias, View eltwise, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) { + using vector_type = get_vector_type_t; + + auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data()); + auto eltwise_vPtr = vector_type::get_pointer(eltwise.data()); + + EltwiseOp eltwise_op(eltwise_params); + ActivationOp activation_op(act_params); + + for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) { + const index_type bias_idx = (i / inner_size) % bias.size(); + + vector_type output_vec, eltwise_vec; + v_load(output_vec, inplace_output_vPtr[i]); + v_load(eltwise_vec, eltwise_vPtr[i]); + for(int j = 0; j < output_vec.size(); j++) + output_vec.data[j] = activation_op(eltwise_op(output_vec.data[j] + bias[bias_idx], eltwise_vec.data[j])); + v_store(inplace_output_vPtr[i], output_vec); + } + } +} + +template static +void launch_vectorized_biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) { + CV_Assert(is_fully_aligned(inplace_output, N)); + CV_Assert(inplace_output.size() % bias.size() == 0); + CV_Assert(is_fully_aligned(eltwise, N)); + CV_Assert(inner_size % N == 0); + + auto kernel = raw::biasN_eltwise_op_generic_op_inplace_vec; + auto policy = make_policy(kernel, inplace_output.size() / N, 0, 
stream); + launch_kernel(kernel, policy, inplace_output, inner_size / N, bias, eltwise, eltwise_params, act_params); +} + +template static +void biasN_eltwise_op_generic_op_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) { + CV_Assert(inplace_output.size() == eltwise.size()); + + if (is_fully_aligned(inplace_output, 4) && is_fully_aligned(eltwise, 4) && inner_size % 4 == 0) { + launch_vectorized_biasN_eltwise_op_generic_op_inplace(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params); + } else if (is_fully_aligned(inplace_output, 2) && is_fully_aligned(eltwise, 2) && inner_size % 2 == 0) { + launch_vectorized_biasN_eltwise_op_generic_op_inplace(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params); + } else { + launch_vectorized_biasN_eltwise_op_generic_op_inplace(stream, inplace_output, inner_size, bias, eltwise, eltwise_params, act_params); + } +} + +template +void biasN_eltwise_sum_2_identity_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_eltwise_op_generic_op_inplace, IdentityFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_eltwise_sum_2_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T slope) { + biasN_eltwise_op_generic_op_inplace, ReLUFunctor>(stream, inplace_output, inner_size, bias, eltwise, {}, {slope}); +} + +template +void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T floor, T ceiling) { + CV_Assert(static_cast(floor) <= static_cast(ceiling)); + biasN_eltwise_op_generic_op_inplace, ClippedReLUFunctor>(stream, inplace_output, inner_size, bias, eltwise, {}, {floor, ceiling}); +} + +template +void 
biasN_eltwise_sum_2_tanh_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_eltwise_op_generic_op_inplace, TanHFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_eltwise_sum_2_swish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_eltwise_op_generic_op_inplace, SwishFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_eltwise_sum_2_mish_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_eltwise_op_generic_op_inplace, MishFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_eltwise_sum_2_sigmoid_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise) { + biasN_eltwise_op_generic_op_inplace, SigmoidFunctor>(stream, inplace_output, inner_size, bias, eltwise); +} + +template +void biasN_eltwise_sum_2_power_inplace(const Stream& stream, Span inplace_output, std::size_t inner_size, View bias, View eltwise, T exp, T scale, T shift) { + biasN_eltwise_op_generic_op_inplace, PowerFunctor>(stream, inplace_output, inner_size, bias, eltwise, {}, {exp, scale, shift}); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void biasN_eltwise_sum_2_identity_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_eltwise_sum_2_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half); +template void biasN_eltwise_sum_2_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half); +template void biasN_eltwise_sum_2_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_eltwise_sum_2_swish_inplace<__half>(const Stream&, Span<__half>, 
std::size_t, View<__half>, View<__half>); +template void biasN_eltwise_sum_2_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_eltwise_sum_2_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>); +template void biasN_eltwise_sum_2_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, View<__half>, __half, __half, __half); +#endif + +template void biasN_eltwise_sum_2_identity_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_eltwise_sum_2_relu_inplace(const Stream&, Span, std::size_t, View, View, float); +template void biasN_eltwise_sum_2_clipped_relu_inplace(const Stream&, Span, std::size_t, View, View, float, float); +template void biasN_eltwise_sum_2_tanh_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_eltwise_sum_2_swish_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_eltwise_sum_2_mish_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_eltwise_sum_2_sigmoid_inplace(const Stream&, Span, std::size_t, View, View); +template void biasN_eltwise_sum_2_power_inplace(const Stream&, Span, std::size_t, View, View, float, float, float); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/eltwise_activation.cu b/modules/dnn/src/cuda/eltwise_activation.cu new file mode 100644 index 0000000000..6a40311b72 --- /dev/null +++ b/modules/dnn/src/cuda/eltwise_activation.cu @@ -0,0 +1,125 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include +#include + +#include "functors.hpp" +#include "types.hpp" +#include "vector_traits.hpp" +#include "grid_stride_range.hpp" +#include "execution.hpp" + +#include "../cuda4dnn/csl/stream.hpp" +#include "../cuda4dnn/csl/span.hpp" + +using namespace cv::dnn::cuda4dnn::csl; +using namespace cv::dnn::cuda4dnn::csl::device; + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + +namespace raw { + + template + __global__ void eltwise_op_generic_op_vec(Span output, View x, View y, const typename EltwiseOp::Params eltwise_params, const typename ActivationOp::Params act_params) { + using vector_type = get_vector_type_t; + + auto output_vPtr = vector_type::get_pointer(output.data()); + auto x_vPtr = vector_type::get_pointer(x.data()); + auto y_vPtr = vector_type::get_pointer(y.data()); + + EltwiseOp eltwise_op(eltwise_params); + ActivationOp activation_op(act_params); + + for (auto i : grid_stride_range(output.size() / vector_type::size())) { + vector_type vec_x, vec_y; + v_load(vec_x, x_vPtr[i]); + v_load(vec_y, y_vPtr[i]); + for(int j = 0; j < vec_x.size(); j++) + vec_x.data[j] = activation_op(eltwise_op(vec_x.data[j], vec_y.data[j])); + v_store(output_vPtr[i], vec_x); + } + } +} + +template static +void launch_vectorized_eltwise_op_generic_op(const Stream& stream, Span output, View x, View y, const typename EltwiseOp::Params& eltwise_params, const typename ActivationOp::Params& act_params) { + CV_Assert(is_fully_aligned(output, N)); + CV_Assert(is_fully_aligned(x, N)); + CV_Assert(is_fully_aligned(y, N)); + + auto kernel = raw::eltwise_op_generic_op_vec; + auto policy = make_policy(kernel, output.size() / N, 0, stream); + launch_kernel(kernel, policy, output, x, y, eltwise_params, act_params); +} + +template static +void eltwise_op_generic_op(const Stream& stream, Span output, View x, View y, const typename EltwiseOp::Params& eltwise_params = {}, const typename ActivationOp::Params& act_params = {}) { + CV_Assert(output.size() == x.size()); 
+ CV_Assert(output.size() == y.size()); + + if (is_fully_aligned(output, 4) && is_fully_aligned(x, 4) && is_fully_aligned(y, 4)) { + launch_vectorized_eltwise_op_generic_op(stream, output, x, y, eltwise_params, act_params); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(x, 2) && is_fully_aligned(y, 4)) { + launch_vectorized_eltwise_op_generic_op(stream, output, x, y, eltwise_params, act_params); + } else { + launch_vectorized_eltwise_op_generic_op(stream, output, x, y, eltwise_params, act_params); + } +} + +template +void eltwise_sum_2_relu(const Stream& stream, Span output, View x, View y, T slope) { + eltwise_op_generic_op, ReLUFunctor>(stream, output, x, y, {}, {slope}); +} + +template +void eltwise_sum_2_clipped_relu(const Stream& stream, Span output, View x, View y, T floor, T ceiling) { + CV_Assert(static_cast(floor) <= static_cast(ceiling)); + eltwise_op_generic_op, ClippedReLUFunctor>(stream, output, x, y, {}, {floor, ceiling}); +} + +template +void eltwise_sum_2_tanh(const Stream& stream, Span output, View x, View y) { + eltwise_op_generic_op, TanHFunctor>(stream, output, x, y); +} + +template +void eltwise_sum_2_swish(const Stream& stream, Span output, View x, View y) { + eltwise_op_generic_op, SwishFunctor>(stream, output, x, y); +} + +template +void eltwise_sum_2_mish(const Stream& stream, Span output, View x, View y) { + eltwise_op_generic_op, MishFunctor>(stream, output, x, y); +} + +template +void eltwise_sum_2_sigmoid(const Stream& stream, Span output, View x, View y) { + eltwise_op_generic_op, SigmoidFunctor>(stream, output, x, y); +} + +template +void eltwise_sum_2_power(const Stream& stream, Span output, View x, View y, T exp, T scale, T shift) { + eltwise_op_generic_op, PowerFunctor>(stream, output, x, y, {}, {exp, scale, shift}); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void eltwise_sum_2_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half); +template void 
eltwise_sum_2_clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half); +template void eltwise_sum_2_tanh<__half>(const Stream&, Span<__half>, View<__half>, View<__half>); +template void eltwise_sum_2_swish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>); +template void eltwise_sum_2_mish<__half>(const Stream&, Span<__half>, View<__half>, View<__half>); +template void eltwise_sum_2_sigmoid<__half>(const Stream&, Span<__half>, View<__half>, View<__half>); +template void eltwise_sum_2_power<__half>(const Stream&, Span<__half>, View<__half>, View<__half>, __half, __half, __half); +#endif + +template void eltwise_sum_2_relu(const Stream&, Span, View, View, float); +template void eltwise_sum_2_clipped_relu(const Stream&, Span, View, View, float, float); +template void eltwise_sum_2_tanh(const Stream&, Span, View, View); +template void eltwise_sum_2_swish(const Stream&, Span, View, View); +template void eltwise_sum_2_mish(const Stream&, Span, View, View); +template void eltwise_sum_2_sigmoid(const Stream&, Span, View, View); +template void eltwise_sum_2_power(const Stream&, Span, View, View, float, float, float); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index a7d06e63a1..b24801531f 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -21,77 +21,77 @@ using namespace cv::dnn::cuda4dnn::csl::device; namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { - template - __global__ void eltwise_op_vec(Span output, View x, View y, FunctorArgs ...functorArgs) { + template + __global__ void eltwise_op_vec(Span output, View x, View y, const typename EltwiseOp::Params params) { using vector_type = get_vector_type_t; auto output_vPtr = vector_type::get_pointer(output.data()); auto x_vPtr = vector_type::get_pointer(x.data()); auto y_vPtr = 
vector_type::get_pointer(y.data()); - Functor functor(functorArgs...); + EltwiseOp eltwise_op(params); for (auto i : grid_stride_range(output.size() / vector_type::size())) { vector_type vec_x, vec_y; v_load(vec_x, x_vPtr[i]); v_load(vec_y, y_vPtr[i]); for (int j = 0; j < vector_type::size(); j++) - vec_x.data[j] = functor(vec_x.data[j], vec_y.data[j]); + vec_x.data[j] = eltwise_op(vec_x.data[j], vec_y.data[j]); v_store(output_vPtr[i], vec_x); } } } -template class EltwiseOp, std::size_t N, class ...EltwiseOpArgs> static -void launch_vectorized_eltwise_op(const Stream& stream, Span output, View x, View y, EltwiseOpArgs ...eltwiseOpArgs) { +template static +void launch_vectorized_eltwise_op(const Stream& stream, Span output, View x, View y, const typename EltwiseOp::Params& params) { CV_Assert(x.size() == y.size()); CV_Assert(x.size() == output.size()); CV_Assert(is_fully_aligned(output, N)); CV_Assert(is_fully_aligned(x, N)); CV_Assert(is_fully_aligned(y, N)); - auto kernel = raw::eltwise_op_vec, N, EltwiseOpArgs...>; + auto kernel = raw::eltwise_op_vec; auto policy = make_policy(kernel, output.size() / N, 0, stream); - launch_kernel(kernel, policy, output, x, y, eltwiseOpArgs...); + launch_kernel(kernel, policy, output, x, y, params); } -template class EltwiseOp, class ...EltwiseOpArgs> static -void eltwise_op(const Stream& stream, Span output, View x, View y, EltwiseOpArgs ...eltwiseOpArgs) { +template static +void eltwise_op(const Stream& stream, Span output, View x, View y, const typename EltwiseOp::Params& params = {}) { CV_Assert(x.size() == y.size()); CV_Assert(x.size() == output.size()); if (is_fully_aligned(output, 4) && is_fully_aligned(x, 4) && is_fully_aligned(y, 4)) { - launch_vectorized_eltwise_op(stream, output, x, y, eltwiseOpArgs...); + launch_vectorized_eltwise_op(stream, output, x, y, params); } else if (is_fully_aligned(output, 2) && is_fully_aligned(x, 2) && is_fully_aligned(y, 2)) { - launch_vectorized_eltwise_op(stream, output, x, y, 
eltwiseOpArgs...); + launch_vectorized_eltwise_op(stream, output, x, y, params); } else { - launch_vectorized_eltwise_op(stream, output, x, y, eltwiseOpArgs...); + launch_vectorized_eltwise_op(stream, output, x, y, params); } } template void eltwise_max_2(const Stream& stream, Span output, View x, View y) { - eltwise_op(stream, output, x, y); + eltwise_op>(stream, output, x, y); } template void eltwise_sum_2(const Stream& stream, Span output, View x, View y) { - eltwise_op(stream, output, x, y); + eltwise_op>(stream, output, x, y); } template void eltwise_sum_coeff_2(const Stream& stream, Span output, T coeff_x, View x, T coeff_y, View y) { - eltwise_op(stream, output, x, y, coeff_x, coeff_y); + eltwise_op>(stream, output, x, y, {coeff_x, coeff_y}); } template void eltwise_prod_2(const Stream& stream, Span output, View x, View y) { - eltwise_op(stream, output, x, y); + eltwise_op>(stream, output, x, y); } template void eltwise_div_2(const Stream& stream, Span output, View x, View y) { - eltwise_op(stream, output, x, y); + eltwise_op>(stream, output, x, y); } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp index 5206522abf..9ba61f7145 100644 --- a/modules/dnn/src/cuda/functors.hpp +++ b/modules/dnn/src/cuda/functors.hpp @@ -9,27 +9,87 @@ #include "math.hpp" +#include "../cuda4dnn/csl/nvcc_defs.hpp" + namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template -struct abs_functor { - __device__ T operator()(T value) { - using csl::device::abs; - return abs(value); - } +struct IdentityFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE IdentityFunctor() { } + CUDA4DNN_DEVICE IdentityFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { + return value; + }; }; template -struct tanh_functor { - __device__ T operator()(T value) { +struct ReLUFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() : 
slope(0) { } + CUDA4DNN_HOST_DEVICE Params(T slope_) : slope(slope_) { } + T slope; + }; + + CUDA4DNN_DEVICE ReLUFunctor() : ReLUFunctor(Params{}) { } + CUDA4DNN_DEVICE ReLUFunctor(const Params& params) : slope(params.slope) { } + + CUDA4DNN_DEVICE T operator()(T value) { + using csl::device::log1pexp; + return value >= T(0) ? value : slope * value; + } + + T slope; +}; + +template +struct ClippedReLUFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() : floor(0), ceiling(6) { } + CUDA4DNN_HOST_DEVICE Params(T floor_, T ceiling_) : floor(floor_), ceiling(ceiling_) { } + T floor, ceiling; + }; + + CUDA4DNN_DEVICE ClippedReLUFunctor() : ClippedReLUFunctor(Params{}) { } + CUDA4DNN_DEVICE ClippedReLUFunctor(const Params& params) : floor{params.floor}, ceiling{params.ceiling} { } + + CUDA4DNN_DEVICE T operator()(T value) { + using csl::device::clamp; + return clamp(value, floor, ceiling); + } + + T floor, ceiling; +}; + +template +struct TanHFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE TanHFunctor() { } + CUDA4DNN_DEVICE TanHFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::tanh; return tanh(value); } }; template -struct swish_functor { - __device__ T operator()(T value) { +struct SwishFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE SwishFunctor() { } + CUDA4DNN_DEVICE SwishFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { // f(x) = x * sigmoid(x) using csl::device::fast_divide; using csl::device::fast_exp; @@ -38,8 +98,15 @@ struct swish_functor { }; template -struct mish_functor { - __device__ T operator()(T value) { +struct MishFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE MishFunctor() { } + CUDA4DNN_DEVICE MishFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::tanh; using csl::device::log1pexp; return value * 
tanh(log1pexp(value)); @@ -47,8 +114,15 @@ struct mish_functor { }; template <> -struct mish_functor { - __device__ float operator()(float value) { +struct MishFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE MishFunctor() { } + CUDA4DNN_DEVICE MishFunctor(const Params& params) { } + + CUDA4DNN_DEVICE float operator()(float value) { // f(x) = x * tanh(log1pexp(x)); using csl::device::fast_divide; using csl::device::fast_exp; @@ -63,63 +137,90 @@ struct mish_functor { #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template <> -struct mish_functor<__half> { - __device__ __half operator()(__half value) { - return mish_functor()(value); +struct MishFunctor<__half> { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE MishFunctor() { } + CUDA4DNN_DEVICE MishFunctor(const Params& params) { } + + CUDA4DNN_DEVICE __half operator()(__half value) { + return MishFunctor()(value); } }; #endif template -struct sigmoid_functor { - __device__ T operator()(T value) { +struct SigmoidFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE SigmoidFunctor() { } + CUDA4DNN_DEVICE SigmoidFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::fast_sigmoid; return fast_sigmoid(value); } }; template -struct bnll_functor { - __device__ T operator()(T value) { - using csl::device::log1pexp; - return value > T(0) ? value + log1pexp(-value) : log1pexp(value); - } -}; +struct ELUFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; -template -struct elu_functor { - __device__ T operator()(T value) { + CUDA4DNN_DEVICE ELUFunctor() { } + CUDA4DNN_DEVICE ELUFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::expm1; return value >= T(0) ? 
value : expm1(value); } }; template -struct relu_functor { - __device__ relu_functor(T slope_) : slope{slope_} { } - __device__ T operator()(T value) { +struct AbsFunctor { + struct Params { }; + + CUDA4DNN_DEVICE AbsFunctor() { } + CUDA4DNN_DEVICE AbsFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { + using csl::device::abs; + return abs(value); + } +}; + +template +struct BNLLFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE BNLLFunctor() { } + CUDA4DNN_DEVICE BNLLFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::log1pexp; - return value >= T(0) ? value : slope * value; + return value > T(0) ? value + log1pexp(-value) : log1pexp(value); } - - T slope; }; template -struct clipped_relu_functor { - __device__ clipped_relu_functor(T floor_, T ceiling_) : floor{floor_}, ceiling{ceiling_} { } - __device__ T operator()(T value) { - using csl::device::clamp; - return clamp(value, floor, ceiling); - } +struct PowerFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() : exp(1), scale(1), shift(0) { } + CUDA4DNN_HOST_DEVICE Params(T exp_, T scale_, T shift_) : exp(exp_), scale(scale_), shift(shift_) { } + T exp, scale, shift; + }; - T floor, ceiling; -}; + CUDA4DNN_DEVICE PowerFunctor() : PowerFunctor(Params{}) { } + CUDA4DNN_DEVICE PowerFunctor(const Params& params) : exp{params.exp}, scale{params.scale}, shift{params.shift} { } -template -struct power_functor { - __device__ power_functor(T exp_, T scale_, T shift_) : exp{exp_}, scale{scale_}, shift{shift_} { } - __device__ T operator()(T value) { + CUDA4DNN_DEVICE T operator()(T value) { using csl::device::pow; return pow(shift + scale * value, exp); } @@ -128,36 +229,70 @@ struct power_functor { }; template -struct max_functor { - __device__ T operator()(T x, T y) { +struct MaxFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE MaxFunctor() { } + CUDA4DNN_DEVICE 
MaxFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { using csl::device::max; return max(x, y); } }; template -struct sum_functor { - __device__ T operator()(T x, T y) { return x + y; } +struct SumFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE SumFunctor() { } + CUDA4DNN_DEVICE SumFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { return x + y; } }; template -struct scaled_sum_functor { - __device__ scaled_sum_functor(T scale_x_, T scale_y_) - : scale_x{scale_x_}, scale_y{scale_y_} { } +struct ScaledSumFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() : scale_x(1), scale_y(1) { } + CUDA4DNN_HOST_DEVICE Params(T scale_x_, T scale_y_) : scale_x(scale_x_), scale_y(scale_y_) { } + T scale_x, scale_y; + }; - __device__ T operator()(T x, T y) { return scale_x * x + scale_y * y; } + CUDA4DNN_DEVICE ScaledSumFunctor() : scale_x(1), scale_y(1) { } + CUDA4DNN_DEVICE ScaledSumFunctor(const Params& params) : scale_x{params.scale_x}, scale_y{params.scale_y} { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { return scale_x * x + scale_y * y; } T scale_x, scale_y; }; template -struct product_functor { - __device__ T operator()(T x, T y) { return x * y; } +struct ProductFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE ProductFunctor() { } + CUDA4DNN_DEVICE ProductFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { return x * y; } }; template -struct div_functor { - __device__ T operator()(T x, T y) { return x / y; } +struct DivFunctor { + struct Params { + CUDA4DNN_HOST_DEVICE Params() { } + }; + + CUDA4DNN_DEVICE DivFunctor() { } + CUDA4DNN_DEVICE DivFunctor(const Params& params) { } + + CUDA4DNN_DEVICE T operator()(T x, T y) { return x / y; } }; }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/scale_shift.cu b/modules/dnn/src/cuda/scale_shift.cu index 36bdb7a261..3c20a884ef 
100644 --- a/modules/dnn/src/cuda/scale_shift.cu +++ b/modules/dnn/src/cuda/scale_shift.cu @@ -33,7 +33,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { inner_size /= vector_type::size(); for (auto i : grid_stride_range(output.size() / vector_type::size())) { - const index_type bias_idx = (i / inner_size) % static_cast(bias.size()); + const index_type bias_idx = (i / inner_size) % bias.size(); vector_type vec; v_load(vec, input_vPtr[i]); @@ -53,7 +53,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { inner_size /= vector_type::size(); for (auto i : grid_stride_range(output.size() / vector_type::size())) { - const index_type scale_idx = (i / inner_size) % static_cast(weights.size()); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec; v_load(vec, input_vPtr[i]); @@ -90,7 +90,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { inner_size /= vector_type::size(); for (auto i : grid_stride_range(output.size() / vector_type::size())) { - const index_type scale_idx = (i / inner_size) % static_cast(weights.size()); + const index_type scale_idx = (i / inner_size) % weights.size(); vector_type vec; v_load(vec, input_vPtr[i]); diff --git a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp index cad4b294d1..93f3101bf6 100644 --- a/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp +++ b/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp @@ -537,6 +537,101 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cu outputDesc.get(), outputPtr.get())); } + /** @brief performs convolution, bias addition, eltwise addition and activation simultaneously + * + * dstValue = act(alpha1 * conv(input) + bias + alpha2 * eltwise) + * + * @tparam T convolution element type (must be `half` or `float`) + * + * @param handle valid cuDNN Handle + * @param convDesc convolution description + * @param convAlgo 
algorithm to use for convolution + * @param workspace workspace memory which meets the requirements of \p convAlgo + * @param filterDesc filter descriptor + * @param[in] filterPtr pointer to device memory containing the filters + * @param alpha1 convolution scale factor + * @param inputDesc tensor descriptor describing the input + * @param[in] inputPtr pointer to input tensor in device memory + * @param biasDesc tensor descriptor describing the bias + * @param[in] biasPtr pointer to bias tensor in device memory + * @param alpha2 eltwise scale factor + * @param eltwiseDesc tensor descriptor describing the eltwise tensor + * @param[in] eltwisePtr pointer to the eltwise tensor in device memory + * @param actDesc activation descriptor + * @param outputDesc tensor descriptor describing the output + * @param[out] outputPtr pointer to output tensor in device memory + * + * Exception Guarantee: Basic + */ + template + void convolve_with_bias_eltwise_activation( + const Handle& handle, + T alpha1, + const ConvolutionDescriptor& convDesc, + const ConvolutionAlgorithm& convAlgo, + WorkspaceInstance workspace, + const FilterDescriptor& filterDesc, + DevicePtr filterPtr, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + const TensorDescriptor& biasDesc, + DevicePtr biasPtr, + T alpha2, + const TensorDescriptor& eltwiseDesc, + DevicePtr eltwisePtr, + const ActivationDescriptor& actDesc, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CV_Assert(handle); + + CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward( + handle.get(), + &alpha1, inputDesc.get(), inputPtr.get(), + filterDesc.get(), filterPtr.get(), + convDesc.get(), convAlgo.get(), + static_cast(workspace.get()), workspace.size_in_bytes(), + &alpha2, eltwiseDesc.get(), eltwisePtr.get(), + biasDesc.get(), biasPtr.get(), + actDesc.get(), + outputDesc.get(), outputPtr.get())); + } + + template <> inline + void convolve_with_bias_eltwise_activation( + const Handle& handle, + half alpha1, + 
const ConvolutionDescriptor& convDesc, + const ConvolutionAlgorithm& convAlgo, + WorkspaceInstance workspace, + const FilterDescriptor& filterDesc, + DevicePtr filterPtr, + const TensorDescriptor& inputDesc, + DevicePtr inputPtr, + const TensorDescriptor& biasDesc, + DevicePtr biasPtr, + half alpha2, + const TensorDescriptor& eltwiseDesc, + DevicePtr eltwisePtr, + const ActivationDescriptor& actDesc, + const TensorDescriptor& outputDesc, + DevicePtr outputPtr) + { + CV_Assert(handle); + + float alpha1_ = alpha1, alpha2_ = alpha2; + CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward( + handle.get(), + &alpha1_, inputDesc.get(), inputPtr.get(), + filterDesc.get(), filterPtr.get(), + convDesc.get(), convAlgo.get(), + static_cast(workspace.get()), workspace.size_in_bytes(), + &alpha2_, eltwiseDesc.get(), eltwisePtr.get(), + biasDesc.get(), biasPtr.get(), + actDesc.get(), + outputDesc.get(), outputPtr.get())); + } + }}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ #endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/csl/span.hpp b/modules/dnn/src/cuda4dnn/csl/span.hpp index 55e8e5f47c..ea87b17ef4 100644 --- a/modules/dnn/src/cuda4dnn/csl/span.hpp +++ b/modules/dnn/src/cuda4dnn/csl/span.hpp @@ -8,6 +8,8 @@ #include "pointer.hpp" #include "nvcc_defs.hpp" +#include "../../cuda/types.hpp" + #include #include @@ -24,17 +26,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { public: using value_type = T; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; + using size_type = device::size_type; + using index_type = device::index_type; using pointer = DevicePtr; using const_pointer = DevicePtr::type>; using reference = typename std::add_lvalue_reference::type; using const_reference = typename std::add_lvalue_reference::type>; - using iterator = pointer; - using const_iterator = const_pointer; - Span() noexcept : ptr{ nullptr }, sz{ 0 } { } CUDA4DNN_HOST_DEVICE Span(pointer first, 
pointer last) noexcept : ptr{ first }, sz{ last - first } { } CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { } @@ -42,7 +41,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; } CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; } - CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; } + CUDA4DNN_DEVICE reference operator[](index_type index) const { return ptr[index]; } CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; } template::type, diff --git a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp index efea967650..aeddaf353b 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp @@ -152,6 +152,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { /* bias and activation (only RELU supported) */ std::vector bias_shape; ActivationType activation_type; /* MUST BE identity if there is no bias and ReLU if there is bias */ + bool eltwise; }; Convolution() = default; @@ -164,19 +165,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { filterDesc = FilterDescriptor(params.filter_shape); convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups); - if (!params.bias_shape.empty()) { - CV_Assert(params.activation_type == ActivationType::RELU); - biasTensorDesc = TensorDescriptor(params.bias_shape); - activationDesc = ActivationDescriptor(params.activation_type, 0.0); - } else { - CV_Assert(params.activation_type == ActivationType::IDENTITY); - } - std::vector output_dims; getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); outputTensorDesc = TensorDescriptor(output_dims); algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); + + 
if (!params.bias_shape.empty()) { + CV_Assert(params.activation_type == ActivationType::RELU); + biasTensorDesc = TensorDescriptor(params.bias_shape); + if (params.eltwise) + eltwiseTensorDesc = TensorDescriptor(output_dims); + activationDesc = ActivationDescriptor(params.activation_type, 0.0); + } else { + CV_Assert(params.activation_type == ActivationType::IDENTITY); + } } Convolution& operator=(const Convolution&) = delete; @@ -208,6 +211,19 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { ); } + void convolve_with_bias_eltwise_activation(TensorSpan output, TensorView input, TensorView filters, TensorView bias, TensorView eltwise, WorkspaceInstance scratchpad) { + cudnn::convolve_with_bias_eltwise_activation( + cudnnHandle, + 1.0, convDesc, algo, scratchpad, + filterDesc, filters.get(), + inputTensorDesc, input.get(), + biasTensorDesc, bias.get(), + 1.0, eltwiseTensorDesc, eltwise.get(), + activationDesc, + outputTensorDesc, output.get() + ); + } + private: cudnn::Handle cudnnHandle; TensorDescriptor inputTensorDesc, outputTensorDesc; @@ -215,6 +231,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { ConvolutionDescriptor convDesc; ConvolutionAlgorithm algo; TensorDescriptor biasTensorDesc; + TensorDescriptor eltwiseTensorDesc; ActivationDescriptor activationDesc; }; diff --git a/modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp b/modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp new file mode 100644 index 0000000000..62f144e7f8 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/kernels/activation_eltwise.hpp @@ -0,0 +1,40 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + /* inplace_output = activation(inplace_output) + eltwise */ + + template + void relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise, T slope); + + template + void clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise, T floor, T ceiling); + + template + void tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise); + + template + void swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise); + + template + void mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise); + + template + void sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise); + + template + void power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, csl::View eltwise, T exp, T scale, T shift); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATION_ELTWISE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/activations.hpp b/modules/dnn/src/cuda4dnn/kernels/activations.hpp index 8a7ebb26f5..46f697fce3 100644 --- a/modules/dnn/src/cuda4dnn/kernels/activations.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/activations.hpp @@ -13,7 +13,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template - void abs(const csl::Stream& stream, csl::Span output, csl::View input); + void relu(const csl::Stream& stream, csl::Span output, csl::View input, T slope); + + template + void clipped_relu(const csl::Stream& stream, csl::Span output, csl::View input, T floor, T 
ceiling); + + template + void axiswise_relu(const csl::Stream& stream, csl::Span output, csl::View input, std::size_t inner_size, csl::View slope); template void tanh(const csl::Stream& stream, csl::Span output, csl::View input); @@ -27,20 +33,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void sigmoid(const csl::Stream& stream, csl::Span output, csl::View input); - template - void bnll(const csl::Stream& stream, csl::Span output, csl::View input); - template void elu(const csl::Stream& stream, csl::Span output, csl::View input); template - void relu(const csl::Stream& stream, csl::Span output, csl::View input, T slope); + void abs(const csl::Stream& stream, csl::Span output, csl::View input); template - void clipped_relu(const csl::Stream& stream, csl::Span output, csl::View input, T floor, T ceiling); - - template - void axiswise_relu(const csl::Stream& stream, csl::Span output, csl::View input, std::size_t inner_size, csl::View slope); + void bnll(const csl::Stream& stream, csl::Span output, csl::View input); template void power(const csl::Stream& stream, csl::Span output, csl::View input, T exp, T scale, T shift); diff --git a/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp b/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp index 500f9bb567..252193cef2 100644 --- a/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp @@ -18,21 +18,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T floor, T ceiling); - template - void biasN_power_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T exp, T scale, T shift); - template void biasN_tanh_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); - template - 
void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); - template void biasN_swish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); template void biasN_mish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + template + void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias); + + template + void biasN_power_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, T exp, T scale, T shift); + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp b/modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp new file mode 100644 index 0000000000..0607a80ef3 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/kernels/bias_activation_eltwise.hpp @@ -0,0 +1,42 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + /* inplace_output = activation(inplace_output + bias) + eltwise + * broadcasting on `bias` is allowed but not on `eltwise` + */ + + template + void biasN_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T slope); + + template + void biasN_clipped_relu_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T floor, T ceiling); + + template + void biasN_tanh_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_sigmoid_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_swish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_mish_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_power_eltwise_sum_2_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T exp, T scale, T shift); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_ELTWISE_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp b/modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp new file mode 100644 index 0000000000..9f8bd630e4 --- /dev/null +++ 
b/modules/dnn/src/cuda4dnn/kernels/bias_eltwise_activation.hpp @@ -0,0 +1,45 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + /* inplace_output = activation(inplace_output + bias + eltwise) + * broadcasting on `bias` is allowed but not on `eltwise` + */ + + template + void biasN_eltwise_sum_2_identity_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_eltwise_sum_2_relu_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T slope); + + template + void biasN_eltwise_sum_2_clipped_relu_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T floor, T ceiling); + + template + void biasN_eltwise_sum_2_tanh_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_eltwise_sum_2_swish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_eltwise_sum_2_mish_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_eltwise_sum_2_sigmoid_inplace(const csl::Stream& stream, csl::Span inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise); + + template + void biasN_eltwise_sum_2_power_inplace(const csl::Stream& stream, csl::Span 
inplace_output, std::size_t inner_size, csl::View bias, csl::View eltwise, T exp, T scale, T shift); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ELTWISE_ACTIVATION_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp new file mode 100644 index 0000000000..2601de223a --- /dev/null +++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_activation.hpp @@ -0,0 +1,40 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" + +#include + +namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { + + /* output = activation(x + y) */ + + template + void eltwise_sum_2_relu(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y, T slope); + + template + void eltwise_sum_2_clipped_relu(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y, T floor, T ceiling); + + template + void eltwise_sum_2_tanh(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y); + + template + void eltwise_sum_2_swish(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y); + + template + void eltwise_sum_2_mish(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y); + + template + void eltwise_sum_2_sigmoid(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y); + + template + void eltwise_sum_2_power(const csl::Stream& stream, csl::Span output, csl::View x, csl::View y, T exp, T scale, T shift); + +}}}} /* namespace cv::dnn::cuda4dnn::kernels */ + +#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_ACTIVATION_HPP */ diff --git 
a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp index 282ae7cf77..0129a7ed2a 100644 --- a/modules/dnn/src/cuda4dnn/primitives/convolution.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/convolution.hpp @@ -11,9 +11,16 @@ #include "../csl/stream.hpp" #include "../csl/tensor.hpp" #include "../csl/tensor_ops.hpp" + #include "../kernels/scale_shift.hpp" #include "../kernels/activations.hpp" +#include "../kernels/activation_eltwise.hpp" #include "../kernels/bias_activation.hpp" +#include "../kernels/bias_eltwise_activation.hpp" +#include "../kernels/bias_activation_eltwise.hpp" +#include "../kernels/activation_eltwise.hpp" +#include "../kernels/eltwise_activation.hpp" +#include "../kernels/eltwise_ops.hpp" #include @@ -47,11 +54,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { /* group count for grouped convolution */ std::size_t groups; + enum class FusionMode { + NONE, + ACTIVATION, /* act(conv) */ + ELTWISE_SUM, /* eltwise + conv */ /* eltwise tensor is passed as second input to forward */ + ELTWISE_SUM_THEN_ACTIVATION, /* act(conv + eltwise) */ + ACTIVATION_THEN_ELTWISE_SUM, /* act(conv) + eltwise */ + }; + + FusionMode fusion_mode; + enum class ActivationType { IDENTITY, RELU, /* uses value provided in `relu_negative_slope` */ CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */ - POWER, /* scale and shift fused beforehand (fuseWeights); only `power_exp` is handled by CUDA */ + POWER, /* scale and shift fused with weights and bias; only `power_exp` is handled here */ TANH, SIGMOID, SWISH, @@ -67,16 +84,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { public: using wrapper_type = GetCUDABackendWrapperType; - ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias) - : stream(std::move(stream_)), cudnnHandle(std::move(handle)) + ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle_, 
const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias) + : stream(std::move(stream_)), cudnnHandle(std::move(handle_)) { const auto& kernel_size = config.kernel_size; const auto& dilations = config.dilations; const auto& strides = config.strides; const auto convolution_order = kernel_size.size(); - CV_Assert(convolution_order > 1); - CV_Assert(convolution_order == dilations.size()); CV_Assert(convolution_order == strides.size()); @@ -87,8 +102,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { const auto groups = config.groups; - if (convolution_order > 3) - CV_Error(Error::StsNotImplemented, "Only 2D/3D convolution is supported."); + CV_Assert (1 < convolution_order && convolution_order <= 3); const auto rank = input_shape.size(); const auto output_feature_maps = output_shape[1]; @@ -204,32 +218,63 @@ namespace cv { namespace dnn { namespace cuda4dnn { params.dilation = dilations; params.groups = config.groups; - /* check if we can perform fused convolution using cudnn */ - params.activation_type = csl::Convolution::ActivationType::IDENTITY; - fusion_location = InternalFusionLocation::NATIVE; - if (!biasTensor.empty() && - biasTensor.size() == output_feature_maps && /* cuDNN requirement */ - config.activation_type == ConvolutionConfiguration::ActivationType::RELU && - config.relu_negative_slope == 0.0) - { - fusion_location = InternalFusionLocation::CUDNN; - auto bias_shape = std::vector(rank, 1); - bias_shape[1] = output_feature_maps; - params.bias_shape = bias_shape; - params.activation_type = csl::Convolution::ActivationType::RELU; - } - - convoluter = csl::Convolution(cudnnHandle, params); - + fusion_mode = config.fusion_mode; activation = config.activation_type; relu_negative_slope = config.relu_negative_slope; crelu_floor = config.crelu_floor; crelu_ceil = config.crelu_ceil; power_exp = config.power_exp; + /* the scale and shift parameters of POWER have already been fused with weights and bias */ if (activation == 
ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f) activation = ConvolutionConfiguration::ActivationType::IDENTITY; + /* we normally use cuDNN for convolution and perform bias, activation and eltwise ops ourselves + * hence, the activation for cuDNN is IDENTITY by default + */ + fusion_location = InternalFusionLocation::NATIVE; /* i.e. we perform bias, act and eltwise */ + params.eltwise = false; + params.activation_type = csl::Convolution::ActivationType::IDENTITY; + + /* cuDNN can fuse the operations with convolution in some cases; try if it's possible */ + if (!biasTensor.empty() && + biasTensor.size() == output_feature_maps && /* cuDNN requirement */ + activation == ConvolutionConfiguration::ActivationType::RELU && /* cuDNN requirement */ + relu_negative_slope == 0.0 && /* cuDNN requirement */ + (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION || /* act(conv + bias) */ + fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) /* act(conv + bias + eltwise) */ + ) + { + bool do_not_fuse = false; + if(std::is_same::value) + { + /* performance degrades if fused with tensor core based convolutions in most cases */ + int device; + CUDA4DNN_CHECK_CUDA(cudaGetDevice(&device)); + + int cc_major; + CUDA4DNN_CHECK_CUDA(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); + + if (cc_major >= 7) + do_not_fuse = true; + } + + if (!do_not_fuse) + { + fusion_location = InternalFusionLocation::CUDNN; + auto bias_shape = std::vector(rank, 1); + bias_shape[1] = output_feature_maps; + params.bias_shape = bias_shape; + + if (config.fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) + params.eltwise = true; + + params.activation_type = csl::Convolution::ActivationType::RELU; + } + } + + convoluter = csl::Convolution(cudnnHandle, params); + csl::WorkspaceBuilder builder; if (!transformed_shape.empty()) { @@ -246,7 +291,9 @@ namespace cv { namespace dnn {
namespace cuda4dnn { const std::vector>& outputs, csl::Workspace& workspace) override { - CV_Assert(inputs.size() == 1 && outputs.size() == 1); + /* input[0] = conv input, input[1] = bias (from fused eltwise layer) */ + CV_Assert(inputs.size() == 1 || inputs.size() == 2); + CV_Assert(outputs.size() == 1); csl::WorkspaceAllocator allocator(workspace); @@ -270,7 +317,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { { try { - convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, conv_scratchpad); + if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION) + convoluter.convolve_with_bias_activation(output, input, filtersTensor, biasTensor, conv_scratchpad); + else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) + { + auto eltwise_wrapper = inputs[1].dynamicCast(); + auto eltwise = eltwise_wrapper->getView(); + CV_Assert(is_shape_same(eltwise, output)); + + convoluter.convolve_with_bias_eltwise_activation(output, input, filtersTensor, biasTensor, eltwise, conv_scratchpad); + } } catch(const csl::cudnn::cuDNNException& ex) { @@ -287,8 +343,100 @@ namespace cv { namespace dnn { namespace cuda4dnn { if (fusion_location == InternalFusionLocation::NATIVE) { convoluter.convolve(output, input, filtersTensor, conv_scratchpad); - if (!biasTensor.empty()) + + if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || + fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || + fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) { + CV_Assert(inputs.size() == 2); + } + + if (!biasTensor.empty() && inputs.size() == 2) + { + /* bias and eltwise */ + CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || + fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || + fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM); + + auto eltwise_wrapper = 
inputs[1].dynamicCast(); + auto eltwise = eltwise_wrapper->getView(); + CV_Assert(is_shape_same(eltwise, output)); + + std::size_t inner_size = output.size_range(2, output.rank()); + + if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM) + { + kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); + } + else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) + { + /* activation(conv + bias + eltwise) */ + switch (activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::biasN_eltwise_sum_2_relu_inplace(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::biasN_eltwise_sum_2_clipped_relu_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::biasN_eltwise_sum_2_power_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::biasN_eltwise_sum_2_tanh_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::biasN_eltwise_sum_2_sigmoid_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::biasN_eltwise_sum_2_swish_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::biasN_eltwise_sum_2_mish_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + } + } + else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) + { + 
/* activation(conv + bias) + eltwise */ + switch (activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + kernels::biasN_eltwise_sum_2_identity_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::biasN_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::biasN_clipped_relu_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::biasN_power_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise, power_exp, 1.0, 0.0); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::biasN_tanh_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::biasN_sigmoid_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::biasN_swish_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::biasN_mish_eltwise_sum_2_inplace(stream, output, inner_size, biasTensor, eltwise); + break; + } + } + } + else if (!biasTensor.empty() && inputs.size() == 1) + { + /* bias but no eltwise */ + CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE || + fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION); + std::size_t inner_size = output.size_range(2, output.rank()); switch(activation) { @@ -302,7 +450,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { kernels::biasN_clipped_relu_inplace(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil); break; case ConvolutionConfiguration::ActivationType::POWER: - 
kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, T(1.0), T(0.0)); + kernels::biasN_power_inplace(stream, output, inner_size, biasTensor, power_exp, 1.0, 0.0); break; case ConvolutionConfiguration::ActivationType::TANH: kernels::biasN_tanh_inplace(stream, output, inner_size, biasTensor); @@ -318,8 +466,90 @@ namespace cv { namespace dnn { namespace cuda4dnn { break; } } - else + else if (biasTensor.empty() && inputs.size() == 2) { + /* no bias but eltwise */ + CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM || + fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION || + fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM); + + auto eltwise_wrapper = inputs[1].dynamicCast(); + auto eltwise = eltwise_wrapper->getView(); + CV_Assert(is_shape_same(eltwise, output)); + + /* we pass `eltwise` as `bias` (with `inner_size` as one) to bias-activation kernels */ + + if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM) + { + kernels::eltwise_sum_2(stream, output, output, eltwise); + } + else if (fusion_mode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION) + { + switch (activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + kernels::eltwise_sum_2(stream, output, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::eltwise_sum_2_relu(stream, output, output, eltwise, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::eltwise_sum_2_clipped_relu(stream, output, output, eltwise, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::eltwise_sum_2_power(stream, output, output, eltwise, power_exp, 1.0, 0.0); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::eltwise_sum_2_tanh(stream, output, output, eltwise); + break; + case 
ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::eltwise_sum_2_sigmoid(stream, output, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::eltwise_sum_2_swish(stream, output, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::eltwise_sum_2_mish(stream, output, output, eltwise); + break; + } + } + else if (fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM) + { + switch (activation) + { + case ConvolutionConfiguration::ActivationType::IDENTITY: + kernels::eltwise_sum_2(stream, output, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::RELU: + kernels::relu_eltwise_sum_2_inplace(stream, output, eltwise, relu_negative_slope); + break; + case ConvolutionConfiguration::ActivationType::CLIPPED_RELU: + kernels::clipped_relu_eltwise_sum_2_inplace(stream, output, eltwise, crelu_floor, crelu_ceil); + break; + case ConvolutionConfiguration::ActivationType::POWER: + kernels::power_eltwise_sum_2_inplace(stream, output, eltwise, power_exp, 1.0, 0.0); + break; + case ConvolutionConfiguration::ActivationType::TANH: + kernels::tanh_eltwise_sum_2_inplace(stream, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SIGMOID: + kernels::sigmoid_eltwise_sum_2_inplace(stream, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::SWISH: + kernels::swish_eltwise_sum_2_inplace(stream, output, eltwise); + break; + case ConvolutionConfiguration::ActivationType::MISH: + kernels::mish_eltwise_sum_2_inplace(stream, output, eltwise); + break; + } + } + } + else if(biasTensor.empty() && inputs.size() == 1) + { + /* no bias and no eltwise */ + CV_Assert(fusion_mode == ConvolutionConfiguration::FusionMode::NONE || + fusion_mode == ConvolutionConfiguration::FusionMode::ACTIVATION); + switch(activation) { case ConvolutionConfiguration::ActivationType::IDENTITY: @@ -363,6 +593,7 @@ namespace cv { namespace 
dnn { namespace cuda4dnn { std::size_t scratch_mem_in_bytes; + ConvolutionConfiguration::FusionMode fusion_mode; ConvolutionConfiguration::ActivationType activation; float relu_negative_slope, crelu_floor, crelu_ceil, power_exp; diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp index fd06d015c0..adcd7ab3f8 100644 --- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp @@ -28,14 +28,28 @@ namespace cv { namespace dnn { namespace cuda4dnn { DIV }; + class EltwiseOpBase : public CUDABackendNode { + public: + EltwiseOpBase(csl::Stream stream_, EltwiseOpType op_, std::vector coeffs_) + : stream(std::move(stream_)), op(op_), coeffs(std::move(coeffs_)) + { + } + + protected: + csl::Stream stream; + + public: + EltwiseOpType op; + std::vector coeffs; + }; + template - class EltwiseOp final : public CUDABackendNode { + class EltwiseOp final : public EltwiseOpBase { public: using wrapper_type = GetCUDABackendWrapperType; - template - EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector coeffs_) - : stream(std::move(stream_)), op{ op_ }, coeffs(std::begin(coeffs_), std::end(coeffs_)) + EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector coeffs_) + : EltwiseOpBase(std::move(stream_), op_, std::move(coeffs_)) { } @@ -98,7 +112,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { else { /* if this is the first op, we must scale output too */ - auto coeff_x = (i == 1) ? coeffs[0] : static_cast(1.0); + T coeff_x = (i == 1) ? 
coeffs[0] : 1.0; kernels::eltwise_sum_coeff_2(stream, output, coeff_x, output, coeffs[i], input); } break; @@ -106,11 +120,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { } } } - - private: - csl::Stream stream; - EltwiseOpType op; - std::vector coeffs; }; }}} /* namespace cv::dnn::cuda4dnn */ diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 4e65c38df7..bd41528a80 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -46,6 +46,10 @@ #include "op_vkcom.hpp" #include "op_cuda.hpp" +#ifdef HAVE_CUDA +#include "cuda4dnn/primitives/eltwise.hpp" +#endif + #include "halide_scheduler.hpp" #include @@ -2554,6 +2558,11 @@ struct Net::Impl : public detail::NetImplBase LayerPin lpNext(ld.consumers[0].lid, 0); while (nextData) { + /* we use `tryFuse` member of convolution layer to fuse eltwise later + * it's not intended to be fused here; hence, we stop when we encounter eltwise + */ + if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" && nextData->type == "Eltwise") + break; Ptr nextLayer = nextData->layerInstance; if (currLayer->tryFuse(nextLayer)) { @@ -2629,15 +2638,41 @@ struct Net::Impl : public detail::NetImplBase break; } - // fuse convolution layer followed by eltwise + relu - if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" ) + // OpenCL: fuse convolution layer followed by eltwise + relu + // CUDA: fuse convolution layer followed by eltwise (and optional activation) + if ((IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) && + ld.layerInstance->type == "Convolution" ) { Ptr nextEltwiseLayer; if( nextData ) nextEltwiseLayer = nextData->layerInstance.dynamicCast(); - if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 && - nextData && nextData->inputBlobsId.size() == 2 ) +#ifdef HAVE_CUDA + // CUDA backend supports fusion with eltwise sum (without variable channels) + // `nextEltwiseLayer` is reset if eltwise layer doesn't have a 
compatible configuration for fusion + if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty()) + { + // we create a temporary backend node for eltwise layer to obtain the eltwise configuration + auto context = cudaInfo->context; /* make a copy so that initCUDA doesn't modify cudaInfo */ + const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers); + const auto eltwiseNode = node.dynamicCast(); + if (eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty()) + nextEltwiseLayer = Ptr(); + + // check for variable channels + auto& inputs = nextData->inputBlobs; + for (int i = 1; i < inputs.size(); ++i) + { + if (inputs[i]->size[1] != inputs[0]->size[1]) + { + nextEltwiseLayer = Ptr(); + break; + } + } + } +#endif + + if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2) { LayerData *eltwiseData = nextData; @@ -2666,65 +2701,160 @@ struct Net::Impl : public detail::NetImplBase } CV_Assert(biasLayerData); { - if( eltwiseData->consumers.size() == 1 ) + // fuse eltwise + activation layer + // bias must already be computed to fuse => bias layer must appear before convolution + if (biasLayerData->id < ld.id) { - // fuse eltwise + activation layer - if (biasLayerData->id < ld.id) + /* we can fuse activation if: + * => activation layer that follows is the only consumer of eltwise output + * => activation layer does not process multiple inputs + * => we do not require to keep the output of eltwise + */ + Ptr nextFusabeleActivLayer; + if (eltwiseData->consumers.size() == 1 && pinsToKeep.count(lpNext) == 0) { nextData = &layers[eltwiseData->consumers[0].lid]; lpNext = LayerPin(eltwiseData->consumers[0].lid, 0); - Ptr nextActivLayer; - if( nextData ) - nextActivLayer = nextData->layerInstance.dynamicCast(); + if (pinsToKeep.count(lpNext) == 0 && nextData->outputBlobs.size() == 1) + nextFusabeleActivLayer = nextData->layerInstance.dynamicCast(); + } + else 
+ { + // OCL backend cannot fuse in this case but the CUDA backend can continue with just eltwise + nextData = 0; + } - if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 && - (!nextData->type.compare("ReLU") || - !nextData->type.compare("ChannelsPReLU") || - !nextData->type.compare("Power")) && - currLayer->setActivation(nextActivLayer) ) + // the requirements of OCV OpenCL backend and CUDA backend are different + // we need to check them separately; hence, the fuse variables + bool fuse_eltwise = false, fuse_activation = false; + + if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() && + (!nextData->type.compare("ReLU") || + !nextData->type.compare("ChannelsPReLU") || + !nextData->type.compare("Power")) && + currLayer->setActivation(nextFusabeleActivLayer)) + { + fuse_eltwise = true; + fuse_activation = true; + } + + if (IS_DNN_CUDA_TARGET(preferableTarget)) + { + /* supported fusion options: + * => convolution + eltwise + * => activation(convolution) + eltwise + * > convolution + activation would have been fused already; we have to fuse eltwise + * => activation(convolution + eltwise) + * > fuse eltwise and then activation + */ + auto layer = nextEltwiseLayer.staticCast(); + if (currLayer->tryFuse(layer)) { - CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1); - ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]); - printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); - printf_(("\tfused with %s\n", nextActivLayer->name.c_str())); - eltwiseData->skip = true; - nextData->skip = true; - // This optimization for cases like - // some_layer conv - // | | - // +-- eltwise --+ - // | - // activ - // This way all the element-wise computations - // (i.e. some_layer+conv or some_layer*conv) - // would be done at [conv] layer. So we need to - // replace [conv]'s output blob to [eltwise]'s one - // considering that [activ] is an in-place layer. 
- // Also we need to move all the consumers' references. - // To prevent memory collisions (i.e. when input of - // [conv] and output of [eltwise] is the same blob) - // we allocate a new blob. - CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1); - ld.outputBlobs[0] = ld.outputBlobs[0].clone(); - ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]); - - eltwiseData->outputBlobs = ld.outputBlobs; - nextData->outputBlobs = ld.outputBlobs; - eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers; - nextData->outputBlobsWrappers = ld.outputBlobsWrappers; - - // Move references of [activ] layer consumers to the newly allocated blob. - for (int i = 0; i < nextData->consumers.size(); ++i) + fuse_eltwise = true; /* eltwise was successfully fused */ + if (!nextFusabeleActivLayer.empty()) { - LayerData& consumer = layers[nextData->consumers[i].lid]; - for (int j = 0; j < consumer.inputBlobsId.size(); ++j) + if ((!nextData->type.compare("ReLU") || + !nextData->type.compare("ReLU6") || + !nextData->type.compare("Power") || + !nextData->type.compare("TanH") || + !nextData->type.compare("Sigmoid") || + !nextData->type.compare("Swish") || + !nextData->type.compare("Mish")) && + currLayer->setActivation(nextFusabeleActivLayer)) { - if (consumer.inputBlobsId[j].lid == lpNext.lid) - { - consumer.inputBlobs[j] = &ld.outputBlobs[0]; - consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0]; - break; - } + // activation was fused + fuse_activation = true; + } + } + } + } + + CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */ + if(fuse_eltwise && fuse_activation) + { + CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1); + ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]); + printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); + printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str())); + eltwiseData->skip = true; + nextData->skip = 
true; + // This optimization for cases like + // some_layer conv + // | | + // +-- eltwise --+ + // | + // activ + // This way all the element-wise computations + // (i.e. some_layer+conv or some_layer*conv) + // would be done at [conv] layer. So we need to + // replace [conv]'s output blob to [eltwise]'s one + // considering that [activ] is an in-place layer. + // Also we need to move all the consumers' references. + // To prevent memory collisions (i.e. when input of + // [conv] and output of [eltwise] is the same blob) + // we allocate a new blob. + CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1); + ld.outputBlobs[0] = ld.outputBlobs[0].clone(); + ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]); + + eltwiseData->outputBlobs = ld.outputBlobs; + nextData->outputBlobs = ld.outputBlobs; + eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers; + nextData->outputBlobsWrappers = ld.outputBlobsWrappers; + + // Move references of [activ] layer consumers to the newly allocated blob. + for (int i = 0; i < nextData->consumers.size(); ++i) + { + LayerData& consumer = layers[nextData->consumers[i].lid]; + for (int j = 0; j < consumer.inputBlobsId.size(); ++j) + { + if (consumer.inputBlobsId[j].lid == lpNext.lid) + { + consumer.inputBlobs[j] = &ld.outputBlobs[0]; + consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0]; + break; + } + } + } + } + else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise) + { + CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget)); + CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1); + ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]); + printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); + eltwiseData->skip = true; + // This optimization is for cases like + // some_layer conv (maybe fused with activ) + // | | + // +-- eltwise --+ + // + // This way all the element-wise computations + // (i.e. 
some_layer+conv or some_layer*conv) + // would be done at [conv] layer. So we need to + // replace [conv]'s output blob to [eltwise]'s one. + // Also we need to move all the consumers' references. + // To prevent memory collisions (i.e. when input of + // [conv] and output of [eltwise] is the same blob) + // we allocate a new blob. + CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1); + ld.outputBlobs[0] = ld.outputBlobs[0].clone(); + ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]); + + eltwiseData->outputBlobs = ld.outputBlobs; + eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers; + + // Move references of [eltwise] layer consumers to the newly allocated blob. + for (int i = 0; i < eltwiseData->consumers.size(); ++i) + { + LayerData& consumer = layers[eltwiseData->consumers[i].lid]; + for (int j = 0; j < consumer.inputBlobsId.size(); ++j) + { + if (consumer.inputBlobsId[j].lid == eltwiseData->id) + { + consumer.inputBlobs[j] = &ld.outputBlobs[0]; + consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0]; + break; } } } diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 5e5af51ba9..2e125b5e95 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -248,6 +248,7 @@ public: #endif #ifdef HAVE_CUDA + cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode; cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType; float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp; #endif @@ -261,6 +262,7 @@ public: #endif #ifdef HAVE_CUDA + cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE; cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY; #endif } @@ -425,10 +427,18 @@ public: #endif #ifdef HAVE_CUDA - cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY; + if (activ.empty()) + { + /* setActivation was called with empty argument => 
reset all fusions */ + cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE; + cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY; + } if(IS_DNN_CUDA_TARGET(preferableTarget)) { + CV_Assert(cudaFusionMode == ConvolutionConfiguration::FusionMode::NONE || + cudaFusionMode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM); + Ptr activ_relu = activ.dynamicCast(); if(!activ_relu.empty()) { @@ -475,12 +485,53 @@ public: cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::MISH; if (cudaActType == cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY) + { + /* no activation fused */ activ.reset(); + } + else + { + /* activation was fused */ + if (cudaFusionMode == ConvolutionConfiguration::FusionMode::NONE) /* no previous fusion */ + cudaFusionMode = ConvolutionConfiguration::FusionMode::ACTIVATION; /* now activation */ + else if (cudaFusionMode == ConvolutionConfiguration::FusionMode::ELTWISE_SUM) /* previously eltwise was fused */ + cudaFusionMode = ConvolutionConfiguration::FusionMode::ELTWISE_SUM_THEN_ACTIVATION; /* now activation on eltwise output */ + } } #endif return !activ.empty(); } + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { +#ifdef HAVE_CUDA + if(IS_DNN_CUDA_TARGET(preferableTarget)) + { + Ptr eltwise = top.dynamicCast(); + if (!eltwise.empty()) // && eltwise->op == EltwiseLayer::SUM && eltwise->coeffs.empty()) + { + /* we also need to check that the eltwise input does not require shortcut mechanism + * it's difficult to verify it here but we hope that `fuseLayers` has done the check already + */ + if (cudaFusionMode == ConvolutionConfiguration::FusionMode::NONE) + { + /* no previous fusion */ + cudaFusionMode = ConvolutionConfiguration::FusionMode::ELTWISE_SUM; /* now eltwise */ + return true; + } + else if(cudaFusionMode == ConvolutionConfiguration::FusionMode::ACTIVATION) + { + /* previously an activation was fused */ + cudaFusionMode = 
ConvolutionConfiguration::FusionMode::ACTIVATION_THEN_ELTWISE_SUM; + return true; + } + return false; + } + } +#endif + return BaseConvolutionLayerImpl::tryFuse(top); + } + void fuseWeights(const Mat& w_, const Mat& b_) CV_OVERRIDE { // Convolution weights have OIHW data layout. Parameters fusion in case of @@ -1493,7 +1544,7 @@ public: { auto context = reinterpret_cast(context_); - CV_Assert(inputs.size() == 1); + CV_Assert(inputs.size() == 1 || inputs.size() == 2); auto input_wrapper = inputs[0].dynamicCast(); auto input_shape = input_wrapper->getShape(); @@ -1534,6 +1585,7 @@ public: config.output_shape.assign(std::begin(output_shape), std::end(output_shape)); config.groups = groups; + config.fusion_mode = cudaFusionMode; config.activation_type = cudaActType; config.relu_negative_slope = cuda_relu_slope; config.crelu_floor = cuda_crelu_floor;