opencv/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
Li, Peng ab8022f74e update convolution opencl kernels in dnn module (#11762)
* optimize ocl kernel enqueue in fc layer

Signed-off-by: Li Peng <peng.li@intel.com>

* use CV_LOG_INFO in convolution auto tuning

Signed-off-by: Li Peng <peng.li@intel.com>

* update convolution IDLF kernel

extend parameter tuning range, also cleanup
ocl kernel implementation

Signed-off-by: Li Peng <peng.li@intel.com>

* update in-memory convolution cache config

fp16 and fp32 cache config are stored separately

Signed-off-by: Li Peng <peng.li@intel.com>
2018-06-25 17:06:18 +03:00

1970 lines
71 KiB
C++

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "../../precomp.hpp"
#include <opencv2/core/utils/configuration.private.hpp>
#include <string>
#include <vector>
#include <fstream>
#include <sys/stat.h>
#include <assert.h>
#include "../include/common.hpp"
#include "../include/ocl4dnn.hpp"
#include "opencl_kernels_dnn.hpp"
#include "../include/math_functions.hpp"
#include "../include/default_kernel_config.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/utils/logger.hpp"
#if defined WIN32 || defined _WIN32
#include <windows.h>
#include <direct.h>
#endif
namespace cv { namespace dnn { namespace ocl4dnn {
static cv::Mutex kernelConfigMutex;
typedef std::map<std::string, std::string> kernel_hash_t;
static kernel_hash_t kernelConfigMap;
static bool defaultConfigLoaded = false;
static std::string sanitize(const std::string& s)
{
std::string s_ = s;
for (size_t i = 0; i < s_.size(); i++)
{
char c = s_[i];
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'))
{
s_[i] = '_';
}
}
// TODO add hash?
// s_ = s_ + cv::format("_%08llx", crc64((uchar*)s.c_str(), s.size()));
return s_;
}
static void initializeGlobalBuiltinConfigurations(const std::string& cache_path)
{
CV_Assert(defaultConfigLoaded == false);
CV_Assert(kernelConfigMap.empty());
/* fp32 config */
size_t numConfigs = sizeof(default_kernel_config_intel_fp32) /
sizeof(default_kernel_config_intel_fp32[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp32[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp32[2 * i + 1]);
kernelConfigMap.insert(entry);
}
/* fp16 config */
numConfigs = sizeof(default_kernel_config_intel_fp16) /
sizeof(default_kernel_config_intel_fp16[0]) / 2;
for (size_t i = 0; i < numConfigs; i++)
{
std::string key = std::string("Intel(R) Corporation_") + default_kernel_config_intel_fp16[2 * i];
if (!cache_path.empty())
{
std::string cacheFile = cache_path + sanitize(key);
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
continue; // external configuration found, skip builtin
}
std::pair<std::string, std::string> entry(
key,
default_kernel_config_intel_fp16[2 * i + 1]);
kernelConfigMap.insert(entry);
}
defaultConfigLoaded = true;
}
template<typename Dtype>
OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
{
bias_term_ = config.bias_term;
int dims = config.in_shape.size();
int spatial_dims = 2;
channels_ = config.in_shape[dims - spatial_dims - 1];
num_output_ = config.out_shape[dims - spatial_dims - 1];
group_ = config.group;
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
fused_eltwise_ = false;
power_ = 1.f;
negative_slope_ = 0;
min_value_ = 0;
max_value_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
use_half_ = config.use_half;
// assumption: spatial dimension is 2.
kernel_h_ = config.kernel.height;
kernel_w_ = config.kernel.width;
pad_h_ = config.pad.height;
pad_w_ = config.pad.width;
stride_h_ = config.stride.height;
stride_w_ = config.stride.width;
dilation_h_ = config.dilation.height;
dilation_w_ = config.dilation.width;
M_ = num_output_ / group_;
height_ = config.in_shape[dims - spatial_dims + 0];
width_ = config.in_shape[dims - spatial_dims + 1];
output_h_ = config.out_shape[dims - spatial_dims + 0];
output_w_ = config.out_shape[dims - spatial_dims + 1];
bottom_dim_ = channels_ * width_ * height_;
top_dim_ = num_output_ * output_w_ * output_h_;
int Ph = (output_h_ - 1) * stride_h_ + (dilation_h_ * (kernel_h_ - 1) + 1) - height_;
int Pw = (output_w_ - 1) * stride_w_ + (dilation_w_ * (kernel_w_ - 1) + 1) - width_;
Ph = (Ph > 0) ? Ph : 0;
Pw = (Pw > 0) ? Pw : 0;
pad_right_ = (Pw + 1) / 2;
pad_bottom_ = (Ph + 1) / 2;
cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
dwconv_ = (num_output_ == channels_ && channels_ == group_);
use_cache_path_ = false;
if (!cache_path_.empty())
{
#if defined _WIN32
struct _stat file_stat;
use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 &&
((_S_IFDIR & file_stat.st_mode) != 0);
#else
struct stat file_stat;
use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 &&
S_ISDIR(file_stat.st_mode);
#endif
if (!use_cache_path_)
{
static int warn_ = 0;
if (!warn_)
{
std::cerr
<< "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
<< std::endl;
warn_ = true;
}
}
}
run_auto_tuning_ = use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false);
force_auto_tuning_ = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
}
template<typename Dtype>
OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
{
if (!swizzled_weights_umat.empty()) {
swizzled_weights_umat.release();
}
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
{
if (fused_eltwise)
addDef("FUSED_CONV_ELTWISE", 1);
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
addDef("FUSED_CONV_RELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
addDef("FUSED_CONV_PRELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
addDef("FUSED_CONV_POWER", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_TANH:
addDef("FUSED_CONV_TANH", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
addDef("FUSED_CONV_RELU6", 1);
break;
default:
;
}
return;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
{
if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_);
break;
case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
kernel.set(argIdx++, (float)min_value_);
kernel.set(argIdx++, (float)max_value_);
break;
default:
;
}
return;
}
typedef enum {
TYPE_FLOAT = 1,
TYPE_HALF = 2
} ocl4dnnConvSpatialType_t;
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
{
if (use_half_)
{
addDef("TYPE", TYPE_HALF);
addDef("Dtype", "half");
addDef("Dtype2", "half2");
addDef("Dtype4", "half4");
addDef("Dtype8", "half8");
addDef("Dtype16", "half16");
addDef("as_Dtype", "as_half");
addDef("as_Dtype2", "as_half2");
addDef("as_Dtype4", "as_half4");
addDef("as_Dtype8", "as_half8");
}
else
{
addDef("TYPE", TYPE_FLOAT);
addDef("Dtype", "float");
addDef("Dtype2", "float2");
addDef("Dtype4", "float4");
addDef("Dtype8", "float8");
addDef("Dtype16", "float16");
addDef("as_Dtype", "as_float");
addDef("as_Dtype2", "as_float2");
addDef("as_Dtype4", "as_float4");
addDef("as_Dtype8", "as_float8");
}
}
typedef enum {
KERNEL_TYPE_INTEL_IDLF = 2,
KERNEL_TYPE_BASIC = 4,
KERNEL_TYPE_GEMM_LIKE = 5,
KERNEL_TYPE_DWCONV = 6
} ocl4dnnConvSpatialKernelType_t;
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
int32_t blockM,
int32_t blockK,
int32_t blockN)
{
std::string kernelUKey;
int32_t simd_size;
if (kernelType == KERNEL_TYPE_INTEL_IDLF) {
simd_size = blockN;
kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1);
// kernel name
kernel_name_ = "IDLF_";
kernel_name_ += kernelUKey;
if (simd_size == 16)
kernel_name_ += "_SIMD16";
else
kernel_name_ += "_SIMD8";
// options
options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
options_ << " -cl-mad-enable";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
// defs
int32_t output_block_width = blockM;
int32_t output_block_height = blockK;
int tile_x = (output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_;
int tile_y = (output_block_height - 1) * stride_h_ + kernel_h_ * dilation_h_;
int invec_size = tile_y;
addDef("SIMD_SIZE", simd_size);
addDef("OUT_BLOCK_WIDTH", output_block_width);
addDef("OUT_BLOCK_HEIGHT", output_block_height);
addDef("INPUT_DEPTH", channels_ / group_);
addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
addDef("TOTAL_OUTPUT_DEPTH", num_output_);
addDef("NUM_FILTERS", M_);
addDef("TILE_X", tile_x);
addDef("TILE_Y", tile_y);
addDef("INVEC_SIZE", invec_size);
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
addDef("WEIGHT_PREF", ((kernel_w_ * kernel_h_) == 1) ? 1 : 8);
addDef("INPUT_PITCH", (width_ * height_));
addDef("OUTPUT_PITCH", (output_w_ * output_h_));
addDef("LEFT_FILTERS", ((int)alignSize(M_, simd_size) - M_));
addDef("INPUT_WIDTH", width_);
addDef("INPUT_HEIGHT", height_);
addDef("FILTERS_IN_GROUP", ((int)alignSize(M_, simd_size) / simd_size));
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
else if (kernelType == KERNEL_TYPE_BASIC)
{
addDef("KERNEL_BASIC");
kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN);
kernel_name_ = "BASIC_";
kernel_name_ += kernelUKey;
// opts
options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_;
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
// defs
addDef("CHANNELS", channels_ / group_);
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
{
simd_size = blockK;
kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN);
kernel_name_ = "U_GEMM_LIKE_CONV_";
kernel_name_ += kernelUKey.c_str();
kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16";
std::stringstream kernelDef;
kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM;
if (blockK == 16)
kernelDef << "_SIMD16";
// Build list of options and defines
options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str()
<< " -D Conv_Interleaved=" << kernel_name_.c_str();
options_ << " -cl-mad-enable";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_GEMM_LIKE");
addDef("INPUT_DEPTH", channels_);
addDef("WIDTH1", M_);
addDef("OUT_PADDING_LEFT", 0);
addDef("OUT_PADDING_HEIGHT", 0);
addDef("OUT_DEPTH", M_);
addDef("NUM_BATCHES", num_);
addDef("DY", blockM);
addDef("DX", blockN);
addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2);
addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2);
addDef("TILE_N_LAST", M_ % 32);
addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
else if (kernelType == KERNEL_TYPE_DWCONV)
{
kernelUKey = generateSpecificKey(KERNEL_TYPE_DWCONV, blockM, blockK, blockN);
kernel_name_ = "DWCONV_";
kernel_name_ += kernelUKey.c_str();
options_ << " -cl-fast-relaxed-math ";
if (clOptionSupport("-cl-no-subgroup-ifp"))
options_ << " -cl-no-subgroup-ifp ";
addDef("KERNEL_DWCONV");
addDef("KERNEL_SIZE", kernel_w_ * kernel_h_);
addDef("KERNEL_W", kernel_w_);
addDef("KERNEL_H", kernel_h_);
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", num_output_ * num_);
addDef("CHANNELS", num_output_);
setFusionDefine(fused_activ_, fused_eltwise_);
options_ << " -D DWCONV=" << kernel_name_;
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setupKernel()
{
collectCommonInformation();
addDef("KERNEL_WIDTH", kernel_w_);
addDef("KERNEL_HEIGHT" , kernel_h_);
addDef("STRIDE_X", stride_w_);
addDef("STRIDE_Y", stride_h_);
addDef("DILATION_X", dilation_w_);
addDef("DILATION_Y", dilation_h_);
if (kernelType_ != KERNEL_TYPE_BASIC)
{
addDef("INPUT_PAD_W", pad_w_);
addDef("INPUT_PAD_H", pad_h_);
addDef("INPUT_PAD_RIGHT", pad_right_);
addDef("INPUT_PAD_BOTTOM", pad_bottom_);
}
setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setBias(bool bias_term)
{
bias_term_ = bias_term;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU;
negative_slope_ = slope;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivReLU6(bool fuse_activ, float min, float max)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
min_value_ = min;
max_value_ = max;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_PRELU;
Mat tmpMat = Mat(num_output_, 1, CV_32FC1, (uchar*)&slope[0]);
tmpMat.copyTo(negative_slope_umat_);
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
power_ = power;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivTanh(bool fuse_activ)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_TANH;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
const UMat& bottom2,
const UMat& weight,
const UMat& bias,
UMat& top,
int32_t numImages)
{
num_ = numImages;
if (!bottom2.empty())
{
fused_eltwise_ = true;
bottom_data2_ = bottom2;
}
else
{
fused_eltwise_ = false;
}
if (use_half_ && bias_half.empty() && !bias.empty())
convertFp16((UMat&)bias, bias_half);
if (use_half_ && weights_half.empty())
convertFp16((UMat&)weight, weights_half);
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
if (bestKernelConfig.empty())
return false;
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &verifyTop,
const UMat &weight, const UMat &bias,
int32_t numImages)
{
options_.str(""); options_.clear(); // clear contents and state flags
createBasicKernel(1, 1, 1);
kernel_index_ = kernelQueue.size() - 1;
convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_]);
CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end());
//unloadProgram(kernelQueue[kernel_index_]->kernelName);
kernelQueue.pop_back();
return;
}
// For large enough input size, we do not need to tune kernels for different
// size. The reason is with large input size, there will be enough work items
// to feed al the EUs.
// FIXME for the gemm like convolution, switch back to eaxct image size.
#define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16)))
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::generateKey()
{
std::string precision = (use_half_) ? "FP16" : "FP32";
std::stringstream keyBuilder;
// FIXME: to support fuse?
keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
<< "cn" << channels_ << "_"
<< "g" << group_ << "_"
<< "s" << stride_w_ << "x" << stride_h_ << "_"
<< "d" << dilation_w_ << "x" << dilation_h_ << "_"
<< "b" << bias_term_ << "_"
<< "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
<< "p" << pad_w_ << "x" << pad_h_ << "_"
<< "num" << num_ << "_"
<< "M" << M_ << "_"
<< "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_ << "_"
<< precision;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
key_sanitized_ = sanitize(key_);
short_key_ = keyBuilder.str();
}
template<typename Dtype>
std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t blockWidth,
int32_t blockHeight, int32_t blockDepth)
{
std::stringstream keyBuilder;
keyBuilder << short_key_
<< "_" << type
<< "_" << blockWidth
<< "_" << blockHeight
<< "_" << blockDepth;
return keyBuilder.str();
}
template<typename Dtype>
void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
int r, int c, int interleavedRows, int nonInterleavedRows,
int blockWidth, int rowAlignment )
{
CHECK_EQ(interleavedRows % 2, 0) <<
"interleaveMatrix only supports even values for interleavedRows.";
size_t memSize = r * c * sizeof(float);
size_t dstSize = memSize *
(interleavedRows + nonInterleavedRows * 2) /
(interleavedRows + nonInterleavedRows);
memset(mem_dst, 0, dstSize); // NOLINT
const int xStride = blockWidth;
const int yStride = c * 2;
const Dtype *pSrc = mem;
Dtype* pDst = mem_dst;
for (int y = 0; y < r;) {
for (int rows = 0; rows < interleavedRows; rows += 2) {
if ( y >= r ) break;
if ((c % xStride) == 0) {
for (int x = 0; x < c / xStride; x++) {
memcpy(pDst + x * xStride * 2, // NOLINT
pSrc + x * xStride, xStride * sizeof(Dtype));
memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
pSrc + x * xStride + c, xStride * sizeof(Dtype));
}
} else {
const int count = c / xStride;
int x = 0;
for (; x < count - 1; x++) {
memcpy(pDst + x * xStride * 2, // NOLINT
pSrc + x * xStride, xStride * sizeof(Dtype));
memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
pSrc + x * xStride + c, xStride * sizeof(Dtype));
}
memcpy(pDst + x * xStride * 2, // NOLINT
pSrc + x * xStride, xStride * sizeof(Dtype));
}
pSrc += yStride;
pDst += yStride;
y += 2;
}
for (int rows = 0; rows < nonInterleavedRows; rows++) {
if (y >= r) break;
const int stride = rowAlignment;
int remaining = c;
for (int x = 0; x < c; x += stride) {
if (remaining >= stride) {
memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT
remaining -=stride;
} else {
memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype)); // NOLINT
}
}
pSrc += yStride / 2;
pDst += yStride;
y++;
}
}
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
int32_t swizzled_factor,
bool interleave)
{
// Simply skip the weight swizzle if we already got a swizzled_weights_
// in test phase and not in auto tuning
// This requires we always call convolve again with the winner configuration
// during the auto tuning stage.
if (tuned_ && !swizzled_weights_umat.empty())
return true;
if (swizzled_weights_umat.empty())
swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
kernel_h_ * (int)alignSize(kernel_w_, 2),
(use_half_) ? CV_16SC1 : CV_32FC1);
UMat swizzled_weights_tmp;
if (use_half_)
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
if (!interleave) {
cl_uint argIdx = 0;
int32_t channels = channels_ / group_;
ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
cv::ocl::dnn::conv_spatial_helper_oclsrc);
if (oclk_copy_weight.empty())
return false;
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
else
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
oclk_copy_weight.set(argIdx++, kernel_w_);
oclk_copy_weight.set(argIdx++, kernel_h_);
oclk_copy_weight.set(argIdx++, channels);
oclk_copy_weight.set(argIdx++, num_output_);
oclk_copy_weight.set(argIdx++, swizzled_factor);
size_t global_work_size_copy[3] = {
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
{
std::cout << "Swizzle kernel run failed." << std::endl;
return false;
}
} else {
// assumption: kernel dimension is 2
Mat weightMat = weight.getMat(ACCESS_READ);
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Mat swizzledWeightMat;
if (use_half_)
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
else
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
int interleavedRows = (kernel_w_ / 2) * 2;
int nonInterleavedRows = kernel_w_ % 2;
int blockWidth = swizzled_factor; // should equal to simd size.
int rowAlignment = 32;
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
for (int od = 0; od < M_; od++)
for (int id = 0; id < channels_; id++)
for (int r = 0; r < kernel_h_; r++)
for (int c = 0; c < kernel_w_; c++)
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
interleaveMatrix(cpu_swizzled_weight,
tmpSwizzledWeight,
kernel_w_ * kernel_h_ * channels_, M_,
interleavedRows,
nonInterleavedRows,
blockWidth,
rowAlignment);
free(tmpSwizzledWeight);
}
if (use_half_)
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
return true;
}
template<>
bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
int32_t blockHeight, int32_t blockDepth)
{
kernelType_ = KERNEL_TYPE_BASIC;
blockM_ = blockWidth;
blockK_ = blockHeight;
blockN_ = blockDepth;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
int32_t workItemOutput[3] = { 1, 1, 1 };
size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ };
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0],
false, KERNEL_TYPE_BASIC));
return true;
}
else
return false;
}
template<>
void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
int32_t offset, int32_t size, bool write_only)
{
cl_mem sub_mem;
cl_buffer_region region;
cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
region.origin = offset * element_size;
region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
if (err)
{
std::cout << "Failed to create sub buffer." << std::endl;
return;
}
int step = element_size, rows = size, cols = 1;
ocl::convertFromBuffer(sub_mem, step, rows, cols,
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
//decrease ocl mem refcount
clReleaseMemObject(sub_mem);
}
template<>
bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
const UMat &weight, const UMat &bias,
int32_t numImages, kernelConfig* config)
{
ocl::Program program;
phash_t::iterator it = phash.find(config->kernelName);
if (it != phash.end())
program = it->second;
else
return false;
int32_t bias_offset;
if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
if (!swizzleWeight(weight, config->workItem_output[2], false))
return false;
size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages;
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
}
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
}
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
{
std::cout << "IDLF kernel run failed." << std::endl;
return false;
}
}
} else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
if (!swizzleWeight(weight, config->workItem_output[1], true))
return false;
size_t total_bottom_size = bottom_dim_ * numImages;
size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
size_t total_bias_size = M_ * group_;
size_t total_top_size = top_dim_ * numImages;
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
{
CreateSubBuffer(bottom, img_buffer, image_offset,
total_bottom_size - image_offset, false);
if (img_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
}
UMat kernel_buffer;
if (kernel_offset)
{
CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
total_kernel_size - kernel_offset, false);
if (kernel_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
}
UMat bias_buffer;
if (bias_term_)
{
if (bias_offset)
{
CreateSubBuffer(bias, bias_buffer, bias_offset,
total_bias_size - bias_offset, false);
if (bias_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
}
}
UMat out_buffer;
if (output_image_offset)
{
CreateSubBuffer(top, out_buffer, output_image_offset,
total_top_size - output_image_offset, true);
if (out_buffer.empty())
return false;
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
}
else
{
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
}
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
int out_pitch_y = output_w_ * output_h_;
int out_pitch_z = out_pitch_y * M_;
int aligned_input_size = height_ * width_ * channels_ / group_;
int slice_pitch = width_ * height_;
kernel.set(argIdx++, (uint32_t)out_pitch_y);
kernel.set(argIdx++, (uint32_t)out_pitch_z);
kernel.set(argIdx++, (uint32_t)aligned_input_size);
kernel.set(argIdx++, (uint32_t)slice_pitch);
int blockM = config->workItem_output[0];
int blockK = config->workItem_output[1];
int blockN = config->workItem_output[2];
int alignedFilterWidth = alignSize(M_, blockN);
int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM);
int globalWorkSizeDX = blockN;
int globalWorkSizeDY = blockM;
size_t sgemm_m = alignedExpandHeight;
size_t sgemm_n = alignedFilterWidth;
size_t gx = divUp(sgemm_n, globalWorkSizeDX);
size_t gy = divUp(sgemm_m, globalWorkSizeDY);
gy = alignSize(gy, blockK);
size_t global_size[3] = { gx, gy, config->global_work_size[2] };
if (!kernel.run(3, global_size, config->local_work_size, false))
{
std::cout << "GEMM like kernel run failed." << std::endl;
return false;
}
}
} else if (config->kernelType == KERNEL_TYPE_DWCONV) {
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
size_t global_size[3];
global_size[0] = output_w_;
global_size[1] = output_h_;
global_size[2] = num_output_ * num_;
if (!kernel.run(3, global_size, NULL, false))
{
std::cout << "DWCONV kernel run failed." << std::endl;
return false;
}
} else {
for (int32_t n = 0; n < numImages; ++n) {
for (int32_t g = 0; g < group_; ++g) {
bias_offset = M_ * g;
int32_t image_offset = n * bottom_dim_
+ width_ * height_ * (channels_ / group_) * g;
int32_t output_image_offset = n * top_dim_
+ output_w_ * output_h_ * M_ * g;
int32_t kernel_offset = kernel_h_ * kernel_w_ *
(channels_ / group_) * M_
* g;
ocl::Kernel kernel(config->kernelName.c_str(), program);
if (kernel.empty())
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset);
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
else
kernel.set(argIdx++, (void *)NULL);
kernel.set(argIdx++, bias_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
kernel.set(argIdx++, output_image_offset);
kernel.set(argIdx++, (uint16_t)width_);
kernel.set(argIdx++, (uint16_t)height_);
kernel.set(argIdx++, (uint16_t)output_w_);
kernel.set(argIdx++, (uint16_t)output_h_);
kernel.set(argIdx++, (uint16_t)pad_w_);
kernel.set(argIdx++, (uint16_t)pad_h_);
if (!kernel.run(3, config->global_work_size,
(config->use_null_local) ? NULL : config->local_work_size,
false))
{
std::cout << "Basic kernel run failed." << std::endl;
return false;
}
}
}
}
return true;
}
template<>
float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
const UMat &weight, const UMat &bias,
int32_t numImages, kernelConfig* config)
{
cv::ocl::Queue queue;
try
{
queue = cv::ocl::Queue::getDefault();
}
catch (const cv::Exception&)
{
static int warn_ = 0;
if (!warn_)
{
std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
warn_ = true;
}
return 1e6;
}
// warm up.
bool saved_tuned = tuned_;
tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config);
cv::ocl::Timer timer(queue);
timer.start();
bool res = true;;
CV_LOG_INFO(NULL, "Benchmarking kernel: " << config->kernelName);
tuned_ = true;
int loop_cnt = 4;
for (int i = 0; i < loop_cnt; i++) {
res = convolve(bottom, top, weight, bias, numImages, config);
if (!res)
break;
}
tuned_ = saved_tuned;
timer.stop();
if (!res) {
config->tested = true;
config->verified = false;
return 1e5;
}
float elapsedTime = timer.durationNS() * 1e-6 / loop_cnt;
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
double k_w = kernel_w_;
double k_h = kernel_h_;
double k_z = channels_;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
return elapsedTime;
}
template<>
bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
UMat &top,
const UMat &weight,
const UMat &bias,
int32_t numImages,
kernelConfig* config,
UMat &verifyTop)
{
uint32_t verificationFail = 0;
if (config->verified)
return true;
else if (config->tested)
return false;
int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
bool saved_tuned = tuned_;
tuned_ = false;
convolve(bottom, top, weight, bias, numImages, config);
tuned_ = saved_tuned;
UMat new_top, new_verify_top;
float *data, *verify_data;
if (use_half_)
{
convertFp16(top, new_top);
convertFp16(verifyTop, new_verify_top);
data = (float *)new_top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)new_verify_top.getMat(ACCESS_READ).ptr<float>();
}
else
{
data = (float *)top.getMat(ACCESS_READ).ptr<float>();
verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
}
for (int32_t n = 0; n < num_; ++n) {
for (int32_t g = 0; g < group_; ++g) {
int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g;
for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++)
for (int h = 0; h < output_h_ && !verificationFail; h++)
for (int w = 0; w < output_w_; w++) {
size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
float error_factor = fabs(data[offset] - verify_data[offset]);
if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) &&
!(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4))
{
CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g
<< " out_ch " << out_ch << " h " << h << " w " << w
<< " got " << data[offset] << " expected " << verify_data[offset]);
verificationFail = 1;
goto out;
}
}
}
}
out:
if (verificationFail == 1)
return false;
else
return true;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::unloadProgram(const std::string& kernelName)
{
ocl::Program program;
phash_t::iterator it = phash.find(kernelName);
if (it != phash.end())
{
program = it->second;
it->second = ocl::Program();
}
else
return;
ocl::Context ctx = ocl::Context::getDefault();
ctx.unloadProg(program);
}
template<typename Dtype>
ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
{
phash_t::iterator it = phash.find(kernel_name_);
if (it != phash.end())
{
return it->second;
}
String errmsg;
ocl::Context ctx = ocl::Context::getDefault();
std::string options = options_.str();
CV_Assert(options.size() != 0);
ocl::Program program = ctx.getProg(src_, options, errmsg);
phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
if (!program.ptr())
{
std::cout << "Failed to compile kernel: " << kernel_name_
<< ", buildflags: " << options
<< ", errmsg: " << errmsg << std::endl;
}
return program;
}
template<>
bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
int32_t blockK,
int32_t blockN)
{
int32_t simd_size = blockK;
int workItemOutput[3] = { blockM, blockK, blockN };
size_t gx = (size_t)divUp(M_, blockN);
size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
gy = alignSize(gy, simd_size);
size_t gz = num_;
size_t global_size[3] = { gx, gy, gz };
size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
kernelType_ = KERNEL_TYPE_GEMM_LIKE;
blockM_ = blockM;
blockK_ = blockK;
blockN_ = blockN;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
size_t workgroupSize_used;
ocl::Kernel kernel(kernel_name_.c_str(), program);
if (kernel.empty())
return false;
workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
if (workgroupSize_used != simd_size)
{
std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
std::cerr << " does not equal the size (" << simd_size << ") kernel source required." << std::endl;
std::cerr << " Skip this kernel " << kernel_name_ << std::endl;
unloadProgram(kernel_name_);
return false;
}
else
{
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
true, KERNEL_TYPE_GEMM_LIKE));
return true;
}
}
else
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createIDLFKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t simd_size)
{
int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
const int32_t num_output_maps = M_;
int32_t output_width = output_w_;
int32_t output_height = output_h_;
int32_t output_block_width = blockWidth;
int32_t output_block_height = blockHeight;
int32_t num_batches = num_;
size_t global_size[3] = {
(size_t)divUp(output_width, output_block_width),
(size_t)divUp(output_height, output_block_height),
(size_t)num_batches * alignSize(num_output_maps, simd_size) };
size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };
kernelType_ = KERNEL_TYPE_INTEL_IDLF;
blockM_ = blockWidth;
blockK_ = blockHeight;
blockN_ = simd_size;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
size_t workgroupSize_used;
ocl::Kernel kernel(kernel_name_.c_str(), program);
if (kernel.empty())
return false;
workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
if (workgroupSize_used != simd_size)
{
std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
std::cerr << " does not equal the size (" << simd_size << ") kernel source required." << std::endl;
std::cerr << " Skip this kernel " << kernel_name_ << std::endl;
unloadProgram(kernel_name_);
return false;
}
else
{
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
true, KERNEL_TYPE_INTEL_IDLF));
return true;
}
}
else
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth)
{
if (!dwconv_)
return false;
int workItemOutput[3] = { 1, 1, 1 };
size_t local_size[3] = { 1, 1, 1 };
size_t global_size[3];
global_size[0] = divUp(output_w_, workItemOutput[0]);
global_size[1] = divUp(output_h_, workItemOutput[1]);
global_size[2] = divUp(M_ * num_, workItemOutput[2]);
kernelType_ = KERNEL_TYPE_DWCONV;
blockM_ = blockWidth;
blockK_ = blockHeight;
blockN_ = blockDepth;
setupKernel();
ocl::Program program = compileKernel();
if (program.ptr())
{
kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0],
&workItemOutput[0], false, KERNEL_TYPE_DWCONV));
return true;
}
else
return false;
}
template<>
bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth)
{
kernelType_ = kernelType;
options_.str(""); options_.clear(); // clear contents and state flags
src_ = ocl::ProgramSource();
if (kernelType == KERNEL_TYPE_INTEL_IDLF)
return createIDLFKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_BASIC)
return createBasicKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
else if (kernelType == KERNEL_TYPE_DWCONV)
return createDWConvKernel(blockWidth, blockHeight, blockDepth);
else
CV_Assert(0 && "Internal error");
return false;
}
template<>
void OCL4DNNConvSpatial<float>::generate_gemmlike_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int blockN)
{
if (group_ != 1 || ((M_ % 8 != 0) || (M_ % 32 == 24)))
return;
if (blockM != 1 && blockM != 2)
return;
if (blockN != 32)
return;
if (blockK != 8 && blockK != 16)
return;
if (blockK == 16)
{
if ((blockM == 1 && (kernel_w_ > 4)) || M_ % 32 != 0)
return;
if ((blockM == 2) || M_ % 32 != 0)
return;
}
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN));
}
template<>
void OCL4DNNConvSpatial<float>::generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int simd_size)
{
int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
if (simd_size != 8 && simd_size != 16)
return;
if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
return;
if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
return;
int width_max, height_max, block_size_max;
width_max = 14;
height_max = 14;
block_size_max = 32;
if (blockM > width_max)
return;
if (blockK > height_max)
return;
if (blockM > output_w_)
return;
if (blockK > output_h_)
return;
// Only when the work items count is less than the device
// max work items or the M_ is less than 16, we will tune
// for simd 8.
if (simd_size == 8 && M_ >= 16 &&
((num_ * M_ * output_w_ * output_h_ / static_cast<float>(blockM * blockK)) >=
max_compute_units * 7 * 16))
return;
int actual_tile_x = kernel_w_ * dilation_w_ + (blockM - 1) * stride_w_ ;
int tile_x = alignSize(actual_tile_x, simd_size);
if (tile_x > simd_size)
return;
if (blockM * blockK > block_size_max)
return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, simd_size));
}
template<>
void OCL4DNNConvSpatial<float>::generate_dwconv_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
int blockM, int blockK, int blockN)
{
if (!dwconv_)
return;
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_DWCONV, blockM, blockK, blockN));
}
template<>
void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
{
if (ocl::Device::getDefault().intelSubgroupsSupport())
{
// depthwise kernel
generate_dwconv_tuneritems(tunerItems, 1, 1, 1);
if (tunerItems.size() > 0 && group_ > 8)
return;
// gemm like kernel
generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
// idlf kernel
for (int simd_size = 8; simd_size <= 16; simd_size += 8)
{
int width_max, height_max;
width_max = 14;
height_max = 14;
for (uint32_t width = width_max; width > 0; width--)
{
for (uint32_t height = height_max; height > 0; height--)
{
generate_idlf_tuneritems(tunerItems, width, height, simd_size);
}
}
}
}
}
template<>
void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
UMat &top,
const UMat &weight,
const UMat &bias,
int32_t numImages,
UMat &verifyTop)
{
std::vector< cv::Ptr<tunerParam> > tunerItems;
generateTunerItems(tunerItems);
tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));
for (int i = 0; i < tunerItems.size(); i++) {
if (createConvolutionKernel(tunerItems[i]->kernelType,
tunerItems[i]->blockWidth,
tunerItems[i]->blockHeight,
tunerItems[i]->blockDepth)) {
int kernelIdx = kernelQueue.size() - 1;
if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) {
bestKernelConfig = kernelQueue[kernelIdx];
if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
if (!swizzled_weights_umat.empty())
swizzled_weights_umat.release();
for (int32_t j = 0; j < kernelIdx; j++) {
CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
unloadProgram(kernelQueue[j]->kernelName);
}
kernelQueue.clear();
tuned_ = true;
break;
}
}
}
}
template<>
void OCL4DNNConvSpatial<float>::cacheTunedConfig()
{
if (tuned_)
{
cv::AutoLock lock(kernelConfigMutex);
std::stringstream outputKernel;
outputKernel << bestKernelConfig->workItem_output[0] << " "
<< bestKernelConfig->workItem_output[1] << " "
<< bestKernelConfig->workItem_output[2] << " "
<< bestKernelConfig->kernelType << " "
<< bestKernelConfig->local_work_size[0] << " "
<< bestKernelConfig->local_work_size[1] << " "
<< bestKernelConfig->local_work_size[2] << " "
<< bestKernelConfig->swizzle_weights << " "
<< bestKernelConfig->use_null_local << " ";
kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
}
}
template<>
void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
UMat &top,
const UMat &weight,
const UMat &bias,
int32_t numImages,
UMat &verifyTop)
{
std::vector< cv::Ptr<tunerParam> > tunerItems;
generateTunerItems(tunerItems);
for (int i = 0; i < tunerItems.size(); i++)
createConvolutionKernel(tunerItems[i]->kernelType,
tunerItems[i]->blockWidth,
tunerItems[i]->blockHeight,
tunerItems[i]->blockDepth);
for (int32_t x = 0; x < kernelQueue.size(); x++) {
kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages,
kernelQueue[x]);
#ifdef TEST_ALL_KERNELS
if (kernelQueue[x]->tested == false) {
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
if (verified == false) {
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification");
CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: "
<< kernelQueue[x]->workItem_output[0] << " "
<< "kernelQueue[x]->workItem_output[1]: "
<< kernelQueue[x]->workItem_output[1] << " "
<< "kernelQueue[x]->workItem_output[2]: "
<< kernelQueue[x]->workItem_output[2] << " "
<< "kernelQueue[x]->kernelType: "
<< kernelQueue[x]->kernelType << " "
<< "kernelQueue[x]->global_work_size[0]: "
<< kernelQueue[x]->global_work_size[0] << " "
<< "kernelQueue[x]->global_work_size[1]: "
<< kernelQueue[x]->global_work_size[1] << " "
<< "kernelQueue[x]->global_work_size[2]: "
<< kernelQueue[x]->global_work_size[2] << " "
<< "kernelQueue[x]->local_work_size[0]: "
<< kernelQueue[x]->local_work_size[0] << " "
<< "kernelQueue[x]->local_work_size[1]: "
<< kernelQueue[x]->local_work_size[1] << " "
<< "kernelQueue[x]->local_work_size[2]: "
<< kernelQueue[x]->local_work_size[2] << " "
<< kernelQueue[x]->swizzle_weights << " "
<< kernelQueue[x]->use_null_local);
} else {
CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass verification");
}
}
#endif
}
int32_t failures = 0;
bool verification = false;
if (kernelQueue.size()) {
while (failures < kernelQueue.size()) {
int32_t fastestKernel = -1;
float fastestTime = std::numeric_limits<float>::infinity();
for (int32_t x = 0; x < kernelQueue.size(); x++) {
if (kernelQueue[x]->executionTime < fastestTime &&
kernelQueue[x]->tested == false) {
fastestKernel = x;
fastestTime = kernelQueue[x]->executionTime;
}
}
if (fastestKernel < 0) break;
// Test fastest kernel
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
if (verified == true) {
kernelQueue[fastestKernel]->verified = true;
kernel_index_ = fastestKernel;
verification = true;
break;
} else {
kernelQueue[fastestKernel]->tested = true;
CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName <<
" failed verification");
failures++;
}
}
}
if (verification) {
CV_LOG_INFO(NULL, "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
"> passed verification");
CV_LOG_INFO(NULL, "Convolution Time:" << kernelQueue[kernel_index_]->executionTime);
double out_w = output_w_;
double out_h = output_h_;
double out_z = M_;
double k_w = kernel_w_;
double k_h = kernel_h_;
double k_z = channels_;
float elapsedTime = kernelQueue[kernel_index_]->executionTime;
double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
CV_LOG_INFO(NULL, "\tEstimated Gflops:" << (totalFlops * 1e-9));
CV_LOG_INFO(NULL, "\tEstimated GFLOPS/S: " << ((totalFlops * 1e-9)*(1000.0/elapsedTime)));
} else {
CV_LOG_INFO(NULL, "fallback to basic kernel");
options_.str(""); options_.clear(); // clear contents and state flags
createBasicKernel(1, 1, 1);
kernel_index_ = kernelQueue.size() - 1;
}
this->bestKernelConfig = kernelQueue[kernel_index_];
if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
if (!swizzled_weights_umat.empty())
swizzled_weights_umat.release();
for (int32_t x = 0; x < kernelQueue.size(); x++) {
if (x != kernel_index_) {
CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
unloadProgram(kernelQueue[x]->kernelName);
}
}
kernelQueue.clear();
tuned_ = true;
saveTunedConfig();
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
{
CV_Assert(tuned_);
if (!use_cache_path_ || cache_path_.empty())
return;
std::string outputFile;
outputFile = cache_path_ + "/" + key_sanitized_;
std::ofstream outputKernel;
outputKernel.open(outputFile.c_str());
outputKernel << bestKernelConfig->workItem_output[0] << " "
<< bestKernelConfig->workItem_output[1] << " "
<< bestKernelConfig->workItem_output[2] << " "
<< bestKernelConfig->kernelType << " "
<< bestKernelConfig->local_work_size[0] << " "
<< bestKernelConfig->local_work_size[1] << " "
<< bestKernelConfig->local_work_size[2] << " "
<< bestKernelConfig->swizzle_weights << " "
<< bestKernelConfig->use_null_local << " ";
outputKernel.close();
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
const UMat &weight, const UMat &bias,
int32_t numImages)
{
std::string previous_key = key_;
generateKey();
if (key_.compare(previous_key) == 0 && bestKernelConfig != NULL)
return;
if (bestKernelConfig)
{
prev_kernel_type_ = bestKernelConfig->kernelType;
CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
phash.erase(bestKernelConfig->kernelName);
bestKernelConfig.release();
}
if (loadCachedConfig()) // check in-memory cache
return;
if (loadTunedConfig()) // check external storage
return;
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
if (run_auto_tuning_ || force_auto_tuning_)
{
setupConvolution(bottom, top, weight, bias, numImages, benchData);
}
else
{
useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
}
cacheTunedConfig();
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
{
cv::AutoLock lock(kernelConfigMutex);
if (!defaultConfigLoaded && !force_auto_tuning_)
initializeGlobalBuiltinConfigurations((use_cache_path_ && !cache_path_.empty()) ? (cache_path_ + '/') : std::string());
kernel_hash_t::iterator it = kernelConfigMap.find(key_);
if (it != kernelConfigMap.end())
{
int32_t x, y, z, type, lx, ly, lz;
bool swizzle, nullLocal;
std::stringstream cachedKernel(it->second);
if (cachedKernel)
{
cachedKernel >> x;
cachedKernel >> y;
cachedKernel >> z;
cachedKernel >> type;
cachedKernel >> lx;
cachedKernel >> ly;
cachedKernel >> lz;
cachedKernel >> swizzle;
cachedKernel >> nullLocal;
if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
tuned_ = true;
return true;
}
}
}
return false;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
int lx, int ly, int lz,
bool swizzle, bool nullLocal)
{
if (type == KERNEL_TYPE_INTEL_IDLF)
{
if (z == 1)
z = 16;
CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
}
kernelQueue.clear();
createConvolutionKernel(type, x, y, z);
if (kernelQueue.size() != 1) {
std::cerr << "Failed setup kernel by config:"
<< " x = " << x
<< " y = " << y
<< " z = " << z
<< " type = " << type
<< std::endl;
return false;
}
bestKernelConfig = kernelQueue[0];
kernelQueue.clear();
bestKernelConfig->local_work_size[0] = lx;
bestKernelConfig->local_work_size[1] = ly;
bestKernelConfig->local_work_size[2] = lz;
bestKernelConfig->swizzle_weights = swizzle;
bestKernelConfig->use_null_local = nullLocal;
// If kernel type changed to type 2 or 4, we need to reset the swizzled
// weights pointer to invalidate the previous swizzled weights data.
if (prev_kernel_type_ != bestKernelConfig->kernelType &&
(bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
{
if (!swizzled_weights_umat.empty())
swizzled_weights_umat.release();
}
return true;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
{
if (force_auto_tuning_)
return false; // don't load results from external storage
if (!use_cache_path_)
{
if (cache_path_.empty())
{
static int warn_ = 0;
if (!warn_)
{
std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
<< " via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
warn_ = true;
}
}
return false;
}
int32_t x, y, z, type, lx, ly, lz;
bool swizzle, nullLocal;
// Find cached kernel configuration from file
std::string cacheFile = cache_path_ + "/" + key_sanitized_;
std::ifstream cachedKernel(cacheFile.c_str());
if (cachedKernel)
{
cachedKernel >> x;
cachedKernel >> y;
cachedKernel >> z;
cachedKernel >> type;
cachedKernel >> lx;
cachedKernel >> ly;
cachedKernel >> lz;
cachedKernel >> swizzle;
cachedKernel >> nullLocal;
if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
tuned_ = true;
return true;
}
}
return false;
}
template class OCL4DNNConvSpatial<float>;
}}} // namespace cv::dnn::ocl4dnn