// Patch summary: ocl4dnn convolution auto-tuning verification.
//
// modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
//  * Undefines the min/max macros inside the WIN32-only include block (presumably so the
//    std::min/std::max calls added to verifyResult() are not macro-expanded).
//  * Adds four run-time switches, each read once into a function-local static:
//      OPENCV_OCL4DNN_WORKAROUND_IDLF           bool,   default true  (skip selected IDLF kernels)
//      OPENCV_OCL4DNN_DUMP_FAILED_RESULT        bool,   default false (print a window of the outputs on mismatch)
//      OPENCV_OCL4DNN_TEST_ALL_KERNELS          size_t, default 0     (extra verification passes per kernel)
//      OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR  bool,   default false (raise instead of returning false)
//  * verifyResult(): sets config->tested right after the test convolution and config->verified on success,
//    so callers no longer set these flags. A relative L1 norm of the whole output gates the per-element scan;
//    the gate is written as !(rel_err < eps) so that a NaN norm also triggers the scan. Tolerance is
//    0.1 (half) or 0.01 (float) relative to max(1e-3, |expected|); NaN outputs always count as errors.
//    The kernel name and the first 10 mismatches are logged; all mismatches are counted.
//  * createIDLFKernel(): with the workaround enabled and intelSubgroupsSupport() reported, returns false
//    for pad_h_ != 0 && kernel_w_ <= simd_size && kernel_h_ <= 2 before compiling the kernel.
//  * useFirstAvailable() / setupConvolution(): run verifyResult() testAllKernels() extra times per kernel,
//    resetting tested/verified before each pass; exceptions are logged with the pass index and rethrown.
//
// modules/dnn/test/test_halide_layers.cpp
//  * Removes the condition that set skipCheck for the OpenCL targets in TEST_P(Convolution, Accuracy).
//
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index a3a0936bd4..2ec0f78f6f 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -60,6 +60,8 @@ #if defined WIN32 || defined _WIN32 #include #include +#undef min +#undef max #endif namespace cv { namespace dnn { namespace ocl4dnn { @@ -68,6 +70,30 @@ typedef std::map kernel_hash_t; static kernel_hash_t kernelConfigMap; static bool defaultConfigLoaded = false; +static bool enableWorkaroundIDLF() +{ + static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_WORKAROUND_IDLF", true); + return param; +} + +static bool dumpFailedResult() +{ + static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DUMP_FAILED_RESULT", false); + return param; +} + +static size_t testAllKernels() +{ + static size_t param = utils::getConfigurationParameterSizeT("OPENCV_OCL4DNN_TEST_ALL_KERNELS", 0); + return param; +} + +static bool raiseOnCheckError() +{ + static bool param = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_TUNING_RAISE_CHECK_ERROR", false); + return param; +} + static std::string sanitize(const std::string& s) { std::string s_ = s; @@ -1221,9 +1247,6 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, kernelConfig* config, UMat &verifyTop) { - - uint32_t verificationFail = 0; - if (config->verified) return true; else if (config->tested) @@ -1236,6 +1259,8 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, convolve(bottom, top, weight, bias, numImages, config); tuned_ = saved_tuned; + config->tested = true; + UMat new_top, new_verify_top; Mat mat_top, mat_verify_top; if (use_half_) @@ -1254,41 +1279,88 @@ bool OCL4DNNConvSpatial::verifyResult(const UMat &bottom, const float* data = mat_top.ptr(); const float* verify_data = mat_verify_top.ptr(); - for (int32_t n = 0; n < num_; ++n) { - for (int32_t g = 0; g < group_; ++g) { 
- int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; - for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++) - for (int h = 0; h < output_h_ && !verificationFail; h++) - for (int w = 0; w < output_w_; w++) { - size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; + int error_slice_offset = 0; + int error_slice = 0; + float relative_eps = use_half_ ? 0.1f : 0.01f; - float error_factor = fabs(data[offset] - verify_data[offset]); - if (use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && - error_factor > 0.04 && !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) - { - CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g - << " out_ch " << out_ch << " h " << h << " w " << w - << " got " << data[offset] << " expected " << verify_data[offset]); - verificationFail = 1; - goto out; + size_t errors = 0; + + double rel_err = norm(mat_top.reshape(1, 1), mat_verify_top.reshape(1, 1), NORM_L1 | NORM_RELATIVE); + if (!(rel_err < relative_eps)) /* negated '<' so a NaN norm (NaN in the output) still reaches the per-element check */ + { + for (int32_t n = 0; n < num_; ++n) { + for (int32_t g = 0; g < group_; ++g) { + int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g; + for (int out_ch = 0; out_ch < M_; out_ch++) + for (int h = 0; h < output_h_; h++) + for (int w = 0; w < output_w_; w++) { + size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w; + + bool has_error = !(data[offset] == data[offset]); // is NaN + if (!has_error) + { + float error_factor = std::fabs(data[offset] - verify_data[offset]); + float base_value_abs = std::max(1e-3f, std::fabs(verify_data[offset])); + has_error = error_factor > relative_eps * base_value_abs; + } + if (has_error) + { + if (errors == 0) + { + error_slice = (int)(offset / (output_w_ * output_h_)); + error_slice_offset = (int)(offset % (output_w_ * output_h_)); + CV_LOG_ERROR(NULL, "Kernel: " << config->kernelName); + } + if (errors < 10) + CV_LOG_ERROR(NULL, 
"test verification failed @ image " << n << " group " << g + << " out_ch " << out_ch << " h " << h << " w " << w + << " (offset: " << offset << ")" + << " got " << data[offset] << " expected " << verify_data[offset]); + errors++; + } } - else if (!use_half_ && error_factor > 0.1 * fabs(verify_data[offset]) && - !(fabs(verify_data[offset]) < 1.e-3 && error_factor < 1.e-4)) - { - CV_LOG_ERROR(NULL, "test verification failed @ image " << n << " group " << g - << " out_ch " << out_ch << " h " << h << " w " << w - << " got " << data[offset] << " expected " << verify_data[offset]); - verificationFail = 1; - goto out; - } - } + } } } -out: - if (verificationFail == 1) + + if (errors) + { + if (dumpFailedResult()) + { + try + { + int n_outputs = (int)(mat_top.size[0]*mat_top.size[1]); + int slice_size = (int)(mat_top.total() / n_outputs); + Rect roi(0, 0, slice_size, n_outputs); + roi.width = std::min(roi.width, 32); + roi.height = std::min(roi.height, 16); + roi.x = std::max(0, std::min(slice_size - roi.width, error_slice_offset - roi.width/2)); + roi.y = std::max(0, std::min(n_outputs - roi.height, error_slice - roi.height/2)); + std::cout << "roi = " << roi << " errors=" << errors << std::endl; + std::cout << "mat_top = " << shape(mat_top) << std::endl + << mat_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl; + std::cout << "verify_top = " << shape(mat_verify_top) << std::endl + << mat_verify_top.reshape(1, 1).reshape(1, n_outputs)(roi) << std::endl; + } + catch (const std::exception& e) + { + CV_LOG_ERROR(NULL, "Results dump failed: " << e.what()); + } + catch (...) 
+ { + CV_LOG_ERROR(NULL, "Results dump failed"); + } + } + + if (raiseOnCheckError()) + CV_Error_(Error::StsError, ("ocl4dnn tuning verification failed: %s (errors %lld)", config->kernelName.c_str(), (long long int)errors)); return false; + } else + { + config->verified = true; return true; + } } template @@ -1408,6 +1480,17 @@ bool OCL4DNNConvSpatial::createIDLFKernel(int32_t blockWidth, setupKernel(); + if (enableWorkaroundIDLF() && ocl::Device::getDefault().intelSubgroupsSupport()) + { + // Issues are observed with these kernels: 3x1 (covered by tests), 2x1, 4x1, 5x1, 3x2 + // kernels 1x3, 3x3, 2x3 are good + if (pad_h_ != 0 && kernel_w_ <= simd_size && kernel_h_ <= 2) + { + CV_LOG_INFO(NULL, "DNN(workaround): skip IDLF kernel: " << kernel_name_); + return false; + } + } + ocl::Program program = compileKernel(); if (program.ptr()) { @@ -1623,13 +1706,38 @@ void OCL4DNNConvSpatial::useFirstAvailable(const UMat &bottom, generateTunerItems(tunerItems); tunerItems.push_back(makePtr(KERNEL_TYPE_BASIC, 1, 1, 1)); - for (int i = 0; i < tunerItems.size(); i++) { + for (int i = 0; i < tunerItems.size(); i++) + { if (createConvolutionKernel(tunerItems[i]->kernelType, tunerItems[i]->blockWidth, tunerItems[i]->blockHeight, - tunerItems[i]->blockDepth)) { + tunerItems[i]->blockDepth)) + { int kernelIdx = kernelQueue.size() - 1; - if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) { + kernelConfig* config = kernelQueue[kernelIdx].get(); + bool failed = false; + const size_t testCount = testAllKernels(); + for (size_t t = 0; t < testCount; t++) + { + try + { + config->tested = false; + config->verified = false; + if (!verifyResult(bottom, top, weight, bias, numImages, config, verifyTop)) + { + CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); + failed = true; + break; + } + } + catch (...) 
+ { + CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); + throw; + } + } + if (!failed && verifyResult(bottom, top, weight, bias, numImages, config, verifyTop)) + { bestKernelConfig = kernelQueue[kernelIdx]; if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE) @@ -1685,42 +1793,50 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, tunerItems[i]->blockHeight, tunerItems[i]->blockDepth); - for (int32_t x = 0; x < kernelQueue.size(); x++) { - kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages, - kernelQueue[x]); - #ifdef TEST_ALL_KERNELS - if (kernelQueue[x]->tested == false) { - bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop); - if (verified == false) { - CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[x]->kernelName << " failed verification"); - CV_LOG_ERROR(NULL, "kernelQueue[x]->workItem_output[0]: " - << kernelQueue[x]->workItem_output[0] << " " - << "kernelQueue[x]->workItem_output[1]: " - << kernelQueue[x]->workItem_output[1] << " " - << "kernelQueue[x]->workItem_output[2]: " - << kernelQueue[x]->workItem_output[2] << " " - << "kernelQueue[x]->kernelType: " - << kernelQueue[x]->kernelType << " " - << "kernelQueue[x]->global_work_size[0]: " - << kernelQueue[x]->global_work_size[0] << " " - << "kernelQueue[x]->global_work_size[1]: " - << kernelQueue[x]->global_work_size[1] << " " - << "kernelQueue[x]->global_work_size[2]: " - << kernelQueue[x]->global_work_size[2] << " " - << "kernelQueue[x]->local_work_size[0]: " - << kernelQueue[x]->local_work_size[0] << " " - << "kernelQueue[x]->local_work_size[1]: " - << kernelQueue[x]->local_work_size[1] << " " - << "kernelQueue[x]->local_work_size[2]: " - << kernelQueue[x]->local_work_size[2] << " " - << kernelQueue[x]->swizzle_weights << " " - << kernelQueue[x]->use_null_local); - } else { - CV_LOG_INFO(NULL, "Kernel " << kernelQueue[x]->kernelName << " pass 
verification"); + const size_t testCount = testAllKernels(); + for (int32_t x = 0; x < kernelQueue.size(); x++) + { + kernelConfig* config = kernelQueue[x].get(); + config->executionTime = timedConvolve(bottom, top, weight, bias, numImages, config); + for (size_t t = 0; t < testCount; t++) + { + try + { + config->tested = false; + config->verified = false; + bool verified = verifyResult(bottom, top, weight, bias, numImages, config, verifyTop); + if (verified == false) + { + CV_LOG_ERROR(NULL, "Kernel " << config->kernelName << " failed verification"); + CV_LOG_ERROR(NULL, "workItem=" + << config->workItem_output[0] << "," + << config->workItem_output[1] << "," + << config->workItem_output[2] << " " + << "kernelType: " << config->kernelType << " " + << "global_work_size=" + << config->global_work_size[0] << "," + << config->global_work_size[1] << "," + << config->global_work_size[2] << " " + << "local_work_size=" + << config->local_work_size[0] << "," + << config->local_work_size[1] << "," + << config->local_work_size[2] << " " + << config->swizzle_weights << " " + << config->use_null_local); + } + else + { + CV_LOG_VERBOSE(NULL, 0, "Kernel " << config->kernelName << " pass verification"); + } + } + catch (...) 
+ { + CV_LOG_ERROR(NULL, "Failed on test iteration: " << t); + throw; } } - #endif } + int32_t failures = 0; bool verification = false; if (kernelQueue.size()) { @@ -1739,12 +1855,10 @@ void OCL4DNNConvSpatial::setupConvolution(const UMat &bottom, // Test fastest kernel bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop); if (verified == true) { - kernelQueue[fastestKernel]->verified = true; kernel_index_ = fastestKernel; verification = true; break; } else { - kernelQueue[fastestKernel]->tested = true; CV_LOG_ERROR(NULL, "Kernel " << kernelQueue[fastestKernel]->kernelName << " failed verification"); failures++; diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index e9b219fafe..1be30cda64 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -99,14 +99,6 @@ TEST_P(Convolution, Accuracy) #endif bool skipCheck = false; - if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV && - (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) && - ( - (kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1)) || - (stride.area() > 1 && !(pad.width == 0 && pad.height == 0)) - ) - ) - skipCheck = true; int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width}; Mat weights(4, &sz[0], CV_32F);