Merge pull request #9418 from borisfom:cuda9
CUDA9 build fixed, added detection (#9418) * CUDA9 build fixed, added detection * Replacing deprecated __shfl_xxx with __shfl_sync, fixing bogus CUDA9 warnings
This commit is contained in:
parent
d0509f6702
commit
c48807c383
@ -195,8 +195,8 @@ OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
|
||||
OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) )
|
||||
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) )
|
||||
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) )
|
||||
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (NOT IOS AND NOT WINRT) )
|
||||
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (NOT IOS AND NOT APPLE) )
|
||||
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" ON IF (NOT IOS AND NOT WINRT) )
|
||||
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" ON IF (NOT IOS AND NOT APPLE) )
|
||||
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON IF (NOT WINRT) )
|
||||
OCV_OPTION(WITH_VFW "Include Video for Windows support" ON IF WIN32 )
|
||||
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
|
||||
|
||||
@ -790,8 +790,18 @@ endif()
|
||||
if(CUDA_VERSION VERSION_GREATER "5.0")
|
||||
# In CUDA 5.5 NPP was split into 3 separate libraries.
|
||||
find_cuda_helper_libs(nppc)
|
||||
find_cuda_helper_libs(nppi)
|
||||
find_cuda_helper_libs(nppial)
|
||||
find_cuda_helper_libs(nppicc)
|
||||
find_cuda_helper_libs(nppicom)
|
||||
find_cuda_helper_libs(nppidei)
|
||||
find_cuda_helper_libs(nppif)
|
||||
find_cuda_helper_libs(nppig)
|
||||
find_cuda_helper_libs(nppim)
|
||||
find_cuda_helper_libs(nppist)
|
||||
find_cuda_helper_libs(nppisu)
|
||||
find_cuda_helper_libs(nppitc)
|
||||
find_cuda_helper_libs(npps)
|
||||
set(CUDA_nppi_LIBRARY "${CUDA_nppial_LIBRARY};${CUDA_nppicc_LIBRARY};${CUDA_nppicom_LIBRARY};${CUDA_nppidei_LIBRARY};${CUDA_nppif_LIBRARY};${CUDA_nppig_LIBRARY};${CUDA_nppim_LIBRARY};${CUDA_nppist_LIBRARY};${CUDA_nppisu_LIBRARY};${CUDA_nppitc_LIBRARY}")
|
||||
set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}")
|
||||
elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
|
||||
find_cuda_helper_libs(npp)
|
||||
|
||||
@ -43,7 +43,7 @@ if(CUDA_FOUND)
|
||||
|
||||
message(STATUS "CUDA detected: " ${CUDA_VERSION})
|
||||
|
||||
set(_generations "Fermi" "Kepler" "Maxwell" "Pascal")
|
||||
set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta")
|
||||
if(NOT CMAKE_CROSSCOMPILING)
|
||||
list(APPEND _generations "Auto")
|
||||
endif()
|
||||
@ -70,6 +70,8 @@ if(CUDA_FOUND)
|
||||
set(__cuda_arch_bin "5.0 5.2")
|
||||
elseif(CUDA_GENERATION STREQUAL "Pascal")
|
||||
set(__cuda_arch_bin "6.0 6.1")
|
||||
elseif(CUDA_GENERATION STREQUAL "Volta")
|
||||
set(__cuda_arch_bin "7.0")
|
||||
elseif(CUDA_GENERATION STREQUAL "Auto")
|
||||
execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
|
||||
WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
|
||||
@ -94,17 +96,17 @@ if(CUDA_FOUND)
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(NOT _nvcc_res EQUAL 0)
|
||||
message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
|
||||
set(__cuda_arch_bin "5.3 6.2")
|
||||
set(__cuda_arch_bin "5.3 6.2 7.0")
|
||||
else()
|
||||
set(__cuda_arch_bin "${_nvcc_out}")
|
||||
string(REPLACE "2.1" "2.1(2.0)" __cuda_arch_bin "${__cuda_arch_bin}")
|
||||
endif()
|
||||
set(__cuda_arch_ptx "")
|
||||
else()
|
||||
if(${CUDA_VERSION} VERSION_LESS "8.0")
|
||||
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2")
|
||||
else()
|
||||
if(${CUDA_VERSION} VERSION_LESS "9.0")
|
||||
set(__cuda_arch_bin "2.0 3.0 3.5 3.7 5.0 5.2 6.0 6.1")
|
||||
else()
|
||||
set(__cuda_arch_bin "3.0 3.5 3.7 5.0 5.2 6.0 6.1 7.0")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -58,6 +58,14 @@
|
||||
#ifdef HAVE_CUDA
|
||||
# include <cuda.h>
|
||||
# include <cuda_runtime.h>
|
||||
# if defined (__GNUC__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
||||
# include <cuda_fp16.h>
|
||||
# pragma GCC diagnostic pop
|
||||
# else
|
||||
# include <cuda_fp16.h>
|
||||
# endif /* __GNUC__ */
|
||||
# include <npp.h>
|
||||
# include "opencv2/core/cuda_stream_accessor.hpp"
|
||||
# include "opencv2/core/cuda/common.hpp"
|
||||
|
||||
@ -56,7 +56,7 @@
|
||||
#include "opencv2/core/private.cuda.hpp"
|
||||
|
||||
#ifdef HAVE_NVCUVID
|
||||
#include <nvcuvid.h>
|
||||
#include <dynlink_nvcuvid.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define NOMINMAX
|
||||
|
||||
@ -47,6 +47,7 @@
|
||||
#define OPENCV_CUDEV_UTIL_SATURATE_CAST_HPP
|
||||
|
||||
#include "../common.hpp"
|
||||
#include "opencv2/core/private.cuda.hpp"
|
||||
|
||||
namespace cv { namespace cudev {
|
||||
|
||||
@ -274,12 +275,21 @@ template <typename T, typename D> __device__ __forceinline__ D cast_fp16(T v);
|
||||
|
||||
// Specialization of cast_fp16 for short -> float: treats `v` as the storage of
// a half-precision value and returns it widened to float.
//
// With toolkits older than CUDA 9, `v` is handed directly to __half2float.
// From CUDA 9 on, the storage of `v` is viewed as a __half object and that
// object is converted to float.
// NOTE(review): presumably the split exists because __half became a distinct
// class type in CUDA 9 -- confirm against cuda_fp16.h.
template <> __device__ __forceinline__ float cast_fp16<short, float>(short v)
{
#if __CUDACC_VER_MAJOR__ < 9
    return __half2float(v);
#else
    // View the 16-bit storage of `v` as a half without changing any bits.
    const __half* halfBits = reinterpret_cast<const __half*>(&v);
    return float(*halfBits);
#endif
}
|
||||
|
||||
// Specialization of cast_fp16 for float -> short: converts `v` to half
// precision and returns the 16-bit pattern of that half as a short.
//
// With CUDA 9 and newer, a __half object is constructed from `v` and its
// storage is returned. With older toolkits, the result of __float2half_rn(v)
// is cast to short.
template <> __device__ __forceinline__ short cast_fp16<float, short>(float v)
{
#if __CUDACC_VER_MAJOR__ >= 9
    __half h(v);
    // Read the bits of the converted half `h`, not of the float argument `v`:
    // reinterpreting &v would return two bytes of the float's own encoding
    // instead of the half-precision result.
    return *(short*)&h;
#else
    return (short)__float2half_rn(v);
#endif
}
|
||||
//! @}
|
||||
|
||||
|
||||
@ -56,8 +56,14 @@ namespace cv { namespace cudev {
|
||||
|
||||
#if CV_CUDEV_ARCH >= 300
|
||||
|
||||
// shfl
|
||||
// When compiling with CUDA 9 or newer, map the mask-less shuffle names onto
// the corresponding *_sync intrinsics, always passing 0xFFFFFFFFU as the
// participation mask (all 32 bits set).
// NOTE(review): a full mask presumes every lane of the warp reaches each
// shuffle call -- confirm this holds for all callers of these wrappers.
#if __CUDACC_VER_MAJOR__ >= 9
|
||||
# define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
|
||||
# define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
|
||||
# define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
|
||||
# define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
|
||||
#endif
|
||||
|
||||
// shfl
|
||||
__device__ __forceinline__ uchar shfl(uchar val, int srcLane, int width = warpSize)
|
||||
{
|
||||
return (uchar) __shfl((int) val, srcLane, width);
|
||||
@ -419,6 +425,10 @@ CV_CUDEV_SHFL_XOR_VEC_INST(float)
|
||||
CV_CUDEV_SHFL_XOR_VEC_INST(double)
|
||||
|
||||
#undef CV_CUDEV_SHFL_XOR_VEC_INST
|
||||
#undef __shfl
|
||||
#undef __shfl_xor
|
||||
#undef __shfl_up
|
||||
#undef __shfl_down
|
||||
|
||||
#endif // CV_CUDEV_ARCH >= 300
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user