diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 12814e179f..2cd5cac030 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -333,8 +333,12 @@ public: void* ptr() const; static Queue& getDefault(); + /// @brief Returns OpenCL command queue with enable profiling mode support + const Queue& getProfilingQueue() const; + + struct Impl; friend struct Impl; + inline Impl* getImpl() const { return p; } protected: - struct Impl; Impl* p; }; @@ -569,6 +573,12 @@ public: size_t localsize[], bool sync, const Queue& q=Queue()); bool runTask(bool sync, const Queue& q=Queue()); + /** @brief Similar to synchronized run() call with returning of kernel execution time + * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE) + * @return Execution time in nanoseconds or negative number on error + */ + int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue()); + size_t workGroupSize() const; size_t preferedWorkGroupSizeMultiple() const; bool compileWorkGroupSize(size_t wsz[]) const; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 8fea1d2b1e..447c52fa22 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1840,9 +1840,35 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* _context, v struct Queue::Impl { - Impl(const Context& c, const Device& d) + inline void __init() { refcount = 1; + handle = 0; + isProfilingQueue_ = false; + } + + Impl(cl_command_queue q) + { + __init(); + handle = q; + + cl_command_queue_properties props = 0; + cl_int result = clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL); + CV_Assert(result && "clGetCommandQueueInfo(CL_QUEUE_PROPERTIES)"); + isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE); + } + + Impl(cl_command_queue q, bool isProfilingQueue) + { + __init(); + handle = q; + isProfilingQueue_ = isProfilingQueue; + } + + Impl(const Context& c, const Device& d, bool withProfiling = false) + { + __init(); + const Context* pc = &c; cl_context ch = (cl_context)pc->ptr(); if( !ch ) @@ -1854,8 +1880,10 @@ struct Queue::Impl if( !dh ) dh = (cl_device_id)pc->device(0).ptr(); cl_int retval = 0; - handle = clCreateCommandQueue(ch, dh, 0, &retval); + cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0; + handle = clCreateCommandQueue(ch, dh, props, &retval); CV_OclDbgAssert(retval == CL_SUCCESS); + isProfilingQueue_ = withProfiling; } ~Impl() @@ -1873,9 +1901,37 @@ struct Queue::Impl } } + const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self) + { + if (isProfilingQueue_) + return self; + + if (profiling_queue_.ptr()) + return profiling_queue_; + + cl_context ctx = 0; + CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL)); + + cl_device_id device = 0; + CV_Assert(CL_SUCCESS == clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL)); + + cl_int result = CL_SUCCESS; + cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; + cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result); + CV_Assert(result == CL_SUCCESS && "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)"); + + Queue queue; + queue.p = new Impl(q, true); + profiling_queue_ = queue; + + return profiling_queue_; + } + IMPLEMENT_REFCOUNTABLE(); cl_command_queue handle; + bool isProfilingQueue_; + cv::ocl::Queue profiling_queue_; }; Queue::Queue() @@ -1929,6 +1985,12 @@ void Queue::finish() } } +const Queue& Queue::getProfilingQueue() const +{ + CV_Assert(p); + return p->getProfilingQueue(*this); +} + void* Queue::ptr() const { return p ? p->handle : 0; @@ -2032,6 +2094,9 @@ struct Kernel::Impl release(); } + bool run(int dims, size_t _globalsize[], size_t _localsize[], + bool sync, int64* timeNS, const Queue& q); + ~Impl() { if(handle) @@ -2259,19 +2324,15 @@ int Kernel::set(int i, const KernelArg& arg) return i+1; } - bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], bool sync, const Queue& q) { - CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str()); - - if(!p || !p->handle || p->isInProgress) + if (!p) return false; - cl_command_queue qq = getQueue(q); size_t globalsize[CV_MAX_DIM] = {1,1,1}; size_t total = 1; - CV_Assert(_globalsize != 0); + CV_Assert(_globalsize != NULL); for (int i = 0; i < dims; i++) { size_t val = _localsize ? _localsize[i] : @@ -2283,12 +2344,28 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val; } CV_Assert(total > 0); - if( p->haveTempDstUMats ) + + return p->run(dims, globalsize, _localsize, sync, NULL, q); +} + + +bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[], + bool sync, int64* timeNS, const Queue& q) +{ + CV_INSTRUMENT_REGION_OPENCL_RUN(p->name.c_str()); + + if (!handle || isInProgress) + return false; + + cl_command_queue qq = getQueue(q); + if (haveTempDstUMats) + sync = true; + if (timeNS) sync = true; cl_event asyncEvent = 0; - cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, - NULL, globalsize, _localsize, 0, 0, - sync ? 0 : &asyncEvent); + cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims, + NULL, globalsize, localsize, 0, 0, + (sync && !timeNS) ? 0 : &asyncEvent); #if CV_OPENCL_SHOW_RUN_ERRORS if (retval != CL_SUCCESS) { @@ -2296,16 +2373,31 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], fflush(stdout); } #endif - if( sync || retval != CL_SUCCESS ) + if (sync || retval != CL_SUCCESS) { CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); - p->cleanupUMats(); + if (timeNS) + { + if (retval == CL_SUCCESS) + { + clWaitForEvents(1, &asyncEvent); + cl_ulong startTime, stopTime; + CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL)); + CV_Assert(CL_SUCCESS == clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL)); + *timeNS = (int64)(stopTime - startTime); + } + else + { + *timeNS = -1; + } + } + cleanupUMats(); } else { - p->addref(); - p->isInProgress = true; - CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS); + addref(); + isInProgress = true; + CV_OclDbgAssert(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this) == CL_SUCCESS); } if (asyncEvent) clReleaseEvent(asyncEvent); @@ -2336,6 +2428,17 @@ bool Kernel::runTask(bool sync, const Queue& q) return retval == CL_SUCCESS; } +int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_) +{ + CV_Assert(p && p->handle && !p->isInProgress); + Queue q = q_.ptr() ? q_ : Queue::getDefault(); + CV_Assert(q.ptr()); + q.finish(); // call clFinish() on base queue + Queue profilingQueue = q.getProfilingQueue(); + int64 timeNs = -1; + bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue); + return res ? timeNs : -1; +} size_t Kernel::workGroupSize() const { diff --git a/modules/core/src/ocl_deprecated.hpp b/modules/core/src/ocl_deprecated.hpp index 6bf426b9a5..3cf261b8e7 100644 --- a/modules/core/src/ocl_deprecated.hpp +++ b/modules/core/src/ocl_deprecated.hpp @@ -759,15 +759,15 @@ OCL_FUNC_P(cl_mem, clCreateBuffer, /* OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue)) - +*/ OCL_FUNC(cl_int, clGetCommandQueueInfo, - (cl_command_queue command_queue, - cl_command_queue_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret), - (command_queue, param_name, param_value_size, param_value, param_value_size_ret)) - + (cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (command_queue, param_name, param_value_size, param_value, param_value_size_ret)) +/* OCL_FUNC_P(cl_mem, clCreateSubBuffer, (cl_mem buffer, cl_mem_flags flags, @@ -1202,6 +1202,19 @@ OCL_FUNC(cl_int, clSetEventCallback, OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event)) +OCL_FUNC(cl_int, clWaitForEvents, + (cl_uint num_events, const cl_event *event_list), + (num_events, event_list)) + + +OCL_FUNC(cl_int, clGetEventProfilingInfo, ( + cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret), + (event, param_name, param_value_size, param_value, param_value_size_ret)) + } #endif