diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 8c9c075f6a..34450cd879 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -54,40 +54,30 @@ namespace cv { namespace gpu { namespace device __global__ void buildPointList(const DevMem2Db src, unsigned int* list) { - const int x = blockIdx.x * 32 * PIXELS_PER_THREAD + threadIdx.x; - const int y = blockIdx.y * 4 + threadIdx.y; + __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; + __shared__ unsigned int s_qsize[4]; + __shared__ unsigned int s_start[4]; + + const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; if (y >= src.rows) return; - volatile int qindex = -1; - __shared__ volatile int s_qindex[4]; - __shared__ volatile int s_qstart[4]; - s_qindex[threadIdx.y] = -1; + if (threadIdx.x == 0) + s_qsize[threadIdx.y] = 0; - __shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD]; + __syncthreads(); // fill the queue - for (int i = 0; i < PIXELS_PER_THREAD; ++i) + for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x) { - const int xx = i * blockDim.x + x; - - if (xx >= src.cols) - break; - if (src(y, xx)) { - const unsigned int queue_val = (y << 16) | xx; - - do { - qindex++; - s_qindex[threadIdx.y] = qindex; - s_queue[threadIdx.y][qindex] = queue_val; - } while (s_queue[threadIdx.y][qindex] != queue_val); + const unsigned int val = (y << 16) | xx; + int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1)); + s_queues[threadIdx.y][qidx] = val; } - - // reload index from smem (last thread to write to smem will have updated it) - qindex = s_qindex[threadIdx.y]; } __syncthreads(); @@ -96,31 +86,27 @@ namespace cv { namespace gpu { namespace device if (threadIdx.x == 0 && threadIdx.y == 0) { // find how many items are stored in each list - int total_index = 0; - #pragma unroll - for (int i = 0; i < 4; ++i) + unsigned int total_size = 0; + for (int i = 0; i < blockDim.y; ++i) { - s_qstart[i] = total_index; - total_index += (s_qindex[i] + 1u); + s_start[i] = total_size; + total_size += s_qsize[i]; } //calculate the offset in the global list - const unsigned int global_offset = atomicAdd(&g_counter, total_index); - #pragma unroll - for (int i = 0; i < 4; ++i) - s_qstart[i] += global_offset; + const unsigned int global_offset = atomicAdd(&g_counter, total_size); + for (int i = 0; i < blockDim.y; ++i) + s_start[i] += global_offset; } __syncthreads(); // copy local queues to global queue - for(int i = 0; i <= qindex; i += 32) + const unsigned int qsize = s_qsize[threadIdx.y]; + for(int i = threadIdx.x; i < qsize; i += blockDim.x) { - if(i + threadIdx.x > qindex) - break; - - unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x]; - list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue; + unsigned int val = s_queues[threadIdx.y][i]; + list[s_start[threadIdx.y] + i] = val; } } diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index f4d4399d26..e78637c69e 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -61,8 +61,24 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, CV_Assert(src.rows < std::numeric_limits::max()); ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); - unsigned int count = buildPointList_gpu(src, buf.ptr()); +// unsigned int count = 0; +// { +// cv::Mat h_src(src); +// cv::Mat h_buf(1, src.size().area(), CV_32SC1); +// for (int y = 0; y < h_src.rows; ++y) +// { +// for (int x = 0; x < h_src.cols; ++x) +// { +// if (h_src.at(y, x)) +// { +// const unsigned int val = (y << 16) | x; +// h_buf.ptr()[count++] = val; +// } +// } +// } +// buf.upload(h_buf); +// } const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); @@ -70,7 +86,8 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); - linesAccum_gpu(buf.ptr(), count, accum, rho, theta); + if (count > 0) + linesAccum_gpu(buf.ptr(), count, accum, rho, theta); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)