diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index 8c9c075f6a..34450cd879 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -54,40 +54,39 @@ namespace cv { namespace gpu { namespace device
 
+        // Collects the coordinates of all non-zero pixels of src into list, each packed as
+        // (y << 16) | x. Every block row (threadIdx.y) fills its own shared-memory queue, then
+        // the queues are copied to a block-wide range of list reserved through g_counter.
+        // NOTE(review): the shared arrays assume blockDim.x <= 32 and blockDim.y <= 4 -- confirm
+        // against the launch configuration.
         __global__ void buildPointList(const DevMem2Db src, unsigned int* list)
         {
-            const int x = blockIdx.x * 32 * PIXELS_PER_THREAD + threadIdx.x;
-            const int y = blockIdx.y * 4 + threadIdx.y;
+            __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
+            __shared__ unsigned int s_qsize[4];
+            __shared__ unsigned int s_start[4];
+
+            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-            if (y >= src.rows)
-                return;
+            // Zero every row counter before any row is skipped: thread (0,0) later sums all
+            // blockDim.y counters, and all threads of the block have to reach the
+            // __syncthreads() calls below, so rows past the image must not return early.
+            if (threadIdx.x == 0)
+                s_qsize[threadIdx.y] = 0;
 
-            volatile int qindex = -1;
-            __shared__ volatile int s_qindex[4];
-            __shared__ volatile int s_qstart[4];
-            s_qindex[threadIdx.y] = -1;
+            __syncthreads();
 
-            __shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD];
-
-            // fill the queue
-            for (int i = 0; i < PIXELS_PER_THREAD; ++i)
-            {
-                const int xx = i * blockDim.x + x;
-
-                if (xx >= src.cols)
-                    break;
-
-                if (src(y, xx))
-                {
-                    const unsigned int queue_val = (y << 16) | xx;
-
-                    do {
-                        qindex++;
-                        s_qindex[threadIdx.y] = qindex;
-                        s_queue[threadIdx.y][qindex] = queue_val;
-                    } while (s_queue[threadIdx.y][qindex] != queue_val);
-                }
-
-                // reload index from smem (last thread to write to smem will have updated it)
-                qindex = s_qindex[threadIdx.y];
-            }
+            if (y < src.rows)
+            {
+                // fill the queue
+                for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
+                {
+                    if (src(y, xx))
+                    {
+                        const unsigned int val = (y << 16) | xx;
+                        // the value returned by the atomic increment is this thread's queue slot
+                        const unsigned int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1));
+                        s_queues[threadIdx.y][qidx] = val;
+                    }
+                }
+            }
 
             __syncthreads();
@@ -96,31 +95,28 @@ namespace cv { namespace gpu { namespace device
             if (threadIdx.x == 0 && threadIdx.y == 0)
             {
                 // find how many items are stored in each list
-                int total_index = 0;
-                #pragma unroll
-                for (int i = 0; i < 4; ++i)
+                unsigned int total_size = 0;
+                for (unsigned int i = 0; i < blockDim.y; ++i)
                 {
-                    s_qstart[i] = total_index;
-                    total_index += (s_qindex[i] + 1u);
+                    s_start[i] = total_size;
+                    total_size += s_qsize[i];
                 }
 
                 //calculate the offset in the global list
-                const unsigned int global_offset = atomicAdd(&g_counter, total_index);
-                #pragma unroll
-                for (int i = 0; i < 4; ++i)
-                    s_qstart[i] += global_offset;
+                const unsigned int global_offset = atomicAdd(&g_counter, total_size);
+                for (unsigned int i = 0; i < blockDim.y; ++i)
+                    s_start[i] += global_offset;
             }
 
             __syncthreads();
 
             // copy local queues to global queue
-            for(int i = 0; i <= qindex; i += 32)
+            // rows past the end of the image have an empty queue and copy nothing
+            const unsigned int qsize = s_qsize[threadIdx.y];
+            for (unsigned int i = threadIdx.x; i < qsize; i += blockDim.x)
             {
-                if(i + threadIdx.x > qindex)
-                    break;
-
-                unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x];
-                list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
+                unsigned int val = s_queues[threadIdx.y][i];
+                list[s_start[threadIdx.y] + i] = val;
             }
         }
 
diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp
index f4d4399d26..e78637c69e 100644
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@@ -61,8 +61,11 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
     CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
 
     ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf);
 
+    // Gather the non-zero pixels of src into buf on the device. Presumably count is the
+    // number of points written and each entry packs a point as (y << 16) | x, matching the
+    // buildPointList kernel -- confirm against buildPointList_gpu.
     unsigned int count = buildPointList_gpu(src, buf.ptr<unsigned int>());
 
     const int numangle = cvRound(CV_PI / theta);
     const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
@@ -70,7 +73,9 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
     ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
     accum.setTo(cv::Scalar::all(0));
 
-    linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta);
+    // Nothing to accumulate when no points were found; accum stays all zeros.
+    if (count > 0)
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta);
 }
 
 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)