diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index db228a69b8..8362282b0d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1552,12 +1552,14 @@ public:
         enum {PEDESTRIAN = 0};
     };
 
+    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT}; // presumably the values for rejCriteria (confirm); DEFAULT aliases NO_REJECT
+
     // An empty cascade will be created.
     // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
     // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
     // Param scales is a number of scales from minScale to maxScale.
     // Param rejfactor is used for NMS.
-    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejCriteria = DEFAULT); // DEFAULT == NO_REJECT == 1, same default as before
 
     virtual ~SCascade();
 
@@ -1595,7 +1597,7 @@ private:
     double maxScale;
 
     int scales;
-    int rejfactor;
+    int rejCriteria;
 };
 
 ////////////////////////////////// SURF //////////////////////////////////////////
diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index 27d60e6372..5334441d8d 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -41,9 +41,10 @@
 //M*/
 
 #include <opencv2/gpu/device/common.hpp>
+
 #include <icf.hpp>
-#include <stdio.h>
 #include <float.h>
+#include <stdio.h>
 
 namespace cv { namespace gpu { namespace device {
 namespace icf {
@@ -79,6 +80,70 @@ namespace icf {
         }
     }
 
+    // Area of the intersection of rectangles a and b; 0 when the extent is negative on either axis.
+    __device__ __forceinline__ float overlapArea(const Detection &a, const Detection &b)
+    {
+        const int dx = ::min(a.x + a.w, b.x + b.w) - ::max(a.x, b.x);
+        const int dy = ::min(a.y + a.h, b.y + b.h) - ::max(a.y, b.y);
+        return (dx >= 0 && dy >= 0)? (float)(dx * dy) : 0.f;
+    }
+
+    // Pairwise suppression: each thread handles detections i = threadIdx.x, threadIdx.x + blockDim.x, ...
+    // and sets overlaps[k] = 1 for the less confident box k of every pair (i, j > i) with ratio > 0.65.
+    __global__ void overlap(const uint* n, const Detection* detections, uchar* overlaps)
+    {
+        const int total = *n; // number of detections, read through the pointer n
+        for (int i = threadIdx.x; i < total; i += blockDim.x) // stride follows the launch instead of a fixed 192
+        {
+            const Detection& a = detections[i];
+            bool excluded = false; // set once detection i itself has been flagged
+            for (int j = i + 1; j < total; ++j)
+            {
+                const Detection& b = detections[j];
+                // intersection area relative to the smaller of the two boxes
+                float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h);
+                if (ovl > 0.65f)
+                {
+                    int suppressed = (a.confidence > b.confidence)? j : i;
+                    overlaps[suppressed] = 1;
+                    excluded = excluded || (suppressed == i);
+                }
+            #if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120)
+                if (__all(excluded)) break; // warp vote (sm_12+): leave early once every active lane is excluded
+            #endif
+            }
+        }
+    }
+
+    // Stub (presumably meant to gather survivors): visits indices threadIdx.x, +192, ... and writes nothing.
+    __global__ void collect(const uint* n, const Detection* detections, uchar* overlaps)
+    {
+        const int count = *n;
+
+        for (int pos = threadIdx.x; pos < count; pos += 192)
+        {
+            if (overlaps[pos]) continue; // non-zero flag: skip this detection
+
+            // unflagged detections are only bound to a reference; the debug print stays disabled
+            const Detection& det = detections[pos];
+            // printf("%d: %d %d %d %d %f\n", pos, det.x, det.y, det.w, det.h, det.confidence );
+        }
+    }
+
+    // Host launcher: runs overlap() and then collect() as one block of 192 threads on the
+    // default stream, checks for errors and waits for the device to finish.
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections)
+    {
+        const int blocks = 1, threads = 192;
+        uint* count = (uint*)ndetections.ptr(0);       // first row holds the detection counter
+        Detection* dets = (Detection*)objects.ptr(0);  // first row reinterpreted as Detection records
+        uchar* flags = (uchar*)overlaps.ptr(0);
+        overlap<<<blocks, threads>>>(count, dets, flags);
+        collect<<<blocks, threads>>>(count, dets, flags);
+        cudaSafeCall( cudaGetLastError());             // the disabled guard here was: if (!stream)
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
+
     template<typename Policy>
     struct PrefixSum
     {
diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp
index f25bc2ceb0..773a8b64e9 100644
--- a/modules/gpu/src/gpu_init.cpp
+++ b/modules/gpu/src/gpu_init.cpp
@@ -46,10 +46,10 @@ namespace cv { namespace gpu
 {
 
 CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
-                  obj.info()->addParam(obj, "minScale",  obj.minScale);
-                  obj.info()->addParam(obj, "maxScale",  obj.maxScale);
-                  obj.info()->addParam(obj, "scales",    obj.scales);
-                  obj.info()->addParam(obj, "rejfactor", obj.rejfactor));
+                  obj.info()->addParam(obj, "minScale",    obj.minScale);    // each addParam registers one SCascade field under the quoted name
+                  obj.info()->addParam(obj, "maxScale",    obj.maxScale);
+                  obj.info()->addParam(obj, "scales",      obj.scales);
+                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria)); // registered name changes with the field (was "rejfactor")
 
 bool initModule_gpu(void)
 {
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index d5a8e84814..35bd72e552 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -85,6 +85,8 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
+    // Forward declaration of the suppression launcher defined in cuda/icf-sc.cu (takes no stream argument).
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections);
 }
 
 namespace imgproc {
@@ -309,6 +311,8 @@ struct cv::gpu::SCascade::Fields
         hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));
 
+        overlaps.create(1, 5000, CV_8UC1);
+
         return true;
     }
 
@@ -437,7 +441,15 @@ private:
         }
     }
 
+    // Public interface: suppress() below is invoked from SCascade::detect().
 public:
+    // Sizes `overlaps` to objects.rows x objects.cols bytes, zeroes it, then runs the device suppression pass.
+    void suppress(GpuMat& ndetections, GpuMat& objects)
+    {
+        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
+        overlaps.setTo(0); // every flag starts cleared for this pass
+        device::icf::suppress(objects, overlaps, ndetections);
+    }
 
     // scales range
     float minScale;
@@ -469,6 +481,9 @@ public:
     // 161x121x10
     GpuMat hogluv;
 
+    // per-detection flags used for area overlap computing during suppression
+    GpuMat overlaps;
+
     // Cascade from xml
     GpuMat octaves;
     GpuMat stages;
@@ -478,6 +493,8 @@ public:
 
     GpuMat sobelBuf;
 
+    GpuMat collected;
+
     std::vector<device::icf::Octave> voctaves;
 
     DeviceInfo info;
@@ -494,7 +511,7 @@ public:
 };
 
 cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int rjf)
-: fields(0),  minScale(mins), maxScale(maxs), scales(sc), rejfactor(rjf) {}
+: fields(0),  minScale(mins), maxScale(maxs), scales(sc), rejCriteria(rjf) {} // fields starts null; the arguments are stored unchanged
 
 cv::gpu::SCascade::~SCascade() { delete fields; }
 
@@ -534,6 +551,9 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     flds.detect(rois, tmp, objects, stream);
+
+    // if (rejCriteria != NO_REJECT)
+    flds.suppress(tmp, objects);
 }
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const