From c8d8b1fbcd612d7044ea9c94c5bf3f6cd53cdaef Mon Sep 17 00:00:00 2001
From: Ya-Chiu Wu <yacwu@cs.nctu.edu.tw>
Date: Fri, 2 Feb 2018 18:20:46 +0800
Subject: [PATCH] Merge pull request #10553 from GlueCrow:bgfg_knn_opencl

Add ocl version BackgroundSubtractorKNN (#10553)

* Add ocl version bgfg_knn

* Add ocl KNN perf test

* ocl KNN: Avoid unnecessary initializing when non-UMat parameters are used

* video: turn off OpenCL for color KNN on Intel devices

due performance degradation

* video: turn off KNN OpenCL on Apple devices with Intel iGPU

due process freeze during clBuildProgram() call
---
 modules/video/perf/opencl/perf_bgfg_knn.cpp |  95 +++++++
 modules/video/src/bgfg_KNN.cpp              | 271 +++++++++++++++++---
 modules/video/src/opencl/bgfg_knn.cl        | 248 ++++++++++++++++++
 3 files changed, 580 insertions(+), 34 deletions(-)
 create mode 100644 modules/video/perf/opencl/perf_bgfg_knn.cpp
 create mode 100644 modules/video/src/opencl/bgfg_knn.cl
diff --git a/modules/video/perf/opencl/perf_bgfg_knn.cpp b/modules/video/perf/opencl/perf_bgfg_knn.cpp
new file mode 100644
index 0000000000..30419af422
--- /dev/null
+++ b/modules/video/perf/opencl/perf_bgfg_knn.cpp
@@ -0,0 +1,95 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+#ifdef HAVE_VIDEO_INPUT
+#include "../perf_bgfg_utils.hpp"
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////// KNN//////////////////////////
+
+typedef tuple<string, int> VideoKNNParamType;
+typedef TestBaseWithParam<VideoKNNParamType> KNN_Apply;
+typedef TestBaseWithParam<VideoKNNParamType> KNN_GetBackgroundImage;
+
+using namespace opencv_test;
+
+OCL_PERF_TEST_P(KNN_Apply, KNN, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), Values(1,3)))
+{
+    VideoKNNParamType params = GetParam();
+
+    const string inputFile = getDataPath(get<0>(params));
+
+    const int cn = get<1>(params);
+    int nFrame = 5;
+
+    vector<Mat> frame_buffer(nFrame);
+
+    cv::VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+    prepareData(cap, cn, frame_buffer);
+
+    UMat u_foreground;
+
+    OCL_TEST_CYCLE()
+    {
+        Ptr<cv::BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN();
+        knn->setDetectShadows(false);
+        u_foreground.release();
+        for (int i = 0; i < nFrame; i++)
+        {
+            knn->apply(frame_buffer[i], u_foreground);
+        }
+    }
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST_P(KNN_GetBackgroundImage, KNN, Values(
+        std::make_pair<string, int>("gpu/video/768x576.avi", 5),
+        std::make_pair<string, int>("gpu/video/1920x1080.avi", 5)))
+{
+    VideoKNNParamType params = GetParam();
+
+    const string inputFile = getDataPath(get<0>(params));
+
+    const int cn = 3;
+    const int skipFrames = get<1>(params);
+    int nFrame = 10;
+
+    vector<Mat> frame_buffer(nFrame);
+
+    cv::VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+    prepareData(cap, cn, frame_buffer, skipFrames);
+
+    UMat u_foreground, u_background;
+
+    OCL_TEST_CYCLE()
+    {
+        Ptr<cv::BackgroundSubtractorKNN> knn = createBackgroundSubtractorKNN();
+        knn->setDetectShadows(false);
+        u_foreground.release();
+        u_background.release();
+        for (int i = 0; i < nFrame; i++)
+        {
+            knn->apply(frame_buffer[i], u_foreground);
+        }
+        knn->getBackgroundImage(u_background);
+    }
+#ifdef DEBUG_BGFG
+    imwrite(format("fg_%d_%d_knn_ocl.png", frame_buffer[0].rows, cn), u_foreground.getMat(ACCESS_READ));
+    imwrite(format("bg_%d_%d_knn_ocl.png", frame_buffer[0].rows, cn), u_background.getMat(ACCESS_READ));
+#endif
+    SANITY_CHECK_NOTHING();
+}
+
+}}// namespace cvtest::ocl
+
+#endif
+#endif
diff --git a/modules/video/src/bgfg_KNN.cpp b/modules/video/src/bgfg_KNN.cpp
index 0f0ff12958..006d3bd1c9 100755
--- a/modules/video/src/bgfg_KNN.cpp
+++ b/modules/video/src/bgfg_KNN.cpp
@@ -42,6 +42,7 @@
 //#include <math.h>
 
 #include "precomp.hpp"
+#include "opencl_kernels_video.hpp"
 
 namespace cv
 {
@@ -92,6 +93,9 @@ public:
     nLongCounter = 0;
     nMidCounter = 0;
     nShortCounter = 0;
+#ifdef HAVE_OPENCL
+    opencl_ON = true;
+#endif
     }
     //! the full constructor that takes the length of the history,
     // the number of gaussian mixtures, the background ratio parameter and the noise strength
@@ -119,6 +123,9 @@ public:
     nLongCounter = 0;
     nMidCounter = 0;
     nShortCounter = 0;
+#ifdef HAVE_OPENCL
+    opencl_ON = true;
+#endif
     }
     //! the destructor
     ~BackgroundSubtractorKNNImpl() {}
@@ -131,40 +138,80 @@ public:
     //! re-initialization method
     void initialize(Size _frameSize, int _frameType)
     {
-    frameSize = _frameSize;
-    frameType = _frameType;
-    nframes = 0;
+        frameSize = _frameSize;
+        frameType = _frameType;
+        nframes = 0;
 
-    int nchannels = CV_MAT_CN(frameType);
-    CV_Assert( nchannels <= CV_CN_MAX );
+        int nchannels = CV_MAT_CN(frameType);
+        CV_Assert( nchannels <= CV_CN_MAX );
 
-    // Reserve memory for the model
-    int size=frameSize.height*frameSize.width;
-    // for each sample of 3 speed pixel models each pixel bg model we store ...
-    // values + flag (nchannels+1 values)
-    bgmodel.create( 1,(nN * 3) * (nchannels+1)* size,CV_8U);
-    bgmodel = Scalar::all(0);
+        // Reserve memory for the model
+        int size=frameSize.height*frameSize.width;
+        //Reset counters
+        nShortCounter = 0;
+        nMidCounter = 0;
+        nLongCounter = 0;
 
-    //index through the three circular lists
-    aModelIndexShort.create(1,size,CV_8U);
-    aModelIndexMid.create(1,size,CV_8U);
-    aModelIndexLong.create(1,size,CV_8U);
-    //when to update next
-    nNextShortUpdate.create(1,size,CV_8U);
-    nNextMidUpdate.create(1,size,CV_8U);
-    nNextLongUpdate.create(1,size,CV_8U);
+#ifdef HAVE_OPENCL
+        if (ocl::isOpenCLActivated() && opencl_ON)
+        {
+            create_ocl_apply_kernel();
 
-    //Reset counters
-    nShortCounter = 0;
-    nMidCounter = 0;
-    nLongCounter = 0;
+            kernel_getBg.create("getBackgroundImage2_kernel", ocl::video::bgfg_knn_oclsrc, format( "-D CN=%d -D NSAMPLES=%d", nchannels, nN));
 
-    aModelIndexShort = Scalar::all(0);//random? //((m_nN)*rand())/(RAND_MAX+1);//0...m_nN-1
-    aModelIndexMid = Scalar::all(0);
-    aModelIndexLong = Scalar::all(0);
-    nNextShortUpdate = Scalar::all(0);
-    nNextMidUpdate = Scalar::all(0);
-    nNextLongUpdate = Scalar::all(0);
+            if (kernel_apply.empty() || kernel_getBg.empty())
+                opencl_ON = false;
+        }
+        else opencl_ON = false;
+
+        if (opencl_ON)
+        {
+            u_flag.create(frameSize.height * nN * 3, frameSize.width, CV_8UC1);
+            u_flag.setTo(Scalar::all(0));
+
+            if (nchannels==3)
+                nchannels=4;
+            u_sample.create(frameSize.height * nN * 3, frameSize.width, CV_32FC(nchannels));
+            u_sample.setTo(Scalar::all(0));
+
+            u_aModelIndexShort.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_aModelIndexShort.setTo(Scalar::all(0));
+            u_aModelIndexMid.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_aModelIndexMid.setTo(Scalar::all(0));
+            u_aModelIndexLong.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_aModelIndexLong.setTo(Scalar::all(0));
+
+            u_nNextShortUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_nNextShortUpdate.setTo(Scalar::all(0));
+            u_nNextMidUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_nNextMidUpdate.setTo(Scalar::all(0));
+            u_nNextLongUpdate.create(frameSize.height, frameSize.width, CV_8UC1);
+            u_nNextLongUpdate.setTo(Scalar::all(0));
+        }
+        else
+#endif
+        {
+            // for each sample of 3 speed pixel models each pixel bg model we store ...
+            // values + flag (nchannels+1 values)
+            bgmodel.create( 1,(nN * 3) * (nchannels+1)* size,CV_8U);
+            bgmodel = Scalar::all(0);
+
+            //index through the three circular lists
+            aModelIndexShort.create(1,size,CV_8U);
+            aModelIndexMid.create(1,size,CV_8U);
+            aModelIndexLong.create(1,size,CV_8U);
+            //when to update next
+            nNextShortUpdate.create(1,size,CV_8U);
+            nNextMidUpdate.create(1,size,CV_8U);
+            nNextLongUpdate.create(1,size,CV_8U);
+
+            aModelIndexShort = Scalar::all(0);//random? //((m_nN)*rand())/(RAND_MAX+1);//0...m_nN-1
+            aModelIndexMid = Scalar::all(0);
+            aModelIndexLong = Scalar::all(0);
+            nNextShortUpdate = Scalar::all(0);
+            nNextMidUpdate = Scalar::all(0);
+            nNextLongUpdate = Scalar::all(0);
+        }
     }
 
     virtual int getHistory() const { return history; }
@@ -180,7 +227,19 @@ public:
     virtual void setDist2Threshold(double _dist2Threshold) { fTb = (float)_dist2Threshold; }
 
     virtual bool getDetectShadows() const { return bShadowDetection; }
-    virtual void setDetectShadows(bool detectshadows) { bShadowDetection = detectshadows; }
+    virtual void setDetectShadows(bool detectshadows)
+    {
+        if ((bShadowDetection && detectshadows) || (!bShadowDetection && !detectshadows))
+            return;
+        bShadowDetection = detectshadows;
+#ifdef HAVE_OPENCL
+        if (!kernel_apply.empty())
+        {
+            create_ocl_apply_kernel();
+            CV_Assert( !kernel_apply.empty() );
+        }
+#endif
+    }
 
     virtual int getShadowValue() const { return nShadowDetection; }
     virtual void setShadowValue(int value) { nShadowDetection = (uchar)value; }
@@ -256,7 +315,29 @@ protected:
     Mat nNextMidUpdate;
     Mat nNextLongUpdate;
 
+#ifdef HAVE_OPENCL
+    mutable bool opencl_ON;
+
+    UMat u_flag;
+    UMat u_sample;
+    UMat u_aModelIndexShort;
+    UMat u_aModelIndexMid;
+    UMat u_aModelIndexLong;
+    UMat u_nNextShortUpdate;
+    UMat u_nNextMidUpdate;
+    UMat u_nNextLongUpdate;
+
+    mutable ocl::Kernel kernel_apply;
+    mutable ocl::Kernel kernel_getBg;
+#endif
+
     String name_;
+
+#ifdef HAVE_OPENCL
+    bool ocl_getBackgroundImage(OutputArray backgroundImage) const;
+    bool ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate=-1);
+    void create_ocl_apply_kernel();
+#endif
 };
 
 CV_INLINE void
@@ -328,7 +409,6 @@ CV_INLINE int
     include=0;//do we include this pixel into background model?
 
     int ndata=nchannels+1;
-//	float k;
     // now increase the probability for each pixel
     for (int n = 0; n < m_nN*3; n++)
     {
@@ -546,18 +626,132 @@ public:
     uchar m_nShadowDetection;
 };
 
+#ifdef HAVE_OPENCL
+bool BackgroundSubtractorKNNImpl::ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate)
+{
+    bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
 
+    if( needToInitialize )
+        initialize(_image.size(), _image.type());
+
+    ++nframes;
+    learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
+    CV_Assert(learningRate >= 0);
+
+    _fgmask.create(_image.size(), CV_8U);
+    UMat fgmask = _fgmask.getUMat();
+
+    UMat frame = _image.getUMat();
+
+    //recalculate update rates - in case alpha is changed
+    // calculate update parameters (using alpha)
+    int Kshort,Kmid,Klong;
+    //approximate exponential learning curve
+    Kshort=(int)(log(0.7)/log(1-learningRate))+1;//Kshort
+    Kmid=(int)(log(0.4)/log(1-learningRate))-Kshort+1;//Kmid
+    Klong=(int)(log(0.1)/log(1-learningRate))-Kshort-Kmid+1;//Klong
+
+    //refresh rates
+    int nShortUpdate = (Kshort/nN)+1;
+    int nMidUpdate = (Kmid/nN)+1;
+    int nLongUpdate = (Klong/nN)+1;
+
+    int idxArg = 0;
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadOnly(frame));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextLongUpdate));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextMidUpdate));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadOnly(u_nNextShortUpdate));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexLong));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexMid));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_aModelIndexShort));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_flag));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::PtrReadWrite(u_sample));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(fgmask));
+
+    idxArg = kernel_apply.set(idxArg, nLongCounter);
+    idxArg = kernel_apply.set(idxArg, nMidCounter);
+    idxArg = kernel_apply.set(idxArg, nShortCounter);
+    idxArg = kernel_apply.set(idxArg, fTb);
+    idxArg = kernel_apply.set(idxArg, nkNN);
+    idxArg = kernel_apply.set(idxArg, fTau);
+    if (bShadowDetection)
+        kernel_apply.set(idxArg, nShadowDetection);
+
+    size_t globalsize[2] = {(size_t)frame.cols, (size_t)frame.rows};
+    if(!kernel_apply.run(2, globalsize, NULL, true))
+        return false;
+
+    nShortCounter++;//0,1,...,nShortUpdate-1
+    nMidCounter++;
+    nLongCounter++;
+    if (nShortCounter >= nShortUpdate)
+    {
+        nShortCounter = 0;
+        randu(u_nNextShortUpdate, Scalar::all(0),  Scalar::all(nShortUpdate));
+    }
+    if (nMidCounter >= nMidUpdate)
+    {
+        nMidCounter = 0;
+        randu(u_nNextMidUpdate, Scalar::all(0),  Scalar::all(nMidUpdate));
+    }
+    if (nLongCounter >= nLongUpdate)
+    {
+        nLongCounter = 0;
+        randu(u_nNextLongUpdate, Scalar::all(0),  Scalar::all(nLongUpdate));
+    }
+    return true;
+}
+
+bool BackgroundSubtractorKNNImpl::ocl_getBackgroundImage(OutputArray _backgroundImage) const
+{
+    _backgroundImage.create(frameSize, frameType);
+    UMat dst = _backgroundImage.getUMat();
+
+    int idxArg = 0;
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_flag));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::PtrReadOnly(u_sample));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::WriteOnly(dst));
+
+    size_t globalsize[2] = {(size_t)dst.cols, (size_t)dst.rows};
+
+    return kernel_getBg.run(2, globalsize, NULL, false);
+}
+
+void BackgroundSubtractorKNNImpl::create_ocl_apply_kernel()
+{
+    int nchannels = CV_MAT_CN(frameType);
+    String opts = format("-D CN=%d -D NSAMPLES=%d%s", nchannels, nN, bShadowDetection ? " -D SHADOW_DETECT" : "");
+    kernel_apply.create("knn_kernel", ocl::video::bgfg_knn_oclsrc, opts);
+}
+
+#endif
 
 void BackgroundSubtractorKNNImpl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
 {
     CV_INSTRUMENT_REGION()
 
-    Mat image = _image.getMat();
-    bool needToInitialize = nframes == 0 || learningRate >= 1 || image.size() != frameSize || image.type() != frameType;
+#ifdef HAVE_OPENCL
+    if (opencl_ON)
+    {
+#ifndef __APPLE__
+        CV_OCL_RUN(_fgmask.isUMat() && OCL_PERFORMANCE_CHECK(!ocl::Device::getDefault().isIntel() || _image.channels() == 1),
+                   ocl_apply(_image, _fgmask, learningRate))
+#else
+        CV_OCL_RUN(_fgmask.isUMat() && OCL_PERFORMANCE_CHECK(!ocl::Device::getDefault().isIntel()),
+                   ocl_apply(_image, _fgmask, learningRate))
+#endif
+
+        opencl_ON = false;
+        nframes = 0;
+    }
+#endif
+
+    bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
 
     if( needToInitialize )
-        initialize(image.size(), image.type());
+        initialize(_image.size(), _image.type());
 
+    Mat image = _image.getMat();
     _fgmask.create( image.size(), CV_8U );
     Mat fgmask = _fgmask.getMat();
 
@@ -622,6 +816,15 @@ void BackgroundSubtractorKNNImpl::getBackgroundImage(OutputArray backgroundImage
 {
     CV_INSTRUMENT_REGION()
 
+#ifdef HAVE_OPENCL
+    if (opencl_ON)
+    {
+        CV_OCL_RUN(opencl_ON, ocl_getBackgroundImage(backgroundImage))
+
+        opencl_ON = false;
+    }
+#endif
+
     int nchannels = CV_MAT_CN(frameType);
     //CV_Assert( nchannels == 3 );
     Mat meanBackground(frameSize, CV_8UC3, Scalar::all(0));
diff --git a/modules/video/src/opencl/bgfg_knn.cl b/modules/video/src/opencl/bgfg_knn.cl
new file mode 100644
index 0000000000..0205dba76c
--- /dev/null
+++ b/modules/video/src/opencl/bgfg_knn.cl
@@ -0,0 +1,248 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2018 Ya-Chiu Wu, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Ya-Chiu Wu, yacwu@cs.nctu.edu.tw
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if CN==1
+
+#define T_MEAN float
+#define F_ZERO (0.0f)
+
+#define frameToMean(a, b) (b) = *(a);
+#define meanToFrame(a, b) *b = convert_uchar_sat(a);
+
+#else
+
+#define T_MEAN float4
+#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
+
+#define meanToFrame(a, b)\
+    b[0] = convert_uchar_sat(a.x); \
+    b[1] = convert_uchar_sat(a.y); \
+    b[2] = convert_uchar_sat(a.z);
+
+#define frameToMean(a, b)\
+    b.x = a[0]; \
+    b.y = a[1]; \
+    b.z = a[2]; \
+    b.w = 0.0f;
+
+#endif
+
+__kernel void knn_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,
+                         __global const uchar* nNextLongUpdate,
+                         __global const uchar* nNextMidUpdate,
+                         __global const uchar* nNextShortUpdate,
+                         __global uchar* aModelIndexLong,
+                         __global uchar* aModelIndexMid,
+                         __global uchar* aModelIndexShort,
+                         __global uchar* flag,
+                         __global uchar* sample,
+                         __global uchar* fgmask, int fgmask_step, int fgmask_offset,
+                         int nLongCounter, int nMidCounter, int nShortCounter,
+                         float c_Tb, int c_nkNN, float c_tau
+#ifdef SHADOW_DETECT
+                         , uchar c_shadowVal
+#endif
+                         )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if( x < frame_col && y < frame_row)
+    {
+        __global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));
+        T_MEAN pix;
+        frameToMean(_frame, pix);
+
+        uchar foreground = 255; // 0 - the pixel classified as background
+
+        int Pbf = 0;
+        int Pb = 0;
+        uchar include = 0;
+
+        int pt_idx =  mad24(y, frame_col, x);
+        int idx_step = frame_row * frame_col;
+
+        __global T_MEAN* _sample = (__global T_MEAN*)(sample);
+
+        for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
+        {
+            int n_idx = mad24(n, idx_step, pt_idx);
+
+            T_MEAN c_mean = _sample[n_idx];
+
+            uchar c_flag = flag[n_idx];
+
+            T_MEAN diff = c_mean - pix;
+            float dist2 = dot(diff, diff);
+
+            if (dist2 < c_Tb)
+            {
+                Pbf++;
+                if (c_flag)
+                {
+                    Pb++;
+                    if (Pb >= c_nkNN)
+                    {
+                        include = 1;
+                        foreground = 0;
+                        break;
+                    }
+                }
+            }
+        }
+        if (Pbf >= c_nkNN)
+        {
+            include = 1;
+        }
+
+#ifdef SHADOW_DETECT
+        if (foreground)
+        {
+            int Ps = 0;
+            for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
+            {
+                int n_idx = mad24(n, idx_step, pt_idx);
+                uchar c_flag = flag[n_idx];
+
+                if (c_flag)
+                {
+                    T_MEAN c_mean = _sample[n_idx];
+
+                    float numerator = dot(pix, c_mean);
+                    float denominator = dot(c_mean, c_mean);
+
+                    if (denominator == 0)
+                        break;
+
+                    if (numerator <= denominator && numerator >= c_tau * denominator)
+                    {
+                        float a = numerator / denominator;
+
+                        T_MEAN dD = mad(a, c_mean, -pix);
+
+                        if (dot(dD, dD) < c_Tb * a * a)
+                        {
+                            Ps++;
+                            if (Ps >= c_nkNN)
+                            {
+                                foreground = c_shadowVal;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+#endif
+        __global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);
+        *_fgmask = (uchar)foreground;
+
+        __global const uchar* _nNextLongUpdate = nNextLongUpdate + pt_idx;
+        __global const uchar* _nNextMidUpdate = nNextMidUpdate + pt_idx;
+        __global const uchar* _nNextShortUpdate = nNextShortUpdate + pt_idx;
+        __global uchar* _aModelIndexLong = aModelIndexLong + pt_idx;
+        __global uchar* _aModelIndexMid = aModelIndexMid + pt_idx;
+        __global uchar* _aModelIndexShort = aModelIndexShort + pt_idx;
+
+        uchar nextLongUpdate = _nNextLongUpdate[0];
+        uchar nextMidUpdate = _nNextMidUpdate[0];
+        uchar nextShortUpdate = _nNextShortUpdate[0];
+        uchar modelIndexLong = _aModelIndexLong[0];
+        uchar modelIndexMid = _aModelIndexMid[0];
+        uchar modelIndexShort = _aModelIndexShort[0];
+        int offsetLong = mad24(mad24(2, (NSAMPLES), modelIndexLong), idx_step, pt_idx);
+        int offsetMid = mad24((NSAMPLES)+modelIndexMid, idx_step, pt_idx);
+        int offsetShort = mad24(modelIndexShort, idx_step, pt_idx);
+        if (nextLongUpdate == nLongCounter)
+        {
+            _sample[offsetLong] = _sample[offsetMid];
+            flag[offsetLong] = flag[offsetMid];
+            _aModelIndexLong[0] = (modelIndexLong >= ((NSAMPLES)-1)) ? 0 : (modelIndexLong + 1);
+        }
+
+        if (nextMidUpdate == nMidCounter)
+        {
+            _sample[offsetMid] = _sample[offsetShort];
+            flag[offsetMid] = flag[offsetShort];
+            _aModelIndexMid[0] = (modelIndexMid >= ((NSAMPLES)-1)) ? 0 : (modelIndexMid + 1);
+        }
+
+        if (nextShortUpdate == nShortCounter)
+        {
+            _sample[offsetShort] = pix;
+            flag[offsetShort] = include;
+            _aModelIndexShort[0] = (modelIndexShort >= ((NSAMPLES)-1)) ? 0 : (modelIndexShort + 1);
+        }
+    }
+}
+
+__kernel void getBackgroundImage2_kernel(__global const uchar* flag,
+                                         __global const uchar* sample,
+                                         __global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < dst_col && y < dst_row)
+    {
+        int pt_idx =  mad24(y, dst_col, x);
+
+        T_MEAN meanVal = (T_MEAN)F_ZERO;
+
+        __global T_MEAN* _sample = (__global T_MEAN*)(sample);
+        int idx_step = dst_row * dst_col;
+        for (uchar n = 0; n < (NSAMPLES) * 3 ; ++n)
+        {
+            int n_idx = mad24(n, idx_step, pt_idx);
+            uchar c_flag = flag[n_idx];
+            if(c_flag)
+            {
+                meanVal = _sample[n_idx];
+                break;
+            }
+        }
+        __global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));
+        meanToFrame(meanVal, _dst);
+    }
+}