From a5b5684670ebc8d5dffa6499949ab1a573c04b8b Mon Sep 17 00:00:00 2001
From: Jiri Horner <laeqten@gmail.com>
Date: Wed, 16 Aug 2017 18:46:11 +0200
Subject: [PATCH] Merge pull request #9330 from hrnr:akaze_ocl

[GSOC] Enable OCL for AKAZE (#9330)

* revert e0489cb - reenable OCL for AKAZE

* deal with conversion internally in AKAZE

* pass InputArray directly to AKAZE to allow distinguishing input Mat/UMat. deal with conversion there
* ensure that keypoints orientations are always computed. prevents misuse of internal AKAZE class.

* convert internal AKAZE functions to use InputArray/OutputArray

* make internal functions private in AKAZE

* split OCL and CPU paths in AKAZE

* create 2 separate pyramids, 1 for OCL and 1 for CPU
* template functions that use temporaries to always store them as correct type (UMat/Mat)

* remove variable used only in OCL path

causes unused variable warning

* update AKAZE documentation

* run ocl version only when ocl is enabled

* add tests for OCL path in AKAZE

* relax condition for keypoints angle
---
 .../features2d/include/opencv2/features2d.hpp |  21 +-
 modules/features2d/src/akaze.cpp              |  23 +-
 modules/features2d/src/kaze/AKAZEFeatures.cpp | 350 +++++++++++-------
 modules/features2d/src/kaze/AKAZEFeatures.h   |  50 ++-
 .../features2d/test/ocl/test_feature2d.cpp    |  72 ++++
 5 files changed, 341 insertions(+), 175 deletions(-)
 create mode 100644 modules/features2d/test/ocl/test_feature2d.cpp

diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index afd2477d29..de0aeb4818 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -658,13 +658,22 @@ public:
     CV_WRAP virtual int getDiffusivity() const = 0;
 };
 
-/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13 . :
+/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13.
 
-@note AKAZE descriptors can only be used with KAZE or AKAZE keypoints. Try to avoid using *extract*
-and *detect* instead of *operator()* due to performance reasons. .. [ANB13] Fast Explicit Diffusion
-for Accelerated Features in Nonlinear Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien
-Bartoli. In British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
- */
+@details AKAZE descriptors can only be used with KAZE or AKAZE keypoints. This class is thread-safe.
+
+@note When you need descriptors, use Feature2D::detectAndCompute, which
+provides better performance. When using Feature2D::detect followed by
+Feature2D::compute, the scale space pyramid is computed twice.
+
+@note AKAZE implements T-API. When image is passed as UMat some parts of the algorithm
+will use OpenCL.
+
+@note [ANB13] Fast Explicit Diffusion for Accelerated Features in Nonlinear
+Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In
+British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
+
+*/
 class CV_EXPORTS_W AKAZE : public Feature2D
 {
 public:
diff --git a/modules/features2d/src/akaze.cpp b/modules/features2d/src/akaze.cpp
index baca3c6677..63194bc690 100644
--- a/modules/features2d/src/akaze.cpp
+++ b/modules/features2d/src/akaze.cpp
@@ -169,38 +169,25 @@ namespace cv
         {
             CV_INSTRUMENT_REGION()
 
-            Mat img = image.getMat();
-            if (img.channels() > 1)
-                cvtColor(image, img, COLOR_BGR2GRAY);
-
-            Mat img1_32;
-            if ( img.depth() == CV_32F )
-                img1_32 = img;
-            else if ( img.depth() == CV_8U )
-                img.convertTo(img1_32, CV_32F, 1.0 / 255.0, 0);
-            else if ( img.depth() == CV_16U )
-                img.convertTo(img1_32, CV_32F, 1.0 / 65535.0, 0);
-
-            CV_Assert( ! img1_32.empty() );
+            CV_Assert( ! image.empty() );
 
             AKAZEOptions options;
             options.descriptor = descriptor;
             options.descriptor_channels = descriptor_channels;
             options.descriptor_size = descriptor_size;
-            options.img_width = img.cols;
-            options.img_height = img.rows;
+            options.img_width = image.cols();
+            options.img_height = image.rows();
             options.dthreshold = threshold;
             options.omax = octaves;
             options.nsublevels = sublevels;
             options.diffusivity = diffusivity;
 
             AKAZEFeatures impl(options);
-            impl.Create_Nonlinear_Scale_Space(img1_32);
+            impl.Create_Nonlinear_Scale_Space(image);
 
             if (!useProvidedKeypoints)
             {
                 impl.Feature_Detection(keypoints);
-                impl.Compute_Keypoints_Orientation(keypoints);
             }
 
             if (!mask.empty())
@@ -208,7 +195,7 @@ namespace cv
                 KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat());
             }
 
-            if( descriptors.needed() )
+            if(descriptors.needed())
             {
                 impl.Compute_Descriptors(keypoints, descriptors);
 
diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp
index 024a5cad2e..eda14e3db5 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -15,10 +15,6 @@
 
 #include <iostream>
 
-#ifdef HAVE_OPENCL // OpenCL is not well supported
-#undef HAVE_OPENCL
-#endif
-
 // Namespaces
 namespace cv
 {
@@ -75,7 +71,7 @@ void AKAZEFeatures::Allocate_Memory_Evolution(void) {
     }
 
     for (int j = 0; j < options_.nsublevels; j++) {
-      Evolution step;
+      MEvolution step;
       step.size = Size(level_width, level_height);
       step.esigma = options_.soffset*pow(2.f, (float)(j) / (float)(options_.nsublevels) + i);
       step.sigma_size = cvRound(step.esigma * options_.derivative_factor / power);  // In fact sigma_size only depends on j
@@ -257,39 +253,41 @@ private:
 static inline bool
 ocl_non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
 {
-    if (!Lt_.isContinuous())
-        return false;
+  if(!Lt_.isContinuous())
+    return false;
 
-    UMat Lt = Lt_.getUMat(), Lf = Lf_.getUMat(), Lstep = Lstep_.getUMat();
+  UMat Lt = Lt_.getUMat();
+  UMat Lf = Lf_.getUMat();
+  UMat Lstep = Lstep_.getUMat();
 
-    size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
+  size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
 
-    ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
-    if (ker.empty())
-        return false;
+  ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
+  if( ker.empty() )
+    return false;
 
-    return ker.args(
-            ocl::KernelArg::ReadOnly(Lt),
-            ocl::KernelArg::PtrReadOnly(Lf),
-            ocl::KernelArg::PtrWriteOnly(Lstep),
-            step_size)
-    .run(2, globalSize, 0, true);
+  return ker.args(
+    ocl::KernelArg::ReadOnly(Lt),
+    ocl::KernelArg::PtrReadOnly(Lf),
+    ocl::KernelArg::PtrWriteOnly(Lstep),
+    step_size).run(2, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL
 
 static inline void
-non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float step_size)
+non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
 {
   CV_INSTRUMENT_REGION()
 
-  Lstep.create(Lt.size(), Lt.type());
+  Lstep_.create(Lt_.size(), Lt_.type());
 
-#ifdef HAVE_OPENCL
-  CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lstep.isUMat()), ocl_non_linear_diffusion_step(Lt, Lf, Lstep, step_size));
-#endif
+  CV_OCL_RUN(Lt_.isUMat() && Lf_.isUMat() && Lstep_.isUMat(),
+    ocl_non_linear_diffusion_step(Lt_, Lf_, Lstep_, step_size));
 
-  Mat Mstep = Lstep.getMat();
-  parallel_for_(Range(0, Lt.rows()), NonLinearScalarDiffusionStep(Lt.getMat(), Lf.getMat(), Mstep, step_size));
+  Mat Lt = Lt_.getMat();
+  Mat Lf = Lf_.getMat();
+  Mat Lstep = Lstep_.getMat();
+  parallel_for_(Range(0, Lt.rows), NonLinearScalarDiffusionStep(Lt, Lf, Lstep, step_size));
 }
 
 /**
@@ -302,12 +300,15 @@ non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float
  * @return k contrast factor
  */
 static inline float
-compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)
+compute_kcontrast(InputArray Lx_, InputArray Ly_, float perc, int nbins)
 {
   CV_INSTRUMENT_REGION()
 
   CV_Assert(nbins > 2);
-  CV_Assert(!Lx.empty());
+  CV_Assert(!Lx_.empty());
+
+  Mat Lx = Lx_.getMat();
+  Mat Ly = Ly_.getMat();
 
   // temporary square roots of dot product
   Mat modgs (Lx.rows - 2, Lx.cols - 2, CV_32F);
@@ -356,21 +357,22 @@ compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)
 static inline bool
 ocl_pm_g2(InputArray Lx_, InputArray Ly_, OutputArray Lflow_, float kcontrast)
 {
-    UMat Lx = Lx_.getUMat(), Ly = Ly_.getUMat(), Lflow = Lflow_.getUMat();
+  UMat Lx = Lx_.getUMat();
+  UMat Ly = Ly_.getUMat();
+  UMat Lflow = Lflow_.getUMat();
 
-    int total = Lx.rows * Lx.cols;
-    size_t globalSize[] = {(size_t)total};
+  int total = Lx.rows * Lx.cols;
+  size_t globalSize[] = {(size_t)total};
 
-    ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
-    if (ker.empty())
-        return false;
+  ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
+  if( ker.empty() )
+    return false;
 
-    return ker.args(
-            ocl::KernelArg::PtrReadOnly(Lx),
-            ocl::KernelArg::PtrReadOnly(Ly),
-            ocl::KernelArg::PtrWriteOnly(Lflow),
-            kcontrast, total)
-    .run(1, globalSize, 0, true);
+  return ker.args(
+    ocl::KernelArg::PtrReadOnly(Lx),
+    ocl::KernelArg::PtrReadOnly(Ly),
+    ocl::KernelArg::PtrWriteOnly(Lflow),
+    kcontrast, total).run(1, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL
 
@@ -386,9 +388,7 @@ compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcont
       pm_g1(Lx, Ly, Lflow, kcontrast);
     break;
     case KAZE::DIFF_PM_G2:
-#ifdef HAVE_OPENCL
-      CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lflow.isUMat()), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
-#endif
+      CV_OCL_RUN(Lx.isUMat() && Ly.isUMat() && Lflow.isUMat(), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
       pm_g2(Lx, Ly, Lflow, kcontrast);
     break;
     case KAZE::DIFF_WEICKERT:
@@ -404,28 +404,54 @@ compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcont
 }
 
 /**
- * @brief This method creates the nonlinear scale space for a given image
- * @param img Input image for which the nonlinear scale space needs to be created
- * @return 0 if the nonlinear scale space was created successfully, -1 otherwise
+ * @brief Converts input image to grayscale float image
+ *
+ * @param image any image
+ * @param dst grayscale float image
  */
-void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
+static inline void prepareInputImage(InputArray image, OutputArray dst)
+{
+  Mat img = image.getMat();
+  if (img.channels() > 1)
+    cvtColor(image, img, COLOR_BGR2GRAY);
+
+  if ( img.depth() == CV_32F )
+    dst.assign(img);
+  else if ( img.depth() == CV_8U )
+    img.convertTo(dst, CV_32F, 1.0 / 255.0, 0);
+  else if ( img.depth() == CV_16U )
+    img.convertTo(dst, CV_32F, 1.0 / 65535.0, 0);
+}
+
+/**
+ * @brief This method creates the nonlinear scale space for a given image
+ * @param image Input image for which the nonlinear scale space needs to be created
+ */
+template<typename MatType>
+static inline void
+create_nonlinear_scale_space(InputArray image, const AKAZEOptions &options,
+  const std::vector<std::vector<float > > &tsteps_evolution, std::vector<Evolution<MatType> > &evolution)
 {
   CV_INSTRUMENT_REGION()
-  CV_Assert(evolution_.size() > 0);
+  CV_Assert(evolution.size() > 0);
+
+  // convert input to grayscale float image if needed
+  MatType img;
+  prepareInputImage(image, img);
 
   // create first level of the evolution
-  int ksize = getGaussianKernelSize(options_.soffset);
-  GaussianBlur(img, evolution_[0].Lsmooth, Size(ksize, ksize), options_.soffset, options_.soffset, BORDER_REPLICATE);
-  evolution_[0].Lsmooth.copyTo(evolution_[0].Lt);
+  int ksize = getGaussianKernelSize(options.soffset);
+  GaussianBlur(img, evolution[0].Lsmooth, Size(ksize, ksize), options.soffset, options.soffset, BORDER_REPLICATE);
+  evolution[0].Lsmooth.copyTo(evolution[0].Lt);
 
-  if (evolution_.size() == 1) {
+  if (evolution.size() == 1) {
     // we don't need to compute kcontrast factor
-    Compute_Determinant_Hessian_Response();
+    Compute_Determinant_Hessian_Response(evolution);
     return;
   }
 
   // derivatives, flow and diffusion step
-  Mat Lx, Ly, Lsmooth, Lflow, Lstep;
+  MatType Lx, Ly, Lsmooth, Lflow, Lstep;
 
   // compute derivatives for computing k contrast
   GaussianBlur(img, Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
@@ -433,19 +459,19 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
   Scharr(Lsmooth, Ly, CV_32F, 0, 1, 1, 0, BORDER_DEFAULT);
   Lsmooth.release();
   // compute the kcontrast factor
-  float kcontrast = compute_kcontrast(Lx, Ly, options_.kcontrast_percentile, options_.kcontrast_nbins);
+  float kcontrast = compute_kcontrast(Lx, Ly, options.kcontrast_percentile, options.kcontrast_nbins);
 
   // Now generate the rest of evolution levels
-  for (size_t i = 1; i < evolution_.size(); i++) {
-    Evolution &e = evolution_[i];
+  for (size_t i = 1; i < evolution.size(); i++) {
+    Evolution<MatType> &e = evolution[i];
 
-    if (e.octave > evolution_[i - 1].octave) {
+    if (e.octave > evolution[i - 1].octave) {
       // new octave will be half the size
-      resize(evolution_[i - 1].Lt, e.Lt, e.size, 0, 0, INTER_AREA);
+      resize(evolution[i - 1].Lt, e.Lt, e.size, 0, 0, INTER_AREA);
       kcontrast *= 0.75f;
     }
     else {
-      evolution_[i - 1].Lt.copyTo(e.Lt);
+      evolution[i - 1].Lt.copyTo(e.Lt);
     }
 
     GaussianBlur(e.Lt, e.Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
@@ -455,10 +481,10 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
     Scharr(e.Lsmooth, Ly, CV_32F, 0, 1, 1.0, 0, BORDER_DEFAULT);
 
     // Compute the conductivity equation
-    compute_diffusivity(Lx, Ly, Lflow, kcontrast, options_.diffusivity);
+    compute_diffusivity(Lx, Ly, Lflow, kcontrast, options.diffusivity);
 
     // Perform Fast Explicit Diffusion on Lt
-    std::vector<float> &tsteps = tsteps_[i - 1];
+    const std::vector<float> &tsteps = tsteps_evolution[i - 1];
     for (size_t j = 0; j < tsteps.size(); j++) {
       const float step_size = tsteps[j] * 0.5f;
       non_linear_diffusion_step(e.Lt, Lflow, Lstep, step_size);
@@ -466,31 +492,73 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
     }
   }
 
-  Compute_Determinant_Hessian_Response();
+  Compute_Determinant_Hessian_Response(evolution);
+
+  return;
+}
+
+/**
+ * @brief Converts between UMatPyramid and Pyramid and vice versa
+ * @details Matrices in evolution levels will be copied
+ *
+ * @param src source pyramid
+ * @param dst destination pyramid
+ */
+template<typename MatTypeSrc, typename MatTypeDst>
+static inline void
+convertScalePyramid(const std::vector<Evolution<MatTypeSrc> >& src, std::vector<Evolution<MatTypeDst> > &dst)
+{
+  dst.resize(src.size());
+  for (size_t i = 0; i < src.size(); ++i) {
+    dst[i] = Evolution<MatTypeDst>(src[i]);
+  }
+}
+
+/**
+ * @brief This method creates the nonlinear scale space for a given image
+ * @param image Input image for which the nonlinear scale space needs to be created
+ */
+void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray image)
+{
+  if (ocl::useOpenCL() && image.isUMat()) {
+    // will run OCL version of scale space pyramid
+    UMatPyramid uPyr;
+    // init UMat pyramid with sizes
+    convertScalePyramid(evolution_, uPyr);
+    create_nonlinear_scale_space(image, options_, tsteps_, uPyr);
+    // download pyramid from GPU
+    convertScalePyramid(uPyr, evolution_);
+  } else {
+    // CPU version
+    create_nonlinear_scale_space(image, options_, tsteps_, evolution_);
+  }
 }
 
 /* ************************************************************************* */
 
 #ifdef HAVE_OPENCL
 static inline bool
-ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, OutputArray Ldet_, float sigma)
+ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_,
+  OutputArray Ldet_, float sigma)
 {
-    UMat Lxx = Lxx_.getUMat(), Lxy = Lxy_.getUMat(), Lyy = Lyy_.getUMat(), Ldet = Ldet_.getUMat();
+  UMat Lxx = Lxx_.getUMat();
+  UMat Lxy = Lxy_.getUMat();
+  UMat Lyy = Lyy_.getUMat();
+  UMat Ldet = Ldet_.getUMat();
 
-    const int total = Lxx.rows * Lxx.cols;
-    size_t globalSize[] = {(size_t)total};
+  const int total = Lxx.rows * Lxx.cols;
+  size_t globalSize[] = {(size_t)total};
 
-    ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
-    if (ker.empty())
-        return false;
+  ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
+  if( ker.empty() )
+    return false;
 
-    return ker.args(
-            ocl::KernelArg::PtrReadOnly(Lxx),
-            ocl::KernelArg::PtrReadOnly(Lxy),
-            ocl::KernelArg::PtrReadOnly(Lyy),
-            ocl::KernelArg::PtrWriteOnly(Ldet),
-            sigma, total)
-    .run(1, globalSize, 0, true);
+  return ker.args(
+    ocl::KernelArg::PtrReadOnly(Lxx),
+    ocl::KernelArg::PtrReadOnly(Lxy),
+    ocl::KernelArg::PtrReadOnly(Lyy),
+    ocl::KernelArg::PtrWriteOnly(Ldet),
+    sigma, total).run(1, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL
 
@@ -504,47 +572,44 @@ ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, Outpu
  * @param Ldet output determinant
  * @param sigma determinant will be scaled by this sigma
  */
-static inline void compute_determinant(InputArray Lxx, InputArray Lxy, InputArray Lyy, OutputArray Ldet, float sigma)
+static inline void compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_,
+  OutputArray Ldet_, float sigma)
 {
-    CV_INSTRUMENT_REGION()
+  CV_INSTRUMENT_REGION()
 
-    Ldet.create(Lxx.size(), Lxx.type());
+  Ldet_.create(Lxx_.size(), Lxx_.type());
 
-#ifdef HAVE_OPENCL
-    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Ldet.isUMat()), ocl_compute_determinant(Lxx, Lxy, Lyy, Ldet, sigma));
-#endif
+  CV_OCL_RUN(Lxx_.isUMat() && Ldet_.isUMat(), ocl_compute_determinant(Lxx_, Lxy_, Lyy_, Ldet_, sigma));
+
+  // output determinant
+  Mat Lxx = Lxx_.getMat(), Lxy = Lxy_.getMat(), Lyy = Lyy_.getMat(), Ldet = Ldet_.getMat();
+  float *lxx = Lxx.ptr<float>();
+  float *lxy = Lxy.ptr<float>();
+  float *lyy = Lyy.ptr<float>();
+  float *ldet = Ldet.ptr<float>();
+  const int total = Lxx.cols * Lxx.rows;
+  for (int j = 0; j < total; j++) {
+    ldet[j] = (lxx[j] * lyy[j] - lxy[j] * lxy[j]) * sigma;
+  }
 
-    // output determinant
-    Mat Mxx = Lxx.getMat(), Mxy = Lxy.getMat(), Myy = Lyy.getMat(), Mdet = Ldet.getMat();
-    const int W = Mxx.cols, H = Mxx.rows;
-    for (int y = 0; y < H; y++)
-    {
-        float *lxx = Mxx.ptr<float>(y);
-        float *lxy = Mxy.ptr<float>(y);
-        float *lyy = Myy.ptr<float>(y);
-        float *ldet = Mdet.ptr<float>(y);
-        for (int x = 0; x < W; x++)
-        {
-            ldet[x] = (lxx[x] * lyy[x] - lxy[x] * lxy[x]) * sigma;
-        }
-    }
 }
 
+template <typename MatType>
 class DeterminantHessianResponse : public ParallelLoopBody
 {
 public:
-    explicit DeterminantHessianResponse(std::vector<Evolution>& ev)
+    explicit DeterminantHessianResponse(std::vector<Evolution<MatType> >& ev)
     : evolution_(&ev)
   {
   }
 
   void operator()(const Range& range) const
   {
-    Mat Lxx, Lxy, Lyy;
+    MatType Lxx, Lxy, Lyy;
 
     for (int i = range.start; i < range.end; i++)
     {
-      Evolution &e = (*evolution_)[i];
+      Evolution<MatType> &e = (*evolution_)[i];
 
       // we cannot use cv:Scharr here, because we need to handle also
       // kernel sizes other than 3, by default we are using 9x9, 5x5 and 7x7
@@ -571,23 +636,33 @@ public:
   }
 
 private:
-  std::vector<Evolution>*  evolution_;
+  std::vector<Evolution<MatType> >*  evolution_;
 };
 
 
 /**
  * @brief This method computes the feature detector response for the nonlinear scale space
+ * @details OCL version
  * @note We use the Hessian determinant as the feature detector response
  */
-void AKAZEFeatures::Compute_Determinant_Hessian_Response(void) {
+static inline void
+Compute_Determinant_Hessian_Response(UMatPyramid &evolution) {
   CV_INSTRUMENT_REGION()
 
-  if (ocl::useOpenCL()) {
-    DeterminantHessianResponse body (evolution_);
-    body(Range(0, (int)evolution_.size()));
-  } else {
-    parallel_for_(Range(0, (int)evolution_.size()), DeterminantHessianResponse(evolution_));
-  }
+  DeterminantHessianResponse<UMat> body (evolution);
+  body(Range(0, (int)evolution.size()));
+}
+
+/**
+ * @brief This method computes the feature detector response for the nonlinear scale space
+ * @details CPU version
+ * @note We use the Hessian determinant as the feature detector response
+ */
+static inline void
+Compute_Determinant_Hessian_Response(Pyramid &evolution) {
+  CV_INSTRUMENT_REGION()
+
+  parallel_for_(Range(0, (int)evolution.size()), DeterminantHessianResponse<Mat>(evolution));
 }
 
 /* ************************************************************************* */
@@ -604,6 +679,7 @@ void AKAZEFeatures::Feature_Detection(std::vector<KeyPoint>& kpts)
   std::vector<Mat> keypoints_by_layers;
   Find_Scale_Space_Extrema(keypoints_by_layers);
   Do_Subpixel_Refinement(keypoints_by_layers, kpts);
+  Compute_Keypoints_Orientation(kpts);
 }
 
 /**
@@ -644,7 +720,7 @@ find_neighbor_point(const int x, const int y, const Mat &mask, const int search_
 class FindKeypointsSameScale : public ParallelLoopBody
 {
 public:
-    explicit FindKeypointsSameScale(const std::vector<Evolution>& ev,
+    explicit FindKeypointsSameScale(const Pyramid& ev,
       std::vector<Mat>& kpts, float dthreshold)
     : evolution_(&ev), keypoints_by_layers_(&kpts), dthreshold_(dthreshold)
   {}
@@ -653,7 +729,7 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      const Evolution &e = (*evolution_)[i];
+      const MEvolution &e = (*evolution_)[i];
       Mat &kpts = (*keypoints_by_layers_)[i];
       // this mask will hold positions of keypoints in this level
       kpts = Mat::zeros(e.Ldet.size(), CV_8UC1);
@@ -704,7 +780,7 @@ public:
   }
 
 private:
-  const std::vector<Evolution>*  evolution_;
+  const Pyramid*  evolution_;
   std::vector<Mat>* keypoints_by_layers_;
   float dthreshold_; ///< Detector response threshold to accept point
 };
@@ -799,7 +875,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(
   CV_INSTRUMENT_REGION()
 
   for (size_t i = 0; i < keypoints_by_layers.size(); i++) {
-    const Evolution &e = evolution_[i];
+    const MEvolution &e = evolution_[i];
     const float * const ldet = e.Ldet.ptr<float>();
     const float ratio = e.octave_ratio;
     const int cols = e.Ldet.cols;
@@ -865,7 +941,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(
 class SURF_Descriptor_Upright_64_Invoker : public ParallelLoopBody
 {
 public:
-  SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
+  SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, const Pyramid& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -885,13 +961,13 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  const Pyramid*   evolution_;
 };
 
 class SURF_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
+  SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -911,13 +987,13 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
 };
 
 class MSURF_Upright_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
+  MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -937,13 +1013,13 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
 };
 
 class MSURF_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
+  MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -963,13 +1039,13 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
 };
 
 class Upright_MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
 {
 public:
-  Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
+  Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution, AKAZEOptions& options)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -990,7 +1066,7 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
   AKAZEOptions*              options_;
 };
 
@@ -999,7 +1075,7 @@ class Upright_MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
 public:
   Upright_MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
                                          Mat& desc,
-                                         std::vector<Evolution>& evolution,
+                                         Pyramid& evolution,
                                          AKAZEOptions& options,
                                          Mat descriptorSamples,
                                          Mat descriptorBits)
@@ -1025,7 +1101,7 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
   AKAZEOptions*              options_;
 
   Mat descriptorSamples_;  // List of positions in the grids to sample LDB bits from.
@@ -1035,7 +1111,7 @@ private:
 class MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
 {
 public:
-  MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
+  MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution, AKAZEOptions& options)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -1060,7 +1136,7 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
   AKAZEOptions*              options_;
 };
 
@@ -1069,7 +1145,7 @@ class MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
 public:
   MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
                                  Mat& desc,
-                                 std::vector<Evolution>& evolution,
+                                 Pyramid& evolution,
                                  AKAZEOptions& options,
                                  Mat descriptorSamples,
                                  Mat descriptorBits)
@@ -1095,7 +1171,7 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<Evolution>*   evolution_;
+  Pyramid*   evolution_;
   AKAZEOptions*              options_;
 
   Mat descriptorSamples_;  // List of positions in the grids to sample LDB bits from.
@@ -1282,10 +1358,10 @@ void quantized_counting_sort(const float a[], const int n,
  * original SURF method. See Bay et al., Speeded Up Robust Features, ECCV 2006
  */
 static inline
-void Compute_Main_Orientation(KeyPoint& kpt, const std::vector<Evolution>& evolution)
+void Compute_Main_Orientation(KeyPoint& kpt, const Pyramid& evolution)
 {
   // get the right evolution level for this keypoint
-  const Evolution& e = evolution[kpt.class_id];
+  const MEvolution& e = evolution[kpt.class_id];
   // Get the information from the keypoint
   int scale = cvRound(0.5f * kpt.size / e.octave_ratio);
   int x0 = cvRound(kpt.pt.x / e.octave_ratio);
@@ -1366,7 +1442,7 @@ class ComputeKeypointOrientation : public ParallelLoopBody
 {
 public:
   ComputeKeypointOrientation(std::vector<KeyPoint>& kpts,
-                             const std::vector<Evolution>& evolution)
+                             const Pyramid& evolution)
     : keypoints_(&kpts)
     , evolution_(&evolution)
   {
@@ -1381,7 +1457,7 @@ public:
   }
 private:
   std::vector<KeyPoint>* keypoints_;
-  const std::vector<Evolution>* evolution_;
+  const Pyramid* evolution_;
 };
 
 /**
@@ -1421,7 +1497,7 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
   // Subregion centers for the 4x4 gaussian weighting
   float cx = -0.5f, cy = 0.5f;
 
-  const std::vector<Evolution>& evolution = *evolution_;
+  const Pyramid& evolution = *evolution_;
 
   // Set the descriptor size and the sample and pattern sizes
   sample_step = 5;
@@ -1554,7 +1630,7 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
   // Subregion centers for the 4x4 gaussian weighting
   float cx = -0.5f, cy = 0.5f;
 
-  const std::vector<Evolution>& evolution = *evolution_;
+  const Pyramid& evolution = *evolution_;
 
   // Set the descriptor size and the sample and pattern sizes
   sample_step = 5;
@@ -1675,7 +1751,7 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
 void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {
 
   const AKAZEOptions & options = *options_;
-  const std::vector<Evolution>& evolution = *evolution_;
+  const Pyramid& evolution = *evolution_;
 
   // Buffer for the M-LDB descriptor
   const int max_channels = 3;
@@ -1777,7 +1853,7 @@ void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(cons
 void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_step, const int level,
                                                     float xf, float yf, float co, float si, float scale) const
 {
-    const std::vector<Evolution>& evolution = *evolution_;
+    const Pyramid& evolution = *evolution_;
     int pattern_size = options_->descriptor_pattern_size;
     int chan = options_->descriptor_channels;
     const Mat Lx = evolution[level].Lx;
@@ -1924,7 +2000,7 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
   float sample_x = 0.f, sample_y = 0.f;
 
   const AKAZEOptions & options = *options_;
-  const std::vector<Evolution>& evolution = *evolution_;
+  const Pyramid& evolution = *evolution_;
 
   // Get the information from the keypoint
   float ratio = (float)(1 << kpt.octave);
@@ -2033,7 +2109,7 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
   int x1 = 0, y1 = 0;
 
   const AKAZEOptions & options = *options_;
-  const std::vector<Evolution>& evolution = *evolution_;
+  const Pyramid& evolution = *evolution_;
 
   // Get the information from the keypoint
   float ratio = (float)(1 << kpt.octave);
diff --git a/modules/features2d/src/kaze/AKAZEFeatures.h b/modules/features2d/src/kaze/AKAZEFeatures.h
index 18dc5fd99c..512f553886 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.h
+++ b/modules/features2d/src/kaze/AKAZEFeatures.h
@@ -17,6 +17,7 @@ namespace cv
 {
 
 /// A-KAZE nonlinear diffusion filtering evolution
+template <typename MatType>
 struct Evolution
 {
   Evolution() {
@@ -29,10 +30,28 @@ struct Evolution
     border = 0;
   }
 
-  Mat Lx, Ly;           ///< First order spatial derivatives
-  Mat Lt;               ///< Evolution image
-  Mat Lsmooth;          ///< Smoothed image, used only for computing determinant, released afterwards
-  Mat Ldet;             ///< Detector response
+  template <typename T>  // T is the matrix type stored by the source layer
+  explicit Evolution(const Evolution<T> &other) {  // explicit: the caller has to request the conversion
+    size = other.size;  // the scalar layer parameters are taken over member by member
+    etime = other.etime;
+    esigma = other.esigma;
+    octave = other.octave;
+    sublevel = other.sublevel;
+    sigma_size = other.sigma_size;
+    octave_ratio = other.octave_ratio;
+    border = other.border;
+    // Every image goes through copyTo() so that it ends up stored as this layer's MatType.
+    other.Lx.copyTo(Lx);
+    other.Ly.copyTo(Ly);
+    other.Lt.copyTo(Lt);
+    other.Lsmooth.copyTo(Lsmooth);
+    other.Ldet.copyTo(Ldet);
+  }
+
+  MatType Lx, Ly;           ///< First order spatial derivatives
+  MatType Lt;               ///< Evolution image
+  MatType Lsmooth;          ///< Smoothed image, used only for computing determinant, released afterwards
+  MatType Ldet;             ///< Detector response
 
   Size size;                ///< Size of the layer
   float etime;              ///< Evolution time
@@ -44,6 +63,11 @@ struct Evolution
   int border;               ///< Width of border where descriptors cannot be computed
 };
 
+typedef Evolution<Mat> MEvolution;            ///< Evolution layer whose images are stored as Mat
+typedef Evolution<UMat> UEvolution;           ///< Evolution layer whose images are stored as UMat
+typedef std::vector<MEvolution> Pyramid;      ///< Sequence of Mat-backed evolution layers
+typedef std::vector<UEvolution> UMatPyramid;  ///< Sequence of UMat-backed evolution layers
+
 /* ************************************************************************* */
 // AKAZE Class Declaration
 class AKAZEFeatures {
@@ -51,7 +75,7 @@ class AKAZEFeatures {
 private:
 
   AKAZEOptions options_;                ///< Configuration options for AKAZE
-  std::vector<Evolution> evolution_;        ///< Vector of nonlinear diffusion evolution
+  Pyramid evolution_;        ///< Vector of nonlinear diffusion evolution
 
   /// FED parameters
   int ncycles_;                  ///< Number of cycles
@@ -64,23 +88,21 @@ private:
   cv::Mat descriptorBits_;
   cv::Mat bitMask_;
 
-public:
-
-  /// Constructor with input arguments
-  AKAZEFeatures(const AKAZEOptions& options);
-
   /// Scale Space methods
   void Allocate_Memory_Evolution();
-  void Create_Nonlinear_Scale_Space(InputArray img);
-  void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
-  void Compute_Determinant_Hessian_Response(void);
   void Find_Scale_Space_Extrema(std::vector<Mat>& keypoints_by_layers);
   void Do_Subpixel_Refinement(std::vector<Mat>& keypoints_by_layers,
     std::vector<KeyPoint>& kpts);
 
   /// Feature description methods
-  void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, OutputArray desc);
   void Compute_Keypoints_Orientation(std::vector<cv::KeyPoint>& kpts) const;
+
+public:
+  /// Constructor with input arguments
+  AKAZEFeatures(const AKAZEOptions& options);
+  void Create_Nonlinear_Scale_Space(InputArray img);
+  void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
+  void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, OutputArray desc);
 };
 
 /* ************************************************************************* */
diff --git a/modules/features2d/test/ocl/test_feature2d.cpp b/modules/features2d/test/ocl/test_feature2d.cpp
new file mode 100644
index 0000000000..2fb0407832
--- /dev/null
+++ b/modules/features2d/test/ocl/test_feature2d.cpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "../test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+#define TEST_IMAGES testing::Values(\
+    "detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
+    "../stitching/a3.png", \
+    "../stitching/s2.jpg")  // image paths combined with each feature instance; the fixture passes them to readImage()
+
+PARAM_TEST_CASE(Feature2DFixture, Ptr<Feature2D>, std::string)  // parameters: (feature under test, image path)
+{
+    std::string filename;
+    Mat image, descriptors;       // input image and the descriptors produced under OCL_OFF
+    vector<KeyPoint> keypoints;   // keypoints produced under OCL_OFF
+    UMat uimage, udescriptors;    // UMat copy of the image and the descriptors produced under OCL_ON
+    vector<KeyPoint> ukeypoints;  // keypoints produced under OCL_ON
+    Ptr<Feature2D> feature;
+
+    virtual void SetUp()  // runs detect() and compute() under both modes so the tests only compare results
+    {
+        feature = GET_PARAM(0);
+        filename = GET_PARAM(1);
+
+        image = readImage(filename);
+
+        ASSERT_FALSE(image.empty());  // stop here if the loaded image is empty
+
+        image.copyTo(uimage);
+
+        OCL_OFF(feature->detect(image, keypoints));
+        OCL_ON(feature->detect(uimage, ukeypoints));
+        // note: we use keypoints from CPU for GPU too, to test descriptors separately
+        OCL_OFF(feature->compute(image, keypoints, descriptors));
+        OCL_ON(feature->compute(uimage, keypoints, udescriptors));
+    }
+};
+
+OCL_TEST_P(Feature2DFixture, KeypointsSame)  // OCL_OFF and OCL_ON keypoints must agree pairwise, in order
+{
+    // Fatal assertion: on a size mismatch the indexed comparison below could read past the end of ukeypoints.
+    ASSERT_EQ(keypoints.size(), ukeypoints.size());
+    for (size_t i = 0; i < keypoints.size(); ++i)
+    {
+        EXPECT_GE(KeyPoint::overlap(keypoints[i], ukeypoints[i]), 0.95);
+        EXPECT_NEAR(keypoints[i].angle, ukeypoints[i].angle, 0.001);
+    }
+}
+
+OCL_TEST_P(Feature2DFixture, DescriptorsSame)  // both descriptor sets were computed in SetUp() from the same keypoints
+{
+    EXPECT_MAT_NEAR(descriptors, udescriptors, 0.001);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(AKAZE, Feature2DFixture,  // AKAZE::create() with its default arguments, over every test image
+    testing::Combine(testing::Values(AKAZE::create()), TEST_IMAGES));
+
+OCL_INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, Feature2DFixture,  // same fixture with AKAZE::DESCRIPTOR_KAZE selected
+    testing::Combine(testing::Values(AKAZE::create(AKAZE::DESCRIPTOR_KAZE)), TEST_IMAGES));
+
+}//ocl
+}//cvtest
+
+#endif //HAVE_OPENCL