diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
index cd429b923e..ba36efcdf5 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
index 9fec924b9b..1868ed2c8c 100644
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
index 7a3d126cf9..295cbc6b03 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
index 12f2181374..d34fa65d81 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
index bc26834779..b6b8f4df50 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
new file mode 100644
index 0000000000..1fec795114
Binary files /dev/null and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
index 2ff9434603..f26d26a252 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
index 475a986010..78c104c114 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
index 1dbabe4063..c5532e36b5 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
index 40ebc69461..fae5d9bad0 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
index 03b77bba57..8543503106 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
new file mode 100644
index 0000000000..352461f82c
Binary files /dev/null and b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r2.3.3.so b/3rdparty/lib/x86/libnative_camera_r2.3.3.so
index 0a8b0faee1..4e04a3e23f 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r2.3.3.so and b/3rdparty/lib/x86/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r3.0.1.so b/3rdparty/lib/x86/libnative_camera_r3.0.1.so
index 564c2846e4..6d3a898061 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r3.0.1.so and b/3rdparty/lib/x86/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.0.3.so b/3rdparty/lib/x86/libnative_camera_r4.0.3.so
index 7ab37360e0..c00c957015 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.0.3.so and b/3rdparty/lib/x86/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.1.1.so b/3rdparty/lib/x86/libnative_camera_r4.1.1.so
new file mode 100644
index 0000000000..df22898b45
Binary files /dev/null and b/3rdparty/lib/x86/libnative_camera_r4.1.1.so differ
diff --git a/doc/tutorials/introduction/android_binary_package/android_binary_package.rst b/doc/tutorials/introduction/android_binary_package/android_binary_package.rst
index 09a0997d06..d95a344979 100644
--- a/doc/tutorials/introduction/android_binary_package/android_binary_package.rst
+++ b/doc/tutorials/introduction/android_binary_package/android_binary_package.rst
@@ -93,7 +93,6 @@ You need the following to be installed:
      But for successful compilation of some samples the **target** platform should be set to Android 3.0 (API 11) or higher. It will not prevent them from running on  Android 2.2.
 
      .. image:: images/android_sdk_and_avd_manager.png
-        :height: 500px
         :alt: Android SDK Manager
         :align: center
 
@@ -330,7 +329,6 @@ Well, running samples from Eclipse is very simple:
 * Here is ``Tutorial 2 - Use OpenCV Camera`` sample, running on top of stock camera-preview of the emulator.
 
   .. image:: images/emulator_canny.png
-     :height: 600px
      :alt: Tutorial 1 Basic - 1. Add OpenCV - running Canny
      :align: center
 
diff --git a/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png b/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png
index b411d7b4f0..1bc0511318 100644
Binary files a/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png and b/doc/tutorials/introduction/android_binary_package/images/emulator_canny.png differ
diff --git a/modules/androidcamera/camera_wrapper/CMakeLists.txt b/modules/androidcamera/camera_wrapper/CMakeLists.txt
index 058a2cb6db..9398635c88 100644
--- a/modules/androidcamera/camera_wrapper/CMakeLists.txt
+++ b/modules/androidcamera/camera_wrapper/CMakeLists.txt
@@ -4,18 +4,27 @@ project(${the_target})
 
 link_directories("${ANDROID_SOURCE_TREE}/out/target/product/generic/system/lib")
 
-INCLUDE_DIRECTORIES(BEFORE
+if (ANDROID_VERSION VERSION_LESS "4.1")
+    INCLUDE_DIRECTORIES(BEFORE
 	${ANDROID_SOURCE_TREE}
 	${ANDROID_SOURCE_TREE}/frameworks/base/include/ui
 	${ANDROID_SOURCE_TREE}/frameworks/base/include/surfaceflinger
 	${ANDROID_SOURCE_TREE}/frameworks/base/include/camera
 	${ANDROID_SOURCE_TREE}/frameworks/base/include/media
-	${ANDROID_SOURCE_TREE}/frameworks/base/include/camera
 	${ANDROID_SOURCE_TREE}/frameworks/base/include
 	${ANDROID_SOURCE_TREE}/system/core/include
 	${ANDROID_SOURCE_TREE}/hardware/libhardware/include
 	${ANDROID_SOURCE_TREE}/frameworks/base/native/include
 	)
+else()
+    INCLUDE_DIRECTORIES(BEFORE
+	${ANDROID_SOURCE_TREE}
+	${ANDROID_SOURCE_TREE}/frameworks/native/include
+	${ANDROID_SOURCE_TREE}/frameworks/av/include
+	${ANDROID_SOURCE_TREE}/system/core/include
+	${ANDROID_SOURCE_TREE}/hardware/libhardware/include
+	)
+endif()
 
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
 
diff --git a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
index 727ae6ad77..c7a55fb39b 100644
--- a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
+++ b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
@@ -1,4 +1,4 @@
-#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1) && !defined(ANDROID_r4_0_0) && !defined(ANDROID_r4_0_3)
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1) && !defined(ANDROID_r4_0_0) && !defined(ANDROID_r4_0_3) && !defined(ANDROID_r4_1_1)
 # error Building camera wrapper for your version of Android is not supported by OpenCV. You need to modify OpenCV sources in order to compile camera wrapper for your version of Android.
 #endif
 
@@ -12,13 +12,18 @@
 #include "camera_wrapper.h"
 #include "../include/camera_properties.h"
 
-#if defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
+#if defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3) || defined(ANDROID_r4_1_1)
 //Include SurfaceTexture.h file with the SurfaceTexture class
 # include <gui/SurfaceTexture.h>
 # define MAGIC_OPENCV_TEXTURE_ID (0x10)
 #else // defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
 //TODO: This is either 2.2 or 2.3. Include the headers for ISurface.h access
+#if defined(ANDROID_r4_1_1)
+#include <gui/ISurface.h>
+#include <gui/BufferQueue.h>
+#else
 # include <surfaceflinger/ISurface.h>
+#endif  // defined(ANDROID_r4_1_1)
 #endif  // defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
 
 #include <string>
@@ -53,6 +58,21 @@
 
 using namespace android;
 
+void debugShowFPS();
+
+#if defined(ANDROID_r4_1_1)
+class ConsumerListenerStub: public BufferQueue::ConsumerListener
+{
+public:
+    virtual void onFrameAvailable()
+    {
+    }
+    virtual void onBuffersReleased()
+    {
+    }
+};
+#endif
+
 void debugShowFPS()
 {
     static int mFrameCount = 0;
@@ -260,8 +280,8 @@ public:
     }
 
     virtual void postData(int32_t msgType, const sp<IMemory>& dataPtr
-#if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
-                          ,camera_frame_metadata_t* metadata
+#if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3) || defined(ANDROID_r4_1_1)
+                          ,camera_frame_metadata_t*
 #endif
                           )
     {
@@ -506,9 +526,16 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
     pdstatus = camera->setPreviewTexture(surfaceTexture);
     if (pdstatus != 0)
         LOGE("initCameraConnect: failed setPreviewTexture call; camera migth not work correctly");
+#elif defined(ANDROID_r4_1_1)
+    sp<BufferQueue> bufferQueue = new BufferQueue();
+    sp<BufferQueue::ConsumerListener> queueListener = new ConsumerListenerStub();
+    bufferQueue->consumerConnect(queueListener);
+    pdstatus = camera->setPreviewTexture(bufferQueue);
+    if (pdstatus != 0)
+	LOGE("initCameraConnect: failed setPreviewTexture call; camera migth not work correctly");
 #endif
 
-#if !(defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3))
+#if (defined(ANDROID_r2_2_0) || defined(ANDROID_r2_3_3) || defined(ANDROID_r3_0_1))
 # if 1
     ////ATTENTION: switching between two versions: with and without copying memory inside Android OS
     //// see the method  CameraService::Client::copyFrameAndPostCopiedFrame and where it is used
@@ -520,6 +547,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
     camera->setPreviewCallbackFlags( CAMERA_FRAME_CALLBACK_FLAG_ENABLE_MASK | CAMERA_FRAME_CALLBACK_FLAG_COPY_OUT_MASK);//with copy
 #endif //!(defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3))
 
+    LOGD("Starting preview");
     status_t resStart = camera->startPreview();
 
     if (resStart != 0)
@@ -528,6 +556,10 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
         handler->closeCameraConnect();
         handler = 0;
     }
+    else
+    {
+	LOGD("Preview started successfully");
+    }
 
     return handler;
 }
diff --git a/modules/contrib/doc/facerec/facerec_api.rst b/modules/contrib/doc/facerec/facerec_api.rst
index 018a080654..6766894bb4 100644
--- a/modules/contrib/doc/facerec/facerec_api.rst
+++ b/modules/contrib/doc/facerec/facerec_api.rst
@@ -19,16 +19,19 @@ a unified access to all face recongition algorithms in OpenCV. ::
 
       // Trains a FaceRecognizer.
       virtual void train(InputArray src, InputArray labels) = 0;
-
+      
+      // Updates a FaceRecognizer.
+      virtual void update(InputArrayOfArrays src, InputArray labels);
+      
       // Gets a prediction from a FaceRecognizer.
       virtual int predict(InputArray src) const = 0;
-
+      
       // Predicts the label and confidence for a given sample.
       virtual void predict(InputArray src, int &label, double &confidence) const = 0;
 
       // Serializes this object to a given filename.
       virtual void save(const string& filename) const;
-
+      
       // Deserializes this object from a given filename.
       virtual void load(const string& filename);
 
@@ -39,6 +42,7 @@ a unified access to all face recongition algorithms in OpenCV. ::
       virtual void load(const FileStorage& fs) = 0;
   };
 
+
 Description
 +++++++++++
 
@@ -99,13 +103,6 @@ If you've set the threshold to ``0.0`` as we did above, then:
 
 is going to yield ``-1`` as predicted label, which states this face is unknown.
 
-Adding new samples to a trained FaceRecognizer
-++++++++++++++++++++++++++++++++++++++++++++++
-
-Adding new images to a trained :ocv:class:`FaceRecognizer` is possible, but only if the :ocv:class:`FaceRecognizer` supports it. For the Eigenfaces and Fisherfaces method each call to :ocv:func:`FaceRecognizer::train` empties the old model and estimates a new model on the given data. This is an algorithmic necessity for these two algorithms, no way around that. Please see the tutorial Guide To Face Recognition with OpenCV for details. If you call :ocv:func:`FaceRecognizer::train` on a LBPH model, the internal model is extended with the new samples.
-
-Please note: A :ocv:class:`FaceRecognizer` does not store your training images (this would be very memory intense), the caller is responsible for maintaining the dataset.
-
 Getting the name of a FaceRecognizer
 +++++++++++++++++++++++++++++++++++++
 
@@ -164,6 +161,50 @@ And finally train it on the given dataset (the face images and labels):
     //
     model->train(images, labels);
 
+FaceRecognizer::update
+----------------------
+
+Updates a FaceRecognizer with given data and associated labels.
+
+.. ocv:function:: void FaceRecognizer::update(InputArrayOfArrays src, InputArray labels)
+
+    :param src: The training images, that means the faces you want to learn. The data has to be given as a ``vector<Mat>``.
+
+    :param labels: The labels corresponding to the images have to be given either as a ``vector<int>`` or a ``Mat`` of type ``CV_32SC1``.
+
+This method updates a (possibly already trained) :ocv:class:`FaceRecognizer`, but only if the algorithm supports it. The Local Binary Patterns Histograms (LBPH) recognizer (see :ocv:func:`createLBPHFaceRecognizer`) can be updated. For the Eigenfaces and Fisherfaces methods, this is algorithmically not possible and you have to re-estimate the model with :ocv:func:`FaceRecognizer::train`. In any case, a call to ``train`` empties the existing model and learns a new model, while ``update`` does not delete any model data.
+
+.. code-block:: cpp
+
+    // Create a new LBPH model (it can be updated) and use the default parameters,
+    // this is the most common usage of this specific FaceRecognizer:
+    //
+    Ptr<FaceRecognizer> model =  createLBPHFaceRecognizer();
+    // This is the common interface to train all of the available cv::FaceRecognizer
+    // implementations:
+    //
+    model->train(images, labels);
+    // Some containers to hold the new images and their labels:
+    vector<Mat> newImages;
+    vector<int> newLabels;
+    // You should add some images to the containers:
+    //
+    // ...
+    //
+    // Now updating the model is as easy as calling:
+    model->update(newImages,newLabels);
+    // This will preserve the old model data and extend the existing model 
+    // with the new features extracted from newImages!
+
+Calling update on an Eigenfaces model (see :ocv:func:`createEigenFaceRecognizer`), which doesn't support updating, will throw an error similar to:
+
+.. code-block:: none
+
+    OpenCV Error: The function/feature is not implemented (This FaceRecognizer (FaceRecognizer.Eigenfaces) does not support updating, you have to use FaceRecognizer::train to update it.) in update, file /home/philipp/git/opencv/modules/contrib/src/facerec.cpp, line 305
+    terminate called after throwing an instance of 'cv::Exception'
+
+Please note: The :ocv:class:`FaceRecognizer` does not store your training images, because this would be very memory intensive and it's not the responsibility of the :ocv:class:`FaceRecognizer` to do so. The caller is responsible for maintaining the dataset they want to work with.
+ 
 FaceRecognizer::predict
 -----------------------
 
@@ -176,8 +217,6 @@ FaceRecognizer::predict
     :param label: The predicted label for the given image.
     :param confidence: Associated confidence (e.g. distance) for the predicted label.
 
-
-
 The suffix ``const`` means that prediction does not affect the internal model
 state, so the method can be safely called from within different threads.
 
@@ -260,7 +299,7 @@ Notes:
 
 * Training and prediction must be done on grayscale images, use :ocv:func:`cvtColor` to convert between the color spaces.
 * **THE EIGENFACES METHOD MAKES THE ASSUMPTION, THAT THE TRAINING AND TEST IMAGES ARE OF EQUAL SIZE.** (caps-lock, because I got so many mails asking for this). You have to make sure your input data has the correct shape, else a meaningful exception is thrown. Use :ocv:func:`resize` to resize the images.
-* A call to :ocv:func:`FaceRecognizer::train` empties the Eigenfaces model and re-estimates a model on given data.
+* This model does not support updating.
 
 Model internal data:
 ++++++++++++++++++++
@@ -287,7 +326,7 @@ Notes:
 
 * Training and prediction must be done on grayscale images, use :ocv:func:`cvtColor` to convert between the color spaces.
 * **THE FISHERFACES METHOD MAKES THE ASSUMPTION, THAT THE TRAINING AND TEST IMAGES ARE OF EQUAL SIZE.** (caps-lock, because I got so many mails asking for this). You have to make sure your input data has the correct shape, else a meaningful exception is thrown. Use :ocv:func:`resize` to resize the images.
-* A call to :ocv:func:`FaceRecognizer::train` empties the Fisherfaces model and re-estimates a model on given data.
+* This model does not support updating.
 
 Model internal data:
 ++++++++++++++++++++
@@ -316,7 +355,7 @@ Notes:
 ++++++
 
 * The Circular Local Binary Patterns (used in training and prediction) expect the data given as grayscale images, use :ocv:func:`cvtColor` to convert between the color spaces.
-* A call to :ocv:func:`FaceRecognizer::train` extends the LBPH model with given data.
+* This model supports updating.
 
 Model internal data:
 ++++++++++++++++++++
diff --git a/modules/contrib/include/opencv2/contrib/contrib.hpp b/modules/contrib/include/opencv2/contrib/contrib.hpp
index 8800c3ea2b..9f8ed9d52c 100644
--- a/modules/contrib/include/opencv2/contrib/contrib.hpp
+++ b/modules/contrib/include/opencv2/contrib/contrib.hpp
@@ -927,6 +927,9 @@ namespace cv
         // Trains a FaceRecognizer.
         CV_WRAP virtual void train(InputArrayOfArrays src, InputArray labels) = 0;
 
+        // Updates a FaceRecognizer.
+        CV_WRAP virtual void update(InputArrayOfArrays src, InputArray labels);
+
         // Gets a prediction from a FaceRecognizer.
         virtual int predict(InputArray src) const = 0;
 
diff --git a/modules/contrib/src/facerec.cpp b/modules/contrib/src/facerec.cpp
index 250706a861..6ff51fe89a 100644
--- a/modules/contrib/src/facerec.cpp
+++ b/modules/contrib/src/facerec.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011. Philipp Wagner <bytefish[at]gmx[dot]de>.
+ * Copyright (c) 2011,2012. Philipp Wagner <bytefish[at]gmx[dot]de>.
  * Released to public domain under terms of the BSD Simplified license.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -197,10 +197,10 @@ public:
     void predict(InputArray _src, int &label, double &dist) const;
 
     // See FaceRecognizer::load.
-    virtual void load(const FileStorage& fs);
+    void load(const FileStorage& fs);
 
     // See FaceRecognizer::save.
-    virtual void save(FileStorage& fs) const;
+    void save(FileStorage& fs) const;
 
     AlgorithmInfo* info() const;
 };
@@ -223,6 +223,12 @@ private:
     vector<Mat> _histograms;
     Mat _labels;
 
+    // Computes a LBPH model with images in src and
+    // corresponding labels in labels, possibly preserving
+    // old model data.
+    void train(InputArrayOfArrays src, InputArray labels, bool preserveData);
+
+
 public:
     using FaceRecognizer::save;
     using FaceRecognizer::load;
@@ -265,6 +271,10 @@ public:
     // corresponding labels in labels.
     void train(InputArrayOfArrays src, InputArray labels);
 
+    // Updates this LBPH model with images in src and
+    // corresponding labels in labels.
+    void update(InputArrayOfArrays src, InputArray labels);
+
     // Predicts the label of a query image in src.
     int predict(InputArray src) const;
 
@@ -290,6 +300,11 @@ public:
 //------------------------------------------------------------------------------
 // FaceRecognizer
 //------------------------------------------------------------------------------
+void FaceRecognizer::update(InputArrayOfArrays, InputArray) {
+    string error_msg = format("This FaceRecognizer (%s) does not support updating, you have to use FaceRecognizer::train to update it.", this->name().c_str());
+    CV_Error(CV_StsNotImplemented, error_msg);
+}
+
 void FaceRecognizer::save(const string& filename) const {
     FileStorage fs(filename, FileStorage::WRITE);
     if (!fs.isOpened())
@@ -563,7 +578,6 @@ void olbp_(InputArray _src, OutputArray _dst) {
     }
 }
 
-
 //------------------------------------------------------------------------------
 // cv::elbp
 //------------------------------------------------------------------------------
@@ -607,15 +621,19 @@ inline void elbp_(InputArray _src, OutputArray _dst, int radius, int neighbors)
 
 static void elbp(InputArray src, OutputArray dst, int radius, int neighbors)
 {
-    switch (src.type()) {
-        case CV_8SC1:   elbp_<char>(src,dst, radius, neighbors); break;
-        case CV_8UC1:   elbp_<unsigned char>(src, dst, radius, neighbors); break;
-        case CV_16SC1:  elbp_<short>(src,dst, radius, neighbors); break;
-        case CV_16UC1:  elbp_<unsigned short>(src,dst, radius, neighbors); break;
-        case CV_32SC1:  elbp_<int>(src,dst, radius, neighbors); break;
-        case CV_32FC1:  elbp_<float>(src,dst, radius, neighbors); break;
-        case CV_64FC1:  elbp_<double>(src,dst, radius, neighbors); break;
-        default: break;
+    int type = src.type();
+    switch (type) {
+    case CV_8SC1:   elbp_<char>(src,dst, radius, neighbors); break;
+    case CV_8UC1:   elbp_<unsigned char>(src, dst, radius, neighbors); break;
+    case CV_16SC1:  elbp_<short>(src,dst, radius, neighbors); break;
+    case CV_16UC1:  elbp_<unsigned short>(src,dst, radius, neighbors); break;
+    case CV_32SC1:  elbp_<int>(src,dst, radius, neighbors); break;
+    case CV_32FC1:  elbp_<float>(src,dst, radius, neighbors); break;
+    case CV_64FC1:  elbp_<double>(src,dst, radius, neighbors); break;
+    default:
+        string error_msg = format("Using Original Local Binary Patterns for feature extraction only works on single-channel images (given %d). Please pass the image data as a grayscale image!", type);
+        CV_Error(CV_StsNotImplemented, error_msg);
+        break;
     }
 }
 
@@ -727,28 +745,45 @@ void LBPH::save(FileStorage& fs) const {
     fs << "labels" << _labels;
 }
 
-void LBPH::train(InputArrayOfArrays _src, InputArray _lbls) {
-    if(_src.kind() != _InputArray::STD_VECTOR_MAT && _src.kind() != _InputArray::STD_VECTOR_VECTOR) {
+void LBPH::train(InputArrayOfArrays _in_src, InputArray _in_labels) {
+    this->train(_in_src, _in_labels, false);
+}
+
+void LBPH::update(InputArrayOfArrays _in_src, InputArray _in_labels) {
+    // got no data, just return
+    if(_in_src.total() == 0)
+        return;
+
+    this->train(_in_src, _in_labels, true);
+}
+
+void LBPH::train(InputArrayOfArrays _in_src, InputArray _in_labels, bool preserveData) {
+    if(_in_src.kind() != _InputArray::STD_VECTOR_MAT && _in_src.kind() != _InputArray::STD_VECTOR_VECTOR) {
         string error_message = "The images are expected as InputArray::STD_VECTOR_MAT (a std::vector<Mat>) or _InputArray::STD_VECTOR_VECTOR (a std::vector< vector<...> >).";
         CV_Error(CV_StsBadArg, error_message);
     }
-    if(_src.total() == 0) {
+    if(_in_src.total() == 0) {
         string error_message = format("Empty training data was given. You'll need more than one sample to learn a model.");
         CV_Error(CV_StsUnsupportedFormat, error_message);
-    } else if(_lbls.getMat().type() != CV_32SC1) {
-        string error_message = format("Labels must be given as integer (CV_32SC1). Expected %d, but was %d.", CV_32SC1, _lbls.type());
+    } else if(_in_labels.getMat().type() != CV_32SC1) {
+        string error_message = format("Labels must be given as integer (CV_32SC1). Expected %d, but was %d.", CV_32SC1, _in_labels.type());
         CV_Error(CV_StsUnsupportedFormat, error_message);
     }
     // get the vector of matrices
     vector<Mat> src;
-    _src.getMatVector(src);
+    _in_src.getMatVector(src);
     // get the label matrix
-    Mat labels = _lbls.getMat();
+    Mat labels = _in_labels.getMat();
     // check if data is well- aligned
     if(labels.total() != src.size()) {
         string error_message = format("The number of samples (src) must equal the number of labels (labels). Was len(samples)=%d, len(labels)=%d.", src.size(), _labels.total());
         CV_Error(CV_StsBadArg, error_message);
     }
+    // if this model should be trained without preserving old data, delete old model data
+    if(!preserveData) {
+        _labels.release();
+        _histograms.clear();
+    }
     // append labels to _labels matrix
     for(size_t labelIdx = 0; labelIdx < labels.total(); labelIdx++) {
         _labels.push_back(labels.at<int>((int)labelIdx));
diff --git a/modules/core/include/opencv2/core/parallel_tool.hpp b/modules/core/include/opencv2/core/parallel_tool.hpp
new file mode 100644
index 0000000000..08258d5c2b
--- /dev/null
+++ b/modules/core/include/opencv2/core/parallel_tool.hpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PARALLEL_TOOL_HPP__
+#define __OPENCV_PARALLEL_TOOL_HPP__
+
+#ifdef HAVE_CVCONFIG_H
+# include <cvconfig.h>
+#endif // HAVE_CVCONFIG_H
+
+/*
+    HAVE_TBB - using TBB
+    HAVE_GCD - using GCD
+    HAVE_OPENMP - using OpenMP
+    HAVE_CONCURRENCY - using visual studio 2010 concurrency
+*/
+
+#ifdef HAVE_TBB
+#  include "tbb/tbb_stddef.h"
+#  if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
+#    include "tbb/tbb.h"
+#    include "tbb/task.h"
+#    undef min
+#    undef max
+#  else
+#    undef HAVE_TBB
+#   endif // end TBB version
+#endif // HAVE_TBB
+
+#ifdef __cplusplus
+
+namespace cv
+{
+    // Abstract base class for the loop body passed to parallel_for_; subclasses implement operator() for a Range.
+    class CV_EXPORTS ParallelLoopBody
+    {
+    public:
+        virtual void operator() (const Range& range) const = 0;
+        virtual ~ParallelLoopBody();
+    };
+
+    CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);
+
+    template <typename Iterator, typename Body> inline
+    CV_EXPORTS void parallel_do_(Iterator first, Iterator last, const Body& body)
+    {
+#ifdef HAVE_TBB
+        tbb::parallel_do(first, last, body);
+#else
+        for ( ; first != last; ++first)
+            body(*first);
+#endif // HAVE_TBB
+    }
+
+    template <typename Body> inline
+    CV_EXPORTS void parallel_reduce_(const Range& range, Body& body)
+    {
+#ifdef HAVE_TBB
+        tbb::parallel_reduce(tbb::blocked_range<int>(range.start, range.end), body);
+#else
+        body(range);
+#endif // end HAVE_TBB
+    }
+
+} // namespace cv
+
+#endif // __cplusplus
+
+#endif // __OPENCV_PARALLEL_TOOL_HPP__
diff --git a/modules/core/src/parallel_tool.cpp b/modules/core/src/parallel_tool.cpp
new file mode 100644
index 0000000000..423d4787d1
--- /dev/null
+++ b/modules/core/src/parallel_tool.cpp
@@ -0,0 +1,112 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_CONCURRENCY
+#  include <ppl.h>
+#elif defined HAVE_OPENMP
+#  include <omp.h>
+#elif defined HAVE_GCD
+#  include <dispatch/dispatch.h>
+#endif // HAVE_CONCURRENCY
+
+namespace cv
+{
+    ParallelLoopBody::~ParallelLoopBody() { }
+
+#ifdef HAVE_TBB
+    // Adapter that lets TBB drive a ParallelLoopBody: turns tbb::blocked_range<int> into cv::Range.
+    class TbbProxyLoopBody
+    {
+    public:
+        TbbProxyLoopBody(const ParallelLoopBody& _body) :
+            body(&_body)
+        { }
+
+        void operator ()(const tbb::blocked_range<int>& range) const
+        {
+            body->operator()(Range(range.begin(), range.end()));
+        }
+
+    private:
+        const ParallelLoopBody* body; // only the address is stored; the body itself is not copied
+    };
+#endif // end HAVE_TBB
+
+#ifdef HAVE_GCD
+    // Context for dispatch_apply_f, which numbers its iterations from 0: the loop body
+    // plus the first index of the requested range, so block_function can shift the index.
+    struct GcdLoopContext { const ParallelLoopBody* body; int start; };
+
+    static void block_function(void* context, size_t index)
+    {
+        const GcdLoopContext* ctx = static_cast<const GcdLoopContext*>(context);
+        const int i = ctx->start + (int)index;
+        ctx->body->operator()(Range(i, i + 1));
+    }
+#endif // HAVE_GCD
+
+    // Runs body over range with the first backend enabled at build time: TBB, PPL, OpenMP or GCD;
+    // without any of them body(range) is called once. PPL, OpenMP and GCD call body once per index.
+    void parallel_for_(const Range& range, const ParallelLoopBody& body)
+    {
+#ifdef HAVE_TBB
+        tbb::parallel_for(tbb::blocked_range<int>(range.start, range.end), TbbProxyLoopBody(body));
+#elif defined HAVE_CONCURRENCY
+        // PPL passes a single int index to its functor, so wrap body to build the one-element Range
+        Concurrency::parallel_for(range.start, range.end, [&body](int i) { body(Range(i, i + 1)); });
+#elif defined HAVE_OPENMP
+#pragma omp parallel for schedule(dynamic)
+        for (int i = range.start; i < range.end; ++i)
+            body(Range(i, i + 1));
+#elif defined (HAVE_GCD)
+        // dispatch_apply_f returns only after all iterations have run, so a stack context is safe
+        GcdLoopContext ctx = { &body, range.start };
+        dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+        dispatch_apply_f(range.end - range.start, concurrent_queue, &ctx, block_function);
+#else
+        body(range);
+#endif // end HAVE_TBB
+    }
+
+} // namespace cv
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 81b9d6e80b..60429075aa 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -50,6 +50,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/core_c.h"
 #include "opencv2/core/internal.hpp"
+#include "opencv2/core/parallel_tool.hpp"
 
 #include <assert.h>
 #include <ctype.h>
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 2ef29eaf5f..6744b17819 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -221,39 +221,42 @@ static int countNonZero_(const T* src, int len )
     return nz;
 }
 
-template <> 
-int countNonZero_ <uchar> (const uchar* src, int len)
+static int countNonZero8u( const uchar* src, int len )
 {
-	int i=0, nz = 0;
-	#if (defined CV_SSE4_2 && CV_SSE4_2 && (_WIN64 || __amd64__))
-	if(USE_SSE4_2)//5x-6x
-	{
-		__m128i pattern = _mm_setzero_si128 ();
-		__m128i inv = _mm_set1_epi8((char)1); 
-		__int64 CV_DECL_ALIGNED(16) buf[2];
-		for (; i<=len-16; i+=16)
-		{
-			__m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
-			__m128i res = _mm_cmpeq_epi8(r0, pattern);
-			res =  _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
-			_mm_store_si128 ((__m128i*)buf, res);
-
-			__int64 countLow = _mm_popcnt_u64(buf[0]);
-			nz += countLow;
-		
-			__int64 countHigh = _mm_popcnt_u64(buf[1]);
-			nz +=countHigh;
-		}
-	}
-	#endif
-	for( ; i < len; i++ )
-		nz += src[i] != 0;
+    int i=0, nz = 0;
+#if CV_SSE2
+    if(USE_SSE2)//5x-6x (presumably the measured speed-up -- TODO confirm)
+    {
+        __m128i pattern = _mm_setzero_si128 ();
+        static uchar tab[256]; // tab[m] = number of zero bits among the 8 bits of m
+        static volatile bool initialized = false; // NOTE(review): lazy init is not synchronized -- confirm concurrent first calls are acceptable
+        if( !initialized )
+        {
+            // we compute inverse popcount table,
+            // since we pass (img[x] == 0) mask as index in the table.
+            for( int j = 0; j < 256; j++ )
+            {
+                int val = 0;
+                for( int mask = 1; mask < 256; mask += mask )
+                    val += (j & mask) == 0;
+                tab[j] = (uchar)val;
+            }
+            initialized = true;
+        }
+        // process 16 bytes per iteration; the scalar loop after the #endif handles the tail
+        for (; i<=len-16; i+=16)
+        {
+            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
+            int val = _mm_movemask_epi8(_mm_cmpeq_epi8(r0, pattern)); // bit b is set when src[i+b] == 0
+            nz += tab[val & 255] + tab[val >> 8]; // zero bits of the mask = non-zero bytes, per 8-bit half
+        }
+    }
+#endif
+    for( ; i < len; i++ )
+        nz += src[i] != 0;
     return nz;
 }
 
-static int countNonZero8u( const uchar* src, int len )
-{ return countNonZero_(src, len); }
-
 static int countNonZero16u( const ushort* src, int len )
 { return countNonZero_(src, len); }
 
diff --git a/modules/features2d/include/opencv2/features2d/features2d.hpp b/modules/features2d/include/opencv2/features2d/features2d.hpp
index d8cd2354a4..a191ca223c 100644
--- a/modules/features2d/include/opencv2/features2d/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d/features2d.hpp
@@ -473,12 +473,18 @@ protected:
 
 //! detects corners using FAST algorithm by E. Rosten
 CV_EXPORTS void FAST( InputArray image, CV_OUT vector<KeyPoint>& keypoints,
-                      int threshold, bool nonmaxSupression=true );
+                      int threshold, bool nonmaxSupression=true, int type = 2 );
 
 class CV_EXPORTS_W FastFeatureDetector : public FeatureDetector
 {
 public:
-    CV_WRAP FastFeatureDetector( int threshold=10, bool nonmaxSuppression=true );
+    enum
+    {
+      TYPE_5_8 = 0, TYPE_7_12 = 1, TYPE_9_16 = 2
+    };
+
+    CV_WRAP FastFeatureDetector( int threshold=10, bool nonmaxSuppression=true);
+    CV_WRAP FastFeatureDetector( int threshold, bool nonmaxSuppression, int type);
     AlgorithmInfo* info() const;
 
 protected:
@@ -486,6 +492,7 @@ protected:
 
     int threshold;
     bool nonmaxSuppression;
+    int type;
 };
 
 
diff --git a/modules/features2d/perf/perf_fast.cpp b/modules/features2d/perf/perf_fast.cpp
index da75f9d3f6..28f0cccb11 100644
--- a/modules/features2d/perf/perf_fast.cpp
+++ b/modules/features2d/perf/perf_fast.cpp
@@ -22,9 +22,13 @@ PERF_TEST_P(fast, detectForORB, testing::Values(FAST_IMAGES))
 
     declare.in(frame);
 
-    FastFeatureDetector fd(20, true);
+    FastFeatureDetector fd(20, true, FastFeatureDetector::TYPE_5_8);
     vector<KeyPoint> points;
 
     TEST_CYCLE() fd.detect(frame, points);
+    fd = FastFeatureDetector(20, true, FastFeatureDetector::TYPE_7_12);
+    TEST_CYCLE() fd.detect(frame, points);
+    fd = FastFeatureDetector(20, true, FastFeatureDetector::TYPE_9_16);
+    TEST_CYCLE() fd.detect(frame, points);
 }
 
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index 9495d35785..f496de3d51 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -46,27 +46,93 @@ The references are:
 namespace cv
 {
 
-static void makeOffsets(int pixel[], int row_stride)
+static void makeOffsets(int pixel[], int row_stride, int patternSize)
 {
-    pixel[0] = 0 + row_stride * 3;
-    pixel[1] = 1 + row_stride * 3;
-    pixel[2] = 2 + row_stride * 2;
-    pixel[3] = 3 + row_stride * 1;
-    pixel[4] = 3 + row_stride * 0;
-    pixel[5] = 3 + row_stride * -1;
-    pixel[6] = 2 + row_stride * -2;
-    pixel[7] = 1 + row_stride * -3;
-    pixel[8] = 0 + row_stride * -3;
-    pixel[9] = -1 + row_stride * -3;
-    pixel[10] = -2 + row_stride * -2;
-    pixel[11] = -3 + row_stride * -1;
-    pixel[12] = -3 + row_stride * 0;
-    pixel[13] = -3 + row_stride * 1;
-    pixel[14] = -2 + row_stride * 2;
-    pixel[15] = -1 + row_stride * 3;
+    switch(patternSize) { // pixel[k] = offset of the k-th ring pixel from the centre, as dx + row_stride * dy
+      case 16: // 16-pixel ring, offsets within +-3
+        pixel[0] = 0 + row_stride * 3;
+        pixel[1] = 1 + row_stride * 3;
+        pixel[2] = 2 + row_stride * 2;
+        pixel[3] = 3 + row_stride * 1;
+        pixel[4] = 3 + row_stride * 0;
+        pixel[5] = 3 + row_stride * -1;
+        pixel[6] = 2 + row_stride * -2;
+        pixel[7] = 1 + row_stride * -3;
+        pixel[8] = 0 + row_stride * -3;
+        pixel[9] = -1 + row_stride * -3;
+        pixel[10] = -2 + row_stride * -2;
+        pixel[11] = -3 + row_stride * -1;
+        pixel[12] = -3 + row_stride * 0;
+        pixel[13] = -3 + row_stride * 1;
+        pixel[14] = -2 + row_stride * 2;
+        pixel[15] = -1 + row_stride * 3;
+        break;
+      case 12: // 12-pixel ring, offsets within +-2
+        pixel[0] = 0 + row_stride * 2;
+        pixel[1] = 1 + row_stride * 2;
+        pixel[2] = 2 + row_stride * 1;
+        pixel[3] = 2 + row_stride * 0;
+        pixel[4] = 2 + row_stride * -1;
+        pixel[5] = 1 + row_stride * -2;
+        pixel[6] = 0 + row_stride * -2;
+        pixel[7] = -1 + row_stride * -2;
+        pixel[8] = -2 + row_stride * -1;
+        pixel[9] = -2 + row_stride * 0;
+        pixel[10] = -2 + row_stride * 1;
+        pixel[11] = -1 + row_stride * 2;
+        break;
+      case 8: // the 8 direct neighbours of the centre pixel
+        pixel[0] = 0 + row_stride * 1;
+        pixel[1] = 1 + row_stride * 1;
+        pixel[2] = 1 + row_stride * 0;
+        pixel[3] = 1 + row_stride * -1;
+        pixel[4] = 0 + row_stride * -1;
+        pixel[5] = -1 + row_stride * -1;
+        pixel[6] = -1 + row_stride * 0; // left neighbour (was the centre pixel itself)
+        pixel[7] = -1 + row_stride * 1; // closes the ring (was a duplicate of pixel[1])
+        break;
+    }
 }
 
-static int cornerScore(const uchar* ptr, const int pixel[], int threshold)
+/*static void testCorner(const uchar* ptr, const int pixel[], int K, int N, int threshold) {
+    // check that with the computed "threshold" the pixel is still a corner
+    // and that with the increased-by-1 "threshold" the pixel is not a corner anymore
+    for( int delta = 0; delta <= 1; delta++ )
+    {
+        int v0 = std::min(ptr[0] + threshold + delta, 255);
+        int v1 = std::max(ptr[0] - threshold - delta, 0);
+        int c0 = 0, c1 = 0;
+
+        for( int k = 0; k < N; k++ )
+        {
+            int x = ptr[pixel[k]];
+            if(x > v0)
+            {
+                if( ++c0 > K )
+                    break;
+                c1 = 0;
+            }
+            else if( x < v1 )
+            {
+                if( ++c1 > K )
+                    break;
+                c0 = 0;
+            }
+            else
+            {
+                c0 = c1 = 0;
+            }
+        }
+        CV_Assert( (delta == 0 && std::max(c0, c1) > K) ||
+                   (delta == 1 && std::max(c0, c1) <= K) );
+    }
+}*/
+
+template<int patternSize>
+int cornerScore(const uchar* ptr, const int pixel[], int threshold);
+
+template<>
+int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
 {
     const int K = 8, N = 16 + K + 1;
     int k, v = ptr[0];
@@ -150,50 +216,170 @@ static int cornerScore(const uchar* ptr, const int pixel[], int threshold)
 #endif
 
 #if 0
-    // check that with the computed "threshold" the pixel is still a corner
-    // and that with the increased-by-1 "threshold" the pixel is not a corner anymore
-    for( int delta = 0; delta <= 1; delta++ )
-    {
-        int v0 = std::min(ptr[0] + threshold + delta, 255);
-        int v1 = std::max(ptr[0] - threshold - delta, 0);
-        int c0 = 0, c1 = 0;
-
-        for( int k = 0; k < N; k++ )
-        {
-            int x = ptr[pixel[k]];
-            if(x > v0)
-            {
-                if( ++c0 > K )
-                    break;
-                c1 = 0;
-            }
-            else if( x < v1 )
-            {
-                if( ++c1 > K )
-                    break;
-                c0 = 0;
-            }
-            else
-            {
-                c0 = c1 = 0;
-            }
-        }
-        CV_Assert( (delta == 0 && std::max(c0, c1) > K) ||
-                   (delta == 1 && std::max(c0, c1) <= K) );
-    }
+    testCorner(ptr, pixel, K, N, threshold);
 #endif
     return threshold;
 }
 
+// Corner score for the 12-pixel ring (TYPE_7_12). pixel[] must hold at least N + 4 ring offsets
+// repeating with period 12 (FAST_t passes 25); testCorner states the property the score satisfies.
+template<>
+int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
+{
+    const int K = 6, N = 12 + K + 1;
+    int k, v = ptr[0];
+    short d[N + 4]; // + 4: the second SSE2 iteration below loads shorts up to d[8 + 7 + 7]
+    for( k = 0; k < N + 4; k++ )
+        d[k] = (short)(v - ptr[pixel[k]]);
 
-void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
+#if CV_SSE2
+    __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
+    for( k = 0; k < 16; k += 8 )
+    {
+        __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
+        __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
+        __m128i a = _mm_min_epi16(v0, v1);
+        __m128i b = _mm_max_epi16(v0, v1);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+3));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+4));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+5));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+6));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k));
+        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
+        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
+        v0 = _mm_loadu_si128((__m128i*)(d+k+7));
+        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
+        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
+    }
+    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
+    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
+    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
+    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
+    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
+#else
+    int a0 = threshold;
+    for( k = 0; k < 12; k += 2 )
+    {
+        int a = std::min((int)d[k+1], (int)d[k+2]);
+        if( a <= a0 )
+            continue;
+        a = std::min(a, (int)d[k+3]);
+        a = std::min(a, (int)d[k+4]);
+        a = std::min(a, (int)d[k+5]);
+        a = std::min(a, (int)d[k+6]);
+        a0 = std::max(a0, std::min(a, (int)d[k]));
+        a0 = std::max(a0, std::min(a, (int)d[k+7]));
+    }
+
+    int b0 = -a0;
+    for( k = 0; k < 12; k += 2 )
+    {
+        int b = std::max((int)d[k+1], (int)d[k+2]);
+        b = std::max(b, (int)d[k+3]);
+        b = std::max(b, (int)d[k+4]);
+        if( b >= b0 )
+            continue;
+        b = std::max(b, (int)d[k+5]);
+        b = std::max(b, (int)d[k+6]);
+        b0 = std::min(b0, std::max(b, (int)d[k]));
+        b0 = std::min(b0, std::max(b, (int)d[k+7]));
+    }
+
+    threshold = -b0-1;
+#endif
+#if 0
+    testCorner(ptr, pixel, K, N, threshold);
+#endif
+    return threshold;
+}
+
+// Corner score for the 8-pixel ring (TYPE_5_8). pixel[] must hold at least N ring offsets
+// repeating with period 8; testCorner states the property the returned score satisfies.
+template<>
+int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
+{
+    const int K = 4, N = 8 + K + 1;
+    int k, v = ptr[0];
+    short d[N];
+    for( k = 0; k < N; k++ )
+        d[k] = (short)(v - ptr[pixel[k]]);
+
+#if CV_SSE2
+    __m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
+    for( k = 0; k < 8; k += 8 ) // one pass: lanes 0..7 are the 8 arc starts, and d[] has only N entries
+    {
+        __m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
+        __m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
+        __m128i a = _mm_min_epi16(v0, v1);
+        __m128i b = _mm_max_epi16(v0, v1);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+3));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k+4));
+        a = _mm_min_epi16(a, v0);
+        b = _mm_max_epi16(b, v0);
+        v0 = _mm_loadu_si128((__m128i*)(d+k));
+        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
+        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
+        v0 = _mm_loadu_si128((__m128i*)(d+k+5));
+        q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
+        q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
+    }
+    q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
+    q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
+    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
+    q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
+    threshold = (short)_mm_cvtsi128_si32(q0) - 1;
+#else
+    int a0 = threshold;
+    for( k = 0; k < 8; k += 2 )
+    {
+        int a = std::min((int)d[k+1], (int)d[k+2]);
+        if( a <= a0 )
+            continue;
+        a = std::min(a, (int)d[k+3]);
+        a = std::min(a, (int)d[k+4]);
+        a0 = std::max(a0, std::min(a, (int)d[k]));
+        a0 = std::max(a0, std::min(a, (int)d[k+5]));
+    }
+
+    int b0 = -a0;
+    for( k = 0; k < 8; k += 2 ) // same bound as the loop above; d[k+5] must stay below N
+    {
+        int b = std::max((int)d[k+1], (int)d[k+2]);
+        b = std::max(b, (int)d[k+3]);
+        if( b >= b0 )
+            continue;
+        b = std::max(b, (int)d[k+4]);
+        b0 = std::min(b0, std::max(b, (int)d[k]));
+        b0 = std::min(b0, std::max(b, (int)d[k+5]));
+    }
+
+    threshold = -b0-1;
+#endif
+#if 0
+    testCorner(ptr, pixel, K, N, threshold);
+#endif
+    return threshold;
+}
+
+template<int patternSize>
+void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression)
 {
     Mat img = _img.getMat();
-    const int K = 8, N = 16 + K + 1;
-    int i, j, k, pixel[N];
-    makeOffsets(pixel, (int)img.step);
-    for(k = 16; k < N; k++)
-        pixel[k] = pixel[k - 16];
+    const int K = patternSize/2, N = patternSize + K + 1, quarterPatternSize = patternSize/4;
+    int i, j, k, pixel[25];
+    makeOffsets(pixel, (int)img.step, patternSize);
+    for(k = patternSize; k < 25; k++)
+        pixel[k] = pixel[k - patternSize];
 
     keypoints.clear();
 
@@ -235,9 +421,9 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
                 v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);
 
                 __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
-                __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[4])), delta);
-                __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[8])), delta);
-                __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[12])), delta);
+                __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
+                __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
+                __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
                 m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
                 m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
                 m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
@@ -279,7 +465,7 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
                     {
                         cornerpos[ncorners++] = j+k;
                         if(nonmax_suppression)
-                            curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold);
+                            curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
                     }
             }
     #endif
@@ -317,7 +503,7 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
                             {
                                 cornerpos[ncorners++] = j;
                                 if(nonmax_suppression)
-                                    curr[j] = (uchar)cornerScore(ptr, pixel, threshold);
+                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                 break;
                             }
                         }
@@ -339,7 +525,7 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
                             {
                                 cornerpos[ncorners++] = j;
                                 if(nonmax_suppression)
-                                    curr[j] = (uchar)cornerScore(ptr, pixel, threshold);
+                                    curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold);
                                 break;
                             }
                         }
@@ -375,19 +561,36 @@ void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool
     }
 }
 
-
+void FAST(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression, int type)
+{
+  // Dispatch to the FAST_t instantiation whose ring size matches the requested
+  // FastFeatureDetector::TYPE_* value. Any other value is rejected with an error
+  // instead of silently returning with keypoints left untouched.
+  if( type == FastFeatureDetector::TYPE_5_8 )
+    FAST_t<8>(_img, keypoints, threshold, nonmax_suppression);
+  else if( type == FastFeatureDetector::TYPE_7_12 )
+    FAST_t<12>(_img, keypoints, threshold, nonmax_suppression);
+  else if( type == FastFeatureDetector::TYPE_9_16 )
+    FAST_t<16>(_img, keypoints, threshold, nonmax_suppression);
+  else
+    CV_Error(CV_StsBadArg, "Unknown FAST detector type");
+}
 /*
  *   FastFeatureDetector
  */
 FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppression )
-: threshold(_threshold), nonmaxSuppression(_nonmaxSuppression)
+    : threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type(FastFeatureDetector::TYPE_9_16)
 {}
 
+// Constructor that also stores the pattern type, which detectImpl forwards to FAST().
+FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppression, int _type )
+: threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type(_type)
+{}
 void FastFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask ) const
 {
     Mat grayImage = image;
     if( image.type() != CV_8U ) cvtColor( image, grayImage, CV_BGR2GRAY );
-    FAST( grayImage, keypoints, threshold, nonmaxSuppression );
+    FAST( grayImage, keypoints, threshold, nonmaxSuppression, type );
     KeyPointsFilter::runByPixelsMask( keypoints, mask );
 }
 
diff --git a/modules/features2d/src/features2d_init.cpp b/modules/features2d/src/features2d_init.cpp
index 0d884ef678..6ecffebd40 100644
--- a/modules/features2d/src/features2d_init.cpp
+++ b/modules/features2d/src/features2d_init.cpp
@@ -58,7 +58,8 @@ CV_INIT_ALGORITHM(BriefDescriptorExtractor, "Feature2D.BRIEF",
 
 CV_INIT_ALGORITHM(FastFeatureDetector, "Feature2D.FAST",
                   obj.info()->addParam(obj, "threshold", obj.threshold);
-                  obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression));
+                  obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression);
+                  obj.info()->addParam(obj, "type", obj.type)); // no 4th argument: that slot is readOnly, not a default value
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/modules/features2d/test/test_fast.cpp b/modules/features2d/test/test_fast.cpp
index d991a29057..671e66d5f4 100644
--- a/modules/features2d/test/test_fast.cpp
+++ b/modules/features2d/test/test_fast.cpp
@@ -58,6 +58,7 @@ CV_FastTest::~CV_FastTest() {}
 
 void CV_FastTest::run( int )
 {
+  for(int type=0; type <= 2; ++type) {
     Mat image1 = imread(string(ts->get_data_path()) + "inpaint/orig.jpg");
     Mat image2 = imread(string(ts->get_data_path()) + "cameracalibration/chess9.jpg");
     string xml = string(ts->get_data_path()) + "fast/result.xml";
@@ -74,8 +75,8 @@ void CV_FastTest::run( int )
 
     vector<KeyPoint> keypoints1;
     vector<KeyPoint> keypoints2;
-    FAST(gray1, keypoints1, 30);
-    FAST(gray2, keypoints2, 30);
+    FAST(gray1, keypoints1, 30, true, type);
+    FAST(gray2, keypoints2, 30, true, type);
 
     for(size_t i = 0; i < keypoints1.size(); ++i)
     {
@@ -109,17 +110,21 @@ void CV_FastTest::run( int )
     read( fs["exp_kps2"], exp_kps2, Mat() );
     fs.release();
 
+    // Reference data exists for TYPE_9_16 only; the other pattern types are run but not compared.
+    if (type == FastFeatureDetector::TYPE_9_16){
     if ( 0 != norm(exp_kps1, kps1, NORM_L2) || 0 != norm(exp_kps2, kps2, NORM_L2))
     {
         ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
         return;
     }
+    }
 
- /*   cv::namedWindow("Img1"); cv::imshow("Img1", image1);
+    /*cv::namedWindow("Img1"); cv::imshow("Img1", image1);
     cv::namedWindow("Img2"); cv::imshow("Img2", image2);
     cv::waitKey(0);*/
+  }
 
-    ts->set_failed_test_info(cvtest::TS::OK);
+  ts->set_failed_test_info(cvtest::TS::OK);
 }
 
 TEST(Features2d_FAST, regression) { CV_FastTest test; test.safe_run(); }
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 515a4a2751..f6d869435c 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -622,6 +622,9 @@ CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0,
 //!            channel order.
 CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null());
 
+//! Routines for correcting image color gamma
+CV_EXPORTS void gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward = true, Stream& stream = Stream::Null());
+
 //! applies fixed threshold to the image
 CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
 
@@ -1411,7 +1414,7 @@ public:
 };
 
 ////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
-// The cascade classifier class for object detection: supports old haar and new lbp xlm formats and nvbin for haar cascades olny.
+// The cascade classifier class for object detection: supports old haar and new lbp xml formats and nvbin for haar cascades only.
 class CV_EXPORTS CascadeClassifier_GPU
 {
 public:
@@ -1421,28 +1424,28 @@ public:
 
     bool empty() const;
     bool load(const std::string& filename);
-    void release();
-
-    /* returns number of detected objects */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
-
-    bool findLargestObject;
-    bool visualizeInPlace;
-
-    Size getClassifierSize() const;
-
-private:
-    struct CascadeClassifierImpl;
-    CascadeClassifierImpl* impl;
-    struct HaarCascade;
-    struct LbpCascade;
-    friend class CascadeClassifier_GPU_LBP;
-
-public:
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
-};
-
-////////////////////////////////// SURF //////////////////////////////////////////
+    void release();
+
+    /* returns number of detected objects */
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
+
+    bool findLargestObject;
+    bool visualizeInPlace;
+
+    Size getClassifierSize() const;
+
+private:
+    struct CascadeClassifierImpl;
+    CascadeClassifierImpl* impl;
+    struct HaarCascade;
+    struct LbpCascade;
+    friend class CascadeClassifier_GPU_LBP;
+
+public:
+    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
+};
+
+////////////////////////////////// SURF //////////////////////////////////////////
 
 class CV_EXPORTS SURF_GPU
 {
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 61f5c94310..1f277f0cf7 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -1,51 +1,51 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other GpuMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or bpied warranties, including, but not limited to, the bpied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <vector>
-#include <iostream>
-
-using namespace cv;
-using namespace cv::gpu;
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <vector>
+#include <iostream>
+
+using namespace cv;
+using namespace cv::gpu;
 using namespace std;
 
 #if !defined (HAVE_CUDA)
@@ -94,219 +94,221 @@ public:
                       /*out*/unsigned int& numDetections)
     {
         calculateMemReqsAndAllocate(src.size());
-
-        NCVMemPtr src_beg;
-        src_beg.ptr = (void*)src.ptr<Ncv8u>();
-        src_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment src_seg;
-        src_seg.begin = src_beg;
-        src_seg.size  = src.step * src.rows;
-
-        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
-        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        CV_Assert(objects.rows == 1);
-
-        NCVMemPtr objects_beg;
-        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
-        objects_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment objects_seg;
-        objects_seg.begin = objects_beg;
-        objects_seg.size = objects.step * objects.rows;
-        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
-        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        NcvSize32u roi;
-        roi.width = d_src.width();
-        roi.height = d_src.height();
-
+
+        NCVMemPtr src_beg;
+        src_beg.ptr = (void*)src.ptr<Ncv8u>();
+        src_beg.memtype = NCVMemoryTypeDevice;
+
+        NCVMemSegment src_seg;
+        src_seg.begin = src_beg;
+        src_seg.size  = src.step * src.rows;
+
+        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
+        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
+
+        CV_Assert(objects.rows == 1);
+
+        NCVMemPtr objects_beg;
+        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
+        objects_beg.memtype = NCVMemoryTypeDevice;
+
+        NCVMemSegment objects_seg;
+        objects_seg.begin = objects_beg;
+        objects_seg.size = objects.step * objects.rows;
+        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
+        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
+
+        NcvSize32u roi;
+        roi.width = d_src.width();
+        roi.height = d_src.height();
+
         NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
 
-        Ncv32u flags = 0;
-        flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
-        flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace  : 0;
-
-        ncvStat = ncvDetectObjectsMultiScale_device(
-            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures,
+        Ncv32u flags = 0;
+        flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
+        flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace  : 0;
+
+        ncvStat = ncvDetectObjectsMultiScale_device(
+            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
+            *d_haarStages, *d_haarNodes, *d_haarFeatures,
             winMinSize,
-            minNeighbors,
-            scaleStep, 1,
-            flags,
-            *gpuAllocator, *cpuAllocator, devProp, 0);
-        ncvAssertReturnNcvStat(ncvStat);
-        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-        return NCV_SUCCESS;
-    }
-
+            minNeighbors,
+            scaleStep, 1,
+            flags,
+            *gpuAllocator, *cpuAllocator, devProp, 0);
+        ncvAssertReturnNcvStat(ncvStat);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+        return NCV_SUCCESS;
+    }
+
     unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
                       bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size maxObjectSize)
     {
         CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
-
+
         const int defaultObjSearchNum = 100;
         if (objectsBuf.empty())
         {
             objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
         }
-
+
         cv::Size ncvMinSize = this->getClassifierCvSize();
-
+
         if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height)
         {
             ncvMinSize.width = minSize.width;
             ncvMinSize.height = minSize.height;
         }
-
+
         unsigned int numDetections;
         ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
-
+
         return numDetections;
     }
 
     cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
-
+
 private:
     static void NCVDebugOutputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError, msg.c_str()); }
-
-    NCVStatus load(const string& classifierFile)
-    {
-        int devId = cv::gpu::getDevice();
-        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
-
-        // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
-        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast<int>(devProp.textureAlignment));
-        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast<int>(devProp.textureAlignment));
-
-        ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
-
-        Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
-        ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
-        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);
-
-        h_haarStages   = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);
-        h_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);
-        h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);
-
-        ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
-
-        ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);
-        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);
-
-        d_haarStages   = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);
-        d_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);
-        d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);
-
-        ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
-
-        ncvStat = h_haarStages->copySolid(*d_haarStages, 0);
-        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
-        ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);
-        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
-        ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);
-        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
-
-        return NCV_SUCCESS;
-    }
-
-    NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
-    {
-        if (lastAllocatedFrameSize == frameSize)
-        {
-            return NCV_SUCCESS;
-        }
-
-        // Calculate memory requirements and create real allocators
-        NCVMemStackAllocator gpuCounter(static_cast<int>(devProp.textureAlignment));
-        NCVMemStackAllocator cpuCounter(static_cast<int>(devProp.textureAlignment));
-
-        ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
-
-        NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);
-        NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);
-
-        ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-        ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-
-        NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100);
-        ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-
-        NcvSize32u roi;
-        roi.width = d_src.width();
-        roi.height = d_src.height();
-        Ncv32u numDetections;
-        ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
-
-        ncvAssertReturnNcvStat(ncvStat);
-        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-        gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
-        cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
-
-        ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
-        ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
-        return NCV_SUCCESS;
-    }
-
-    cudaDeviceProp devProp;
-    NCVStatus ncvStat;
-
-    Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
-    Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
-
-    Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
-
-    HaarClassifierCascadeDescriptor haar;
-
-    Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
-
-    Size lastAllocatedFrameSize;
-
-    Ptr<NCVMemStackAllocator> gpuAllocator;
-    Ptr<NCVMemStackAllocator> cpuAllocator;
+    // Loads the cascade in classifierFile into pinned-host vectors, then copies them into device vectors.
+    NCVStatus load(const string& classifierFile)
+    {
+        int devId = cv::gpu::getDevice();
+        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
+
+        // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
+        gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast<int>(devProp.textureAlignment));
+        cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast<int>(devProp.textureAlignment));
+
+        ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
+        // Query the stage/node/feature counts first; they size the host and device vectors allocated below.
+        Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
+        ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);
+
+        h_haarStages   = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);
+        h_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);
+        h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);
+
+        ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
+
+        ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);
+
+        d_haarStages   = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);
+        d_haarNodes    = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);
+        d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);
+
+        ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
+        // Copy each host vector into its device counterpart (the trailing 0 is presumably the CUDA stream - confirm).
+        ncvStat = h_haarStages->copySolid(*d_haarStages, 0);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
+        ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
+        ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
+
+        return NCV_SUCCESS;
+    }
+    // Builds gpuAllocator/cpuAllocator for frameSize; returns at once when frameSize equals lastAllocatedFrameSize.
+    NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
+    {
+        if (lastAllocatedFrameSize == frameSize)
+        {
+            return NCV_SUCCESS;
+        }
+
+        // Calculate memory requirements and create real allocators
+        NCVMemStackAllocator gpuCounter(static_cast<int>(devProp.textureAlignment));
+        NCVMemStackAllocator cpuCounter(static_cast<int>(devProp.textureAlignment));
+
+        ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
+
+        NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);
+        NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);
+
+        ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+        ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+
+        NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100); // 100 == defaultObjSearchNum in process()
+        ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+        // Run the detector once against the counters; their maxSize() sizes the real allocators below.
+        NcvSize32u roi;
+        roi.width = d_src.width();
+        roi.height = d_src.height();
+        Ncv32u numDetections;
+        ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
+            *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0); // 4, 1.2f and 0 sit in the minNeighbors, scaleStep and flags slots
+
+        ncvAssertReturnNcvStat(ncvStat);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+        gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
+        cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
+
+        ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
+        ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
+        // Record the size last so the early-out above only fires once both allocators exist (assumes the ncvAssert*Return macros return on failure).
+        lastAllocatedFrameSize = frameSize;
+        return NCV_SUCCESS;
+    }
+
+    cudaDeviceProp devProp;
+    NCVStatus ncvStat;
+
+    Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
+    Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
+
+    Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
+    Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
+    Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
+
+    HaarClassifierCascadeDescriptor haar;
+
+    Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
+    Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
+    Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
+
+    Size lastAllocatedFrameSize;
+
+    Ptr<NCVMemStackAllocator> gpuAllocator;
+    Ptr<NCVMemStackAllocator> cpuAllocator;
 
     virtual ~HaarCascade(){}
-};
-
+};
+
 cv::Size operator -(const cv::Size& a, const cv::Size& b)
 {
     return cv::Size(a.width - b.width, a.height - b.height);
 }
-
+
 cv::Size operator +(const cv::Size& a, const int& i)
 {
     return cv::Size(a.width + i, a.height + i);
 }
-
+
 cv::Size operator *(const cv::Size& a, const float& f)
 {
     return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
 }
-
+
 cv::Size operator /(const cv::Size& a, const float& f)
-{
+{
     return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
 }
 
 bool operator <=(const cv::Size& a, const cv::Size& b)
 {
-    return a.width <= b.width && a.height <= b.width;
+    return a.width <= b.width && a.height <= b.height; // true only when a fits within b in both dimensions
-}
-
+}
+
 struct PyrLavel
 {
     PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
@@ -669,18 +671,18 @@ cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }
 void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }
 
 bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }
-
-Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
-{
-    return this->empty() ? Size() : impl->getClassifierCvSize();
-}
-
-int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
-{
-    CV_Assert( !this->empty());
+// Returns impl's classifier size, or a default-constructed Size when no cascade is loaded (impl == 0).
+Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
+{
+    return this->empty() ? Size() : impl->getClassifierCvSize();
+}
+
+int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
+{
+    CV_Assert( !this->empty());
     return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
 }
-
+
 int cv::gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
 {
     CV_Assert( !this->empty());
@@ -695,261 +697,261 @@ bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)
     std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);
 
     if (fext == "nvbin")
-    {
+    {
         impl = new HaarCascade();
         return impl->read(filename);
-    }
-
+    }
+
     FileStorage fs(filename, FileStorage::READ);
-
+
     if (!fs.isOpened())
-    {
+    {
         impl = new HaarCascade();
         return impl->read(filename);
-    }
-
+    }
+
     const char *GPU_CC_LBP = "LBP";
     string featureTypeStr = (string)fs.getFirstTopLevelNode()["featureType"];
     if (featureTypeStr == GPU_CC_LBP)
         impl = new LbpCascade();
     else
         impl = new HaarCascade();
-
+
     impl->read(filename);
     return !this->empty();
-}
-
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct RectConvert
-{
-    Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }
-    NcvRect32u operator()(const Rect& nr) const
-    {
-        NcvRect32u rect;
-        rect.x = nr.x;
-        rect.y = nr.y;
-        rect.width = nr.width;
-        rect.height = nr.height;
-        return rect;
-    }
-};
-
-void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
-{
-    vector<Rect> rects(hypotheses.size());
-    std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
-
-    if (weights)
-    {
-        vector<int> weights_int;
-        weights_int.assign(weights->begin(), weights->end());
-        cv::groupRectangles(rects, weights_int, groupThreshold, eps);
-    }
-    else
-    {
-        cv::groupRectangles(rects, groupThreshold, eps);
-    }
-    std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());
-    hypotheses.resize(rects.size());
-}
-
-NCVStatus loadFromXML(const std::string &filename,
-                      HaarClassifierCascadeDescriptor &haar,
-                      std::vector<HaarStage64> &haarStages,
-                      std::vector<HaarClassifierNode128> &haarClassifierNodes,
-                      std::vector<HaarFeature64> &haarFeatures)
-{
-    NCVStatus ncvStat;
-
-    haar.NumStages = 0;
-    haar.NumClassifierRootNodes = 0;
-    haar.NumClassifierTotalNodes = 0;
-    haar.NumFeatures = 0;
-    haar.ClassifierSize.width = 0;
-    haar.ClassifierSize.height = 0;
-    haar.bHasStumpsOnly = true;
-    haar.bNeedsTiltedII = false;
-    Ncv32u curMaxTreeDepth;
-
-    std::vector<char> xmlFileCont;
-
-    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
-    haarStages.resize(0);
-    haarClassifierNodes.resize(0);
-    haarFeatures.resize(0);
-
-    Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
-    if (oldCascade.empty())
-    {
-        return NCV_HAAR_XML_LOADING_EXCEPTION;
-    }
-
-    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
-    haar.ClassifierSize.height = oldCascade->orig_window_size.height;
-
-    int stagesCound = oldCascade->count;
-    for(int s = 0; s < stagesCound; ++s) // by stages
-    {
-        HaarStage64 curStage;
-        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
-
-        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
-
-        int treesCount = oldCascade->stage_classifier[s].count;
-        for(int t = 0; t < treesCount; ++t) // by trees
-        {
-            Ncv32u nodeId = 0;
-            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
-
-            int nodesCount = tree->count;
-            for(int n = 0; n < nodesCount; ++n)  //by features
-            {
-                CvHaarFeature* feature = &tree->haar_feature[n];
-
-                HaarClassifierNode128 curNode;
-                curNode.setThreshold(tree->threshold[n]);
-
-                NcvBool bIsLeftNodeLeaf = false;
-                NcvBool bIsRightNodeLeaf = false;
-
-                HaarClassifierNodeDescriptor32 nodeLeft;
-                if ( tree->left[n] <= 0 )
-                {
-                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
-                    ncvStat = nodeLeft.create(leftVal);
-                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
-                    bIsLeftNodeLeaf = true;
-                }
-                else
-                {
-                    Ncv32u leftNodeOffset = tree->left[n];
-                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
-                    haar.bHasStumpsOnly = false;
-                }
-                curNode.setLeftNodeDesc(nodeLeft);
-
-                HaarClassifierNodeDescriptor32 nodeRight;
-                if ( tree->right[n] <= 0 )
-                {
-                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
-                    ncvStat = nodeRight.create(rightVal);
-                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
-                    bIsRightNodeLeaf = true;
-                }
-                else
-                {
-                    Ncv32u rightNodeOffset = tree->right[n];
-                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
-                    haar.bHasStumpsOnly = false;
-                }
-                curNode.setRightNodeDesc(nodeRight);
-
-                Ncv32u tiltedVal = feature->tilted;
-                haar.bNeedsTiltedII = (tiltedVal != 0);
-
-                Ncv32u featureId = 0;
-                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
-                {
-                    Ncv32u rectX = feature->rect[l].r.x;
-                    Ncv32u rectY = feature->rect[l].r.y;
-                    Ncv32u rectWidth = feature->rect[l].r.width;
-                    Ncv32u rectHeight = feature->rect[l].r.height;
-
-                    Ncv32f rectWeight = feature->rect[l].weight;
-
-                    if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
-                        break;
-
-                    HaarFeature64 curFeature;
-                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
-                    curFeature.setWeight(rectWeight);
-                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
-                    haarFeatures.push_back(curFeature);
-
-                    featureId++;
-                }
-
-                HaarFeatureDescriptor32 tmpFeatureDesc;
-                ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
-                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
-                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
-                curNode.setFeatureDesc(tmpFeatureDesc);
-
-                if (!nodeId)
-                {
-                    //root node
-                    haarClassifierNodes.push_back(curNode);
-                    curMaxTreeDepth = 1;
-                }
-                else
-                {
-                    //other node
-                    h_TmpClassifierNotRootNodes.push_back(curNode);
-                    curMaxTreeDepth++;
-                }
-
-                nodeId++;
-            }
-        }
-
-        curStage.setNumClassifierRootNodes(treesCount);
-        haarStages.push_back(curStage);
-    }
-
-    //fill in cascade stats
-    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
-    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
-    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
-    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
-
-    //merge root and leaf nodes in one classifiers array
-    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
-    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
-    {
-        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
-
-        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
-        if (!featureDesc.isLeftNodeLeaf())
-        {
-            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
-            nodeLeft.create(newOffset);
-        }
-        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
-
-        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
-        if (!featureDesc.isRightNodeLeaf())
-        {
-            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
-            nodeRight.create(newOffset);
-        }
-        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
-    }
-
-    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
-    {
-        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();
-
-        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
-        if (!featureDesc.isLeftNodeLeaf())
-        {
-            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
-            nodeLeft.create(newOffset);
-        }
-        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
-
-        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
-        if (!featureDesc.isRightNodeLeaf())
-        {
-            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
-            nodeRight.create(newOffset);
-        }
-        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
-
-        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
-    }
-
-    return NCV_SUCCESS;
-}
-
-#endif /* HAVE_CUDA */
+// Functor converting NcvRect32u <-> cv::Rect by copying x, y, width and height.
+struct RectConvert
+{
+    Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }
+    NcvRect32u operator()(const Rect& nr) const
+    {
+        NcvRect32u rect;
+        rect.x = nr.x;
+        rect.y = nr.y;
+        rect.width = nr.width;
+        rect.height = nr.height;
+        return rect;
+    }
+};
+// Groups hypotheses in place via cv::groupRectangles; weights, when non-null, is copied into a local vector and never written back.
+void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
+{
+    vector<Rect> rects(hypotheses.size());
+    std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
+
+    if (weights)
+    {
+        vector<int> weights_int;
+        weights_int.assign(weights->begin(), weights->end());
+        cv::groupRectangles(rects, weights_int, groupThreshold, eps);
+    }
+    else
+    {
+        cv::groupRectangles(rects, groupThreshold, eps);
+    }
+    std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert()); // NOTE(review): writes rects.size() elements before the resize, so this assumes grouping never returns more rects than it was given
+    hypotheses.resize(rects.size());
+}
+
+NCVStatus loadFromXML(const std::string &filename,                              // cascade file handed to cvLoad()
+                      HaarClassifierCascadeDescriptor &haar,                    // [out] cascade-wide statistics and flags
+                      std::vector<HaarStage64> &haarStages,                     // [out] one entry per stage
+                      std::vector<HaarClassifierNode128> &haarClassifierNodes,  // [out] root nodes first, then inner nodes
+                      std::vector<HaarFeature64> &haarFeatures)                 // [out] weighted rectangles of all nodes
+{
+    // Reads an old-format (CvHaarClassifierCascade) cascade via cvLoad() and
+    // flattens it into the NCV stage / classifier-node / feature arrays.
+    // Returns NCV_HAAR_XML_LOADING_EXCEPTION when the file cannot be loaded.
+    NCVStatus ncvStat;
+
+    haar.NumStages = 0;
+    haar.NumClassifierRootNodes = 0;
+    haar.NumClassifierTotalNodes = 0;
+    haar.NumFeatures = 0;
+    haar.ClassifierSize.width = 0;
+    haar.ClassifierSize.height = 0;
+    haar.bHasStumpsOnly = true;
+    haar.bNeedsTiltedII = false;
+
+    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
+    haarStages.resize(0);
+    haarClassifierNodes.resize(0);
+    haarFeatures.resize(0);
+
+    Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
+    if (oldCascade.empty())
+    {
+        return NCV_HAAR_XML_LOADING_EXCEPTION;
+    }
+
+    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
+    haar.ClassifierSize.height = oldCascade->orig_window_size.height;
+
+    int stagesCount = oldCascade->count;
+    for(int s = 0; s < stagesCount; ++s) // by stages
+    {
+        HaarStage64 curStage;
+        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
+
+        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
+
+        int treesCount = oldCascade->stage_classifier[s].count;
+        for(int t = 0; t < treesCount; ++t) // by trees
+        {
+            Ncv32u nodeId = 0;
+            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
+
+            int nodesCount = tree->count;
+            for(int n = 0; n < nodesCount; ++n)  //by features
+            {
+                CvHaarFeature* feature = &tree->haar_feature[n];
+
+                HaarClassifierNode128 curNode;
+                curNode.setThreshold(tree->threshold[n]);
+
+                NcvBool bIsLeftNodeLeaf = false;
+                NcvBool bIsRightNodeLeaf = false;
+
+                HaarClassifierNodeDescriptor32 nodeLeft;
+                if ( tree->left[n] <= 0 )
+                {
+                    // non-positive child index: a leaf whose value is alpha[-index]
+                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
+                    ncvStat = nodeLeft.create(leftVal);
+                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
+                    bIsLeftNodeLeaf = true;
+                }
+                else
+                {
+                    Ncv32u leftNodeOffset = tree->left[n];
+                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
+                    haar.bHasStumpsOnly = false;
+                }
+                curNode.setLeftNodeDesc(nodeLeft);
+
+                HaarClassifierNodeDescriptor32 nodeRight;
+                if ( tree->right[n] <= 0 )
+                {
+                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
+                    ncvStat = nodeRight.create(rightVal);
+                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
+                    bIsRightNodeLeaf = true;
+                }
+                else
+                {
+                    Ncv32u rightNodeOffset = tree->right[n];
+                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
+                    haar.bHasStumpsOnly = false;
+                }
+                curNode.setRightNodeDesc(nodeRight);
+
+                const NcvBool bIsTilted = (feature->tilted != 0);          // tilt flag of this node's feature only
+                haar.bNeedsTiltedII = haar.bNeedsTiltedII || bIsTilted;    // sticky: stays set once any feature is tilted
+
+                Ncv32u featureId = 0;
+                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
+                {
+                    Ncv32u rectX = feature->rect[l].r.x;
+                    Ncv32u rectY = feature->rect[l].r.y;
+                    Ncv32u rectWidth = feature->rect[l].r.width;
+                    Ncv32u rectHeight = feature->rect[l].r.height;
+
+                    Ncv32f rectWeight = feature->rect[l].weight;
+
+                    if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
+                        break;
+
+                    HaarFeature64 curFeature;
+                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
+                    curFeature.setWeight(rectWeight);
+                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
+                    haarFeatures.push_back(curFeature);
+
+                    featureId++;
+                }
+
+                HaarFeatureDescriptor32 tmpFeatureDesc;
+                ncvStat = tmpFeatureDesc.create(bIsTilted, bIsLeftNodeLeaf, bIsRightNodeLeaf,
+                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
+                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
+                curNode.setFeatureDesc(tmpFeatureDesc);
+
+                if (!nodeId)
+                {
+                    // The first node of every tree is its root; roots are stored
+                    // contiguously so that a stage can address them by offset.
+                    haarClassifierNodes.push_back(curNode);
+                }
+                else
+                {
+                    // Inner nodes are collected separately and appended after all
+                    // roots once the whole cascade has been read.
+                    h_TmpClassifierNotRootNodes.push_back(curNode);
+                }
+
+                nodeId++;
+            }
+        }
+
+        curStage.setNumClassifierRootNodes(treesCount);
+        haarStages.push_back(curStage);
+    }
+
+    //fill in cascade stats
+    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
+    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
+    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
+    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
+
+    //merge root and non-root nodes into one array; child offsets are shifted past the root section
+    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
+    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
+    {
+        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
+
+        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
+        if (!featureDesc.isLeftNodeLeaf())
+        {
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
+            nodeLeft.create(newOffset);
+        }
+        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
+
+        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
+        if (!featureDesc.isRightNodeLeaf())
+        {
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
+            nodeRight.create(newOffset);
+        }
+        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
+    }
+
+    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
+    {
+        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();
+
+        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
+        if (!featureDesc.isLeftNodeLeaf())
+        {
+            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
+            nodeLeft.create(newOffset);
+        }
+        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
+
+        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
+        if (!featureDesc.isRightNodeLeaf())
+        {
+            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
+            nodeRight.create(newOffset);
+        }
+        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
+
+        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
+    }
+
+    return NCV_SUCCESS;
+}
+
+#endif /* HAVE_CUDA */
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp
index a47758c873..543227aeb3 100644
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -49,6 +49,7 @@ using namespace cv::gpu;
 
 void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
 void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
+void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } // stub for builds without CUDA
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -1142,6 +1143,165 @@ namespace
 
         funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
     }
+
+    void bgr_to_lab(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // BGR -> Lab for 8-bit, 3-channel images through nppiBGRToLab_8u_C3R.
+        // Builds against CUDA older than 5.0 raise CV_StsBadFlag instead.
+        #if (CUDA_VERSION < 5000)
+            (void)src;
+            (void)dst;
+            (void)dcn;
+            (void)stream;
+            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+        #else
+            CV_Assert(src.depth() == CV_8U);
+            CV_Assert(src.channels() == 3);
+
+            // The dcn argument is overwritten: dst always gets src's channel count.
+            dcn = src.channels();
+            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+            NppStreamHandler h(StreamAccessor::getStream(stream));
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+            nppSafeCall( nppiBGRToLab_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        #endif
+    }
+
+    void rgb_to_lab(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        bgr_to_rgb(src, dst, -1, stream);   // presumably swaps the channel order into dst first
+        bgr_to_lab(dst, dst, -1, stream);   // then converts dst with itself as source and destination
+    }
+
+    void lab_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // Lab -> BGR for 8-bit, 3-channel images through nppiLabToBGR_8u_C3R.
+        // Builds against CUDA older than 5.0 raise CV_StsBadFlag instead.
+        #if (CUDA_VERSION < 5000)
+            (void)src;
+            (void)dst;
+            (void)dcn;
+            (void)stream;
+            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+        #else
+            CV_Assert(src.depth() == CV_8U);
+            CV_Assert(src.channels() == 3);
+
+            // The dcn argument is overwritten: dst always gets src's channel count.
+            dcn = src.channels();
+            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+            NppStreamHandler h(StreamAccessor::getStream(stream));
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+            nppSafeCall( nppiLabToBGR_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        #endif
+    }
+
+    void lab_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        lab_to_bgr(src, dst, -1, stream);   // Lab -> BGR into dst
+        bgr_to_rgb(dst, dst, -1, stream);   // presumably swaps the channel order of dst in place
+    }
+
+    void rgb_to_luv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // RGB -> Luv for 8-bit images with 3 channels (C3R) or 4 channels (AC4R) via NPP.
+        // Builds against CUDA older than 5.0 raise CV_StsBadFlag instead.
+        #if (CUDA_VERSION < 5000)
+            (void)src;
+            (void)dst;
+            (void)dcn;
+            (void)stream;
+            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+        #else
+            CV_Assert(src.depth() == CV_8U);
+            CV_Assert(src.channels() == 3 || src.channels() == 4);
+
+            // The dcn argument is overwritten: dst always gets src's channel count.
+            dcn = src.channels();
+            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+            NppStreamHandler h(StreamAccessor::getStream(stream));
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+            if (dcn == 3)
+                nppSafeCall( nppiRGBToLUV_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+            else
+                nppSafeCall( nppiRGBToLUV_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        #endif
+    }
+
+    void bgr_to_luv(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        bgr_to_rgb(src, dst, -1, stream);   // presumably swaps the channel order into dst first
+        rgb_to_luv(dst, dst, -1, stream);   // then converts dst with itself as source and destination
+    }
+
+    void luv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // Luv -> RGB for 8-bit images with 3 channels (C3R) or 4 channels (AC4R) via NPP.
+        // Builds against CUDA older than 5.0 raise CV_StsBadFlag instead.
+        #if (CUDA_VERSION < 5000)
+            (void)src;
+            (void)dst;
+            (void)dcn;
+            (void)stream;
+            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+        #else
+            CV_Assert(src.depth() == CV_8U);
+            CV_Assert(src.channels() == 3 || src.channels() == 4);
+
+            // The dcn argument is overwritten: dst always gets src's channel count.
+            dcn = src.channels();
+            dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+            NppStreamHandler h(StreamAccessor::getStream(stream));
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+            if (dcn == 3)
+                nppSafeCall( nppiLUVToRGB_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+            else
+                nppSafeCall( nppiLUVToRGB_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        #endif
+    }
+
+    void luv_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+        luv_to_rgb(src, dst, -1, stream);   // Luv -> RGB into dst
+        bgr_to_rgb(dst, dst, -1, stream);   // presumably swaps the channel order of dst in place
+    }
+
+    void rgba_to_mbgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    {
+    // Runs NPP's nppiAlphaPremul_*_AC4R on an 8- or 16-bit 4-channel image; dst gets src's size and type.
+    // Builds against CUDA older than 5.0 raise CV_StsBadFlag instead.
+    #if (CUDA_VERSION < 5000)
+        (void)src;
+        (void)dst;
+        (void)stream;
+        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+    #else
+        CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4);
+
+        dst.create(src.size(), src.type());
+
+        NppStreamHandler h(StreamAccessor::getStream(stream));
+        NppiSize oSizeROI;
+        oSizeROI.width = src.cols;
+        oSizeROI.height = src.rows;
+        if (src.depth() == CV_8U)
+            nppSafeCall( nppiAlphaPremul_8u_AC4R(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+        else
+            nppSafeCall( nppiAlphaPremul_16u_AC4R(src.ptr<Npp16u>(), static_cast<int>(src.step), dst.ptr<Npp16u>(), static_cast<int>(dst.step), oSizeROI) );
+    #endif
+    }
 }
 
 void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
@@ -1203,16 +1363,16 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
         0,                      //                =42
         0,                      //                =43
 
-        0,                      // CV_BGR2Lab     =44
-        0,                      // CV_RGB2Lab     =45
+        bgr_to_lab,             // CV_BGR2Lab     =44
+        rgb_to_lab,             // CV_RGB2Lab     =45
 
         0,                      // CV_BayerBG2BGR =46
         0,                      // CV_BayerGB2BGR =47
         0,                      // CV_BayerRG2BGR =48
         0,                      // CV_BayerGR2BGR =49
 
-        0,                      // CV_BGR2Luv     =50
-        0,                      // CV_RGB2Luv     =51
+        bgr_to_luv,             // CV_BGR2Luv     =50
+        rgb_to_luv,             // CV_RGB2Luv     =51
 
         bgr_to_hls,             // CV_BGR2HLS     =52
         rgb_to_hls,             // CV_RGB2HLS     =53
@@ -1220,10 +1380,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
         hsv_to_bgr,             // CV_HSV2BGR     =54
         hsv_to_rgb,             // CV_HSV2RGB     =55
 
-        0,                      // CV_Lab2BGR     =56
-        0,                      // CV_Lab2RGB     =57
-        0,                      // CV_Luv2BGR     =58
-        0,                      // CV_Luv2RGB     =59
+        lab_to_bgr,             // CV_Lab2BGR     =56
+        lab_to_rgb,             // CV_Lab2RGB     =57
+        luv_to_bgr,             // CV_Luv2BGR     =58
+        luv_to_rgb,             // CV_Luv2RGB     =59
 
         hls_to_bgr,             // CV_HLS2BGR     =60
         hls_to_rgb,             // CV_HLS2RGB     =61
@@ -1261,10 +1421,63 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
         0,                      // CV_BayerBG2GRAY = 86
         0,                      // CV_BayerGB2GRAY = 87
         0,                      // CV_BayerRG2GRAY = 88
-        0                       // CV_BayerGR2GRAY = 89
+        0,                      // CV_BayerGR2GRAY = 89
+
+        //YUV 4:2:0 formats family
+        0,                      // COLOR_YUV2RGB_NV12 = 90,
+        0,                      // COLOR_YUV2BGR_NV12 = 91,
+        0,                      // COLOR_YUV2RGB_NV21 = 92,
+        0,                      // COLOR_YUV2BGR_NV21 = 93,
+
+        0,                      // COLOR_YUV2RGBA_NV12 = 94,
+        0,                      // COLOR_YUV2BGRA_NV12 = 95,
+        0,                      // COLOR_YUV2RGBA_NV21 = 96,
+        0,                      // COLOR_YUV2BGRA_NV21 = 97,
+
+        0,                      // COLOR_YUV2RGB_YV12 = 98,
+        0,                      // COLOR_YUV2BGR_YV12 = 99,
+        0,                      // COLOR_YUV2RGB_IYUV = 100,
+        0,                      // COLOR_YUV2BGR_IYUV = 101,
+
+        0,                      // COLOR_YUV2RGBA_YV12 = 102,
+        0,                      // COLOR_YUV2BGRA_YV12 = 103,
+        0,                      // COLOR_YUV2RGBA_IYUV = 104,
+        0,                      // COLOR_YUV2BGRA_IYUV = 105,
+
+        0,                      // COLOR_YUV2GRAY_420 = 106,
+
+        //YUV 4:2:2 formats family
+        0,                      // COLOR_YUV2RGB_UYVY = 107,
+        0,                      // COLOR_YUV2BGR_UYVY = 108,
+        0,                      // //COLOR_YUV2RGB_VYUY = 109,
+        0,                      // //COLOR_YUV2BGR_VYUY = 110,
+
+        0,                      // COLOR_YUV2RGBA_UYVY = 111,
+        0,                      // COLOR_YUV2BGRA_UYVY = 112,
+        0,                      // //COLOR_YUV2RGBA_VYUY = 113,
+        0,                      // //COLOR_YUV2BGRA_VYUY = 114,
+
+        0,                      // COLOR_YUV2RGB_YUY2 = 115,
+        0,                      // COLOR_YUV2BGR_YUY2 = 116,
+        0,                      // COLOR_YUV2RGB_YVYU = 117,
+        0,                      // COLOR_YUV2BGR_YVYU = 118,
+
+        0,                      // COLOR_YUV2RGBA_YUY2 = 119,
+        0,                      // COLOR_YUV2BGRA_YUY2 = 120,
+        0,                      // COLOR_YUV2RGBA_YVYU = 121,
+        0,                      // COLOR_YUV2BGRA_YVYU = 122,
+
+        0,                      // COLOR_YUV2GRAY_UYVY = 123,
+        0,                      // COLOR_YUV2GRAY_YUY2 = 124,
+
+        // alpha premultiplication
+        rgba_to_mbgra,          // COLOR_RGBA2mRGBA = 125,
+        0,                      // COLOR_mRGBA2RGBA = 126,
+
+        0,                      // COLOR_COLORCVT_MAX  = 127
     };
 
-    CV_Assert(code < 94);
+    CV_Assert(code >= 0 && code < 128); // code is a signed int: reject negatives before indexing funcs[]
 
     func_t func = funcs[code];
 
@@ -1292,4 +1505,45 @@ void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
         cudaSafeCall( cudaDeviceSynchronize() );
 }
 
+void cv::gpu::gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward, Stream& stream)
+{
+    // Applies NPP gamma correction (nppiGammaFwd_* when `forward` is true, nppiGammaInv_* otherwise)
+    // to a CV_8UC3 / CV_8UC4 image; the in-place NPP variant is used when dst shares src's buffer.
+#if (CUDA_VERSION < 5000)
+    (void)src;
+    (void)dst;
+    (void)forward;
+    (void)stream;
+    CV_Error( CV_StsNotImplemented, "This function works only with CUDA 5.0 or higher" );
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc, int nSrcStep, Npp8u* pDst, int nDstStep, NppiSize oSizeROI);
+    typedef NppStatus (*func_inplace_t)(Npp8u* pSrcDst, int nSrcDstStep, NppiSize oSizeROI);
+
+    static const func_t funcs[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3R, nppiGammaInv_8u_AC4R},
+        {0, 0, 0, nppiGammaFwd_8u_C3R, nppiGammaFwd_8u_AC4R}
+    };
+    static const func_inplace_t funcs_inplace[2][5] =
+    {
+        {0, 0, 0, nppiGammaInv_8u_C3IR, nppiGammaInv_8u_AC4IR},
+        {0, 0, 0, nppiGammaFwd_8u_C3IR, nppiGammaFwd_8u_AC4IR}
+    };
+
+    CV_Assert(src.type() == CV_8UC3 || src.type() == CV_8UC4);
+    dst.create(src.size(), src.type());
+
+    NppStreamHandler h(StreamAccessor::getStream(stream));
+
+    NppiSize oSizeROI;
+    oSizeROI.width = src.cols;
+    oSizeROI.height = src.rows;
+
+    if (dst.data == src.data)
+        nppSafeCall( funcs_inplace[forward][src.channels()](dst.ptr<Npp8u>(), static_cast<int>(src.step), oSizeROI) );
+    else
+        nppSafeCall( funcs[forward][src.channels()](src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), oSizeROI) );
+#endif
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp
index aba9ee340a..0546ce3ad9 100644
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -71,24 +71,32 @@ namespace
             return pState;
         }
 
-    private:        
+    private:
         NppiGraphcutState* pState;
     };
 }
 
 void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
 {
+#if (CUDA_VERSION < 5000)
+    CV_Assert(terminals.type() == CV_32S);
+#else
+    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
+#endif
+
     Size src_size = terminals.size();
 
-    CV_Assert(terminals.type() == CV_32S);
     CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(leftTransp.type() == CV_32S);
+    CV_Assert(leftTransp.type() == terminals.type());
+
     CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(rightTransp.type() == CV_32S);
+    CV_Assert(rightTransp.type() == terminals.type());
+
     CV_Assert(top.size() == src_size);
-    CV_Assert(top.type() == CV_32S);
+    CV_Assert(top.type() == terminals.type());
+
     CV_Assert(bottom.size() == src_size);
-    CV_Assert(bottom.type() == CV_32S);
+    CV_Assert(bottom.type() == terminals.type());
 
     labels.create(src_size, CV_8U);
 
@@ -106,44 +114,61 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
     NppStreamHandler h(stream);
 
     NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
-    
+
+#if (CUDA_VERSION < 5000)
     nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
         static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+#else
+    if (terminals.type() == CV_32S)
+    {
+        nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
+            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+    }
+    else
+    {
+        nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
+            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+    }
+#endif
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
 }
 
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight, 
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
               GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
 {
+#if (CUDA_VERSION < 5000)
+    CV_Assert(terminals.type() == CV_32S);
+#else
+    CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
+#endif
+
     Size src_size = terminals.size();
 
-    CV_Assert(terminals.type() == CV_32S);
-
     CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(leftTransp.type() == CV_32S);
+    CV_Assert(leftTransp.type() == terminals.type());
 
     CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
-    CV_Assert(rightTransp.type() == CV_32S);
+    CV_Assert(rightTransp.type() == terminals.type());
 
     CV_Assert(top.size() == src_size);
-    CV_Assert(top.type() == CV_32S);
+    CV_Assert(top.type() == terminals.type());
 
     CV_Assert(topLeft.size() == src_size);
-    CV_Assert(topLeft.type() == CV_32S);
+    CV_Assert(topLeft.type() == terminals.type());
 
     CV_Assert(topRight.size() == src_size);
-    CV_Assert(topRight.type() == CV_32S);
+    CV_Assert(topRight.type() == terminals.type());
 
     CV_Assert(bottom.size() == src_size);
-    CV_Assert(bottom.type() == CV_32S);
+    CV_Assert(bottom.type() == terminals.type());
 
     CV_Assert(bottomLeft.size() == src_size);
-    CV_Assert(bottomLeft.type() == CV_32S);
+    CV_Assert(bottomLeft.type() == terminals.type());
 
     CV_Assert(bottomRight.size() == src_size);
-    CV_Assert(bottomRight.type() == CV_32S);
+    CV_Assert(bottomRight.type() == terminals.type());
 
     labels.create(src_size, CV_8U);
 
@@ -161,11 +186,28 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
     NppStreamHandler h(stream);
 
     NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
-    
-    nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), 
+
+#if (CUDA_VERSION < 5000)
+    nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
         top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
         bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
         static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+#else
+    if (terminals.type() == CV_32S)
+    {
+        nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
+            top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
+            bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
+            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+    }
+    else
+    {
+        nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
+            top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
+            bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
+            static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
+    }
+#endif
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/gpu/test/main.cpp b/modules/gpu/test/main.cpp
index 4d9d38014d..6a8c67d79f 100644
--- a/modules/gpu/test/main.cpp
+++ b/modules/gpu/test/main.cpp
@@ -43,8 +43,6 @@
 
 #ifdef HAVE_CUDA
 
-#include <cuda_runtime_api.h>
-
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
diff --git a/modules/gpu/test/precomp.hpp b/modules/gpu/test/precomp.hpp
index cc708b55af..afc3be8559 100644
--- a/modules/gpu/test/precomp.hpp
+++ b/modules/gpu/test/precomp.hpp
@@ -72,4 +72,9 @@
 #include "utility.hpp"
 #include "interpolation.hpp"
 
+#ifdef HAVE_CUDA
+    #include <cuda.h>
+    #include <cuda_runtime.h>
+#endif
+
 #endif
diff --git a/modules/gpu/test/test_color.cpp b/modules/gpu/test/test_color.cpp
index 1d3ced0220..89ca1a79a8 100644
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@@ -1609,6 +1609,141 @@ TEST_P(CvtColor, RGBA2YUV4)
     EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
 }
 
+TEST_P(CvtColor, BGR2Lab)
+{
+    if (depth != CV_8U)
+        return;
+    // Other depths are skipped; with CUDA < 5.0 the conversion is expected to throw CV_StsBadFlag.
+    try
+    {
+        cv::Mat src = readImage("stereobm/aloe-L.png");
+        // Round trip BGR -> Lab -> BGR on the GPU; the result must match the input within 10.
+        cv::gpu::GpuMat dst_lab = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(loadMat(src, useRoi), dst_lab, cv::COLOR_BGR2Lab);
+
+        cv::gpu::GpuMat dst_bgr = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(dst_lab, dst_bgr, cv::COLOR_Lab2BGR);
+
+        EXPECT_MAT_NEAR(src, dst_bgr, 10);
+    }
+    catch (const cv::Exception& e)
+    {
+#if (CUDA_VERSION < 5000)
+        ASSERT_EQ(CV_StsBadFlag, e.code);
+#else
+        FAIL() << e.what();
+#endif
+    }
+}
+
+TEST_P(CvtColor, RGB2Lab)
+{
+    if (depth != CV_8U)
+        return;
+    // Other depths are skipped; with CUDA < 5.0 the conversion is expected to throw CV_StsBadFlag.
+    try
+    {
+        cv::Mat src = readImage("stereobm/aloe-L.png");
+        // Round trip RGB -> Lab -> RGB on the GPU; the result must match the input within 10.
+        cv::gpu::GpuMat dst_lab = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(loadMat(src, useRoi), dst_lab, cv::COLOR_RGB2Lab);
+
+        cv::gpu::GpuMat dst_rgb = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(dst_lab, dst_rgb, cv::COLOR_Lab2RGB);
+
+        EXPECT_MAT_NEAR(src, dst_rgb, 10);
+    }
+    catch (const cv::Exception& e)
+    {
+#if (CUDA_VERSION < 5000)
+        ASSERT_EQ(CV_StsBadFlag, e.code);
+#else
+        FAIL() << e.what();
+#endif
+    }
+}
+
+TEST_P(CvtColor, BGR2Luv)
+{
+    if (depth != CV_8U)
+        return;
+    // Other depths are skipped; with CUDA < 5.0 the conversion is expected to throw CV_StsBadFlag.
+    try
+    {
+        cv::Mat src = img;
+        // Round trip BGR -> Luv -> BGR on the GPU; the result must match the input within 10.
+        cv::gpu::GpuMat dst_luv = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(loadMat(src, useRoi), dst_luv, cv::COLOR_BGR2Luv);
+
+        cv::gpu::GpuMat dst_rgb = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(dst_luv, dst_rgb, cv::COLOR_Luv2BGR);
+
+        EXPECT_MAT_NEAR(src, dst_rgb, 10);
+    }
+    catch (const cv::Exception& e)
+    {
+#if (CUDA_VERSION < 5000)
+        ASSERT_EQ(CV_StsBadFlag, e.code);
+#else
+        FAIL() << e.what();
+#endif
+    }
+}
+
+TEST_P(CvtColor, RGB2Luv)
+{
+    if (depth != CV_8U)
+        return;
+    // Other depths are skipped; with CUDA < 5.0 the conversion is expected to throw CV_StsBadFlag.
+    try
+    {
+        cv::Mat src = img;
+        // Round trip RGB -> Luv -> RGB on the GPU; the result must match the input within 10.
+        cv::gpu::GpuMat dst_luv = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(loadMat(src, useRoi), dst_luv, cv::COLOR_RGB2Luv);
+
+        cv::gpu::GpuMat dst_rgb = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(dst_luv, dst_rgb, cv::COLOR_Luv2RGB);
+
+        EXPECT_MAT_NEAR(src, dst_rgb, 10);
+    }
+    catch (const cv::Exception& e)
+    {
+#if (CUDA_VERSION < 5000)
+        ASSERT_EQ(CV_StsBadFlag, e.code);
+#else
+        FAIL() << e.what();
+#endif
+    }
+}
+
+TEST_P(CvtColor, RGBA2mRGBA)
+{
+    if (depth != CV_8U)
+        return;
+    // Other depths are skipped; with CUDA < 5.0 the conversion is expected to throw CV_StsBadFlag.
+    try
+    {
+        cv::Mat src = randomMat(size, CV_MAKE_TYPE(depth, 4));
+
+        cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+        cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_RGBA2mRGBA);
+        // The CPU implementation is the reference; the results may differ by at most 1.
+        cv::Mat dst_gold;
+        cv::cvtColor(src, dst_gold, cv::COLOR_RGBA2mRGBA);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 1);
+    }
+    catch (const cv::Exception& e)
+    {
+#if (CUDA_VERSION < 5000)
+        ASSERT_EQ(CV_StsBadFlag, e.code);
+#else
+        FAIL() << e.what();
+#endif
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
diff --git a/modules/imgproc/perf/perf_bilateral.cpp b/modules/imgproc/perf/perf_bilateral.cpp
new file mode 100644
index 0000000000..85cfc7d0cd
--- /dev/null
+++ b/modules/imgproc/perf/perf_bilateral.cpp
@@ -0,0 +1,38 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+using namespace testing;
+using std::tr1::make_tuple;
+using std::tr1::get;
+
+CV_ENUM(Mat_Type, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3)
+
+typedef TestBaseWithParam< tr1::tuple<Size, int, Mat_Type> > TestBilateralFilter;
+
+PERF_TEST_P( TestBilateralFilter, BilateralFilter,
+             Combine(
+                Values( szVGA, sz1080p ), // image size
+                Values( 3, 5 ), // d
+                ValuesIn( Mat_Type::all() ) // image type
+             )
+)
+{
+    Size sz;
+    int d, type;
+    const double sigmaColor = 1., sigmaSpace = 1.;
+    // Unpack the (image size, filter diameter d, matrix type) parameter tuple.
+    sz         = get<0>(GetParam());
+    d          = get<1>(GetParam());
+    type       = get<2>(GetParam());
+    // Source and destination share the same size and type.
+    Mat src(sz, type);
+    Mat dst(sz, type);
+    // src is declared as input with WARMUP_RNG; time(20) presumably sets this test's time budget — confirm units.
+    declare.in(src, WARMUP_RNG).out(dst).time(20);
+
+    TEST_CYCLE() bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, BORDER_DEFAULT);
+
+    SANITY_CHECK(dst);
+}
diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp
index fef5f755bd..998008ae27 100644
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@@ -50,6 +50,7 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/core/internal.hpp"
+#include "opencv2/core/parallel_tool.hpp"
 #include <math.h>
 #include <assert.h>
 #include <string.h>
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index faec530b85..1bc11c7fcb 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -1288,48 +1288,119 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
 namespace cv
 {
 
+class BilateralFilter_8u_Invoker : // parallel_for_ body: bilateral-filters rows [range.start, range.end) of an 8-bit 1- or 3-channel image
+    public ParallelLoopBody
+{
+public:
+    BilateralFilter_8u_Invoker(const Mat &_src, Mat& _dst, Mat _temp, int _radius, int _maxk,
+        int* _space_ofs, float *_space_weight, float *_color_weight) :
+        ParallelLoopBody(), src(_src), dst(_dst), temp(_temp), radius(_radius),
+        maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
+    {
+    }
+    
+    virtual void operator() (const Range& range) const // processes only the rows in range; reads temp, writes dst
+    {
+        int i, j, cn = src.channels(), k;
+        Size size = src.size();
+        
+        for( i = range.start; i < range.end; i++ )
+        {
+            const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; // row i of the image inside the bordered copy, past the left border
+            uchar* dptr = dst.data + i*dst.step;
+            
+            if( cn == 1 )
+            {
+                for( j = 0; j < size.width; j++ )
+                {
+                    float sum = 0, wsum = 0;
+                    int val0 = sptr[j]; // centre pixel
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        int val = sptr[j + space_ofs[k]]; // k-th neighbour, addressed by its precomputed offset
+                        float w = space_weight[k]*color_weight[std::abs(val - val0)]; // spatial weight times weight looked up by intensity difference
+                        sum += val*w;
+                        wsum += w;
+                    }
+                    // the result is a weighted average of 8-bit samples, so overflow is not possible => no need to use CV_CAST_8U
+                    dptr[j] = (uchar)cvRound(sum/wsum);
+                }
+            }
+            else
+            {
+                assert( cn == 3 );
+                for( j = 0; j < size.width*3; j += 3 )
+                {
+                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
+                    int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; // centre pixel channels
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        const uchar* sptr_k = sptr + j + space_ofs[k];
+                        int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
+                        float w = space_weight[k]*color_weight[std::abs(b - b0) + // table index is the sum of absolute channel differences
+                                                               std::abs(g - g0) + std::abs(r - r0)];
+                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
+                        wsum += w;
+                    }
+                    wsum = 1.f/wsum; // invert once, then normalise the three channels by multiplication
+                    b0 = cvRound(sum_b*wsum);
+                    g0 = cvRound(sum_g*wsum);
+                    r0 = cvRound(sum_r*wsum);
+                    dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
+                }
+            }
+        }
+    }
+    
+private:
+    const Mat& src; // held by reference: the caller's Mat must outlive this object
+    Mat &dst, temp; // dst is a reference to the output; temp is the bordered source (Mat held by value)
+    int radius, maxk, * space_ofs; // maxk = number of valid entries in space_ofs / space_weight
+    float *space_weight, *color_weight; // raw pointers into tables allocated by the caller
+};
+
 static void
 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
-                    double sigma_color, double sigma_space,
-                    int borderType )
+    double sigma_color, double sigma_space,
+    int borderType )
 {
     int cn = src.channels();
-    int i, j, k, maxk, radius;
+    int i, j, maxk, radius;
     Size size = src.size();
-
+    
     CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
-        src.type() == dst.type() && src.size() == dst.size() &&
-        src.data != dst.data );
-
+              src.type() == dst.type() && src.size() == dst.size() &&
+              src.data != dst.data );
+    
     if( sigma_color <= 0 )
         sigma_color = 1;
     if( sigma_space <= 0 )
         sigma_space = 1;
-
+    
     double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
     double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
-
+    
     if( d <= 0 )
         radius = cvRound(sigma_space*1.5);
     else
         radius = d/2;
     radius = MAX(radius, 1);
     d = radius*2 + 1;
-
+    
     Mat temp;
     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-
+    
     vector<float> _color_weight(cn*256);
     vector<float> _space_weight(d*d);
     vector<int> _space_ofs(d*d);
     float* color_weight = &_color_weight[0];
     float* space_weight = &_space_weight[0];
     int* space_ofs = &_space_ofs[0];
-
+    
     // initialize color-related bilateral filter coefficients
     for( i = 0; i < 256*cn; i++ )
         color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
-
+    
     // initialize space-related bilateral filter coefficients
     for( i = -radius, maxk = 0; i <= radius; i++ )
         for( j = -radius; j <= radius; j++ )
@@ -1340,55 +1411,89 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
             space_ofs[maxk++] = (int)(i*temp.step + j*cn);
         }
+    
+    BilateralFilter_8u_Invoker body(src, dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
+    parallel_for_(Range(0, size.height), body);
+}
 
-    for( i = 0; i < size.height; i++ )
+
+class BilateralFilter_32f_Invoker : // parallel_for_ body: bilateral-filters rows [range.start, range.end) of a float 1- or 3-channel image
+    public ParallelLoopBody
+{
+public:
+
+    BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
+        Mat _temp, Mat *_dest, Size _size,
+        float _scale_index, float *_space_weight, float *_expLUT) :
+        ParallelLoopBody(), cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
+        temp(_temp), dest(_dest), size(_size), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
     {
-        const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn;
-        uchar* dptr = dst.data + i*dst.step;
+    }
 
-        if( cn == 1 )
+    virtual void operator() (const Range& range) const // processes only the rows in range; reads temp, writes *dest
+    {
+        Mat& dst = *dest;
+        int i, j, k;
+
+        for( i = range.start; i < range.end; i++ )
         {
-            for( j = 0; j < size.width; j++ )
+            const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; // row i of the image inside the bordered copy, past the left border
+            float* dptr = (float*)(dst.data + i*dst.step);
+
+            if( cn == 1 )
             {
-                float sum = 0, wsum = 0;
-                int val0 = sptr[j];
-                for( k = 0; k < maxk; k++ )
+                for( j = 0; j < size.width; j++ )
                 {
-                    int val = sptr[j + space_ofs[k]];
-                    float w = space_weight[k]*color_weight[std::abs(val - val0)];
-                    sum += val*w;
-                    wsum += w;
+                    float sum = 0, wsum = 0;
+                    float val0 = sptr[j]; // centre pixel
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        float val = sptr[j + space_ofs[k]]; // k-th neighbour, addressed by its precomputed offset
+                        float alpha = (float)(std::abs(val - val0)*scale_index); // difference scaled to a fractional expLUT position
+                        int idx = cvFloor(alpha);
+                        alpha -= idx; // keep only the fractional part for interpolation
+                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); // linear interpolation between adjacent table entries
+                        sum += val*w;
+                        wsum += w;
+                    }
+                    dptr[j] = (float)(sum/wsum);
                 }
-                // overflow is not possible here => there is no need to use CV_CAST_8U
-                dptr[j] = (uchar)cvRound(sum/wsum);
             }
-        }
-        else
-        {
-            assert( cn == 3 );
-            for( j = 0; j < size.width*3; j += 3 )
+            else
             {
-                float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
-                int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
-                for( k = 0; k < maxk; k++ )
+                assert( cn == 3 );
+                for( j = 0; j < size.width*3; j += 3 )
                 {
-                    const uchar* sptr_k = sptr + j + space_ofs[k];
-                    int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
-                    float w = space_weight[k]*color_weight[std::abs(b - b0) +
-                        std::abs(g - g0) + std::abs(r - r0)];
-                    sum_b += b*w; sum_g += g*w; sum_r += r*w;
-                    wsum += w;
+                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
+                    float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; // centre pixel channels
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        const float* sptr_k = sptr + j + space_ofs[k];
+                        float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
+                        float alpha = (float)((std::abs(b - b0) + // sum of absolute channel differences, scaled to a table position
+                            std::abs(g - g0) + std::abs(r - r0))*scale_index);
+                        int idx = cvFloor(alpha);
+                        alpha -= idx; // keep only the fractional part for interpolation
+                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); // linear interpolation between adjacent table entries
+                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
+                        wsum += w;
+                    }
+                    wsum = 1.f/wsum; // invert once, then normalise the three channels by multiplication
+                    b0 = sum_b*wsum;
+                    g0 = sum_g*wsum;
+                    r0 = sum_r*wsum;
+                    dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
                 }
-                wsum = 1.f/wsum;
-                b0 = cvRound(sum_b*wsum);
-                g0 = cvRound(sum_g*wsum);
-                r0 = cvRound(sum_r*wsum);
-                dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
             }
         }
     }
-}
 
+private:
+    int cn, radius, maxk, *space_ofs; // maxk = number of valid entries in space_ofs / space_weight
+    Mat temp, *dest; // temp is the bordered source (Mat held by value); dest points at the caller's output Mat
+    Size size;
+    float scale_index, *space_weight, *expLUT; // raw pointers into tables allocated by the caller
+};
 
 static void
 bilateralFilter_32f( const Mat& src, Mat& dst, int d,
@@ -1396,7 +1501,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
                      int borderType )
 {
     int cn = src.channels();
-    int i, j, k, maxk, radius;
+    int i, j, maxk, radius;
     double minValSrc=-1, maxValSrc=1;
     const int kExpNumBinsPerChannel = 1 << 12;
     int kExpNumBins = 0;
@@ -1474,57 +1579,10 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
         }
 
-    for( i = 0; i < size.height; i++ )
-    {
-        const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn;
-        float* dptr = (float*)(dst.data + i*dst.step);
+    // filter the rows [0, size.height) through parallel_for_, which passes sub-ranges to the invoker body
 
-        if( cn == 1 )
-        {
-            for( j = 0; j < size.width; j++ )
-            {
-                float sum = 0, wsum = 0;
-                float val0 = sptr[j];
-                for( k = 0; k < maxk; k++ )
-                {
-                    float val = sptr[j + space_ofs[k]];
-                    float alpha = (float)(std::abs(val - val0)*scale_index);
-                    int idx = cvFloor(alpha);
-                    alpha -= idx;
-                    float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                    sum += val*w;
-                    wsum += w;
-                }
-                dptr[j] = (float)(sum/wsum);
-            }
-        }
-        else
-        {
-            assert( cn == 3 );
-            for( j = 0; j < size.width*3; j += 3 )
-            {
-                float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
-                float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
-                for( k = 0; k < maxk; k++ )
-                {
-                    const float* sptr_k = sptr + j + space_ofs[k];
-                    float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
-                    float alpha = (float)((std::abs(b - b0) +
-                        std::abs(g - g0) + std::abs(r - r0))*scale_index);
-                    int idx = cvFloor(alpha);
-                    alpha -= idx;
-                    float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                    sum_b += b*w; sum_g += g*w; sum_r += r*w;
-                    wsum += w;
-                }
-                wsum = 1.f/wsum;
-                b0 = sum_b*wsum;
-                g0 = sum_g*wsum;
-                r0 = sum_r*wsum;
-                dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
-            }
-        }
-    }
+    BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, &dst, size, scale_index, space_weight, expLUT);
+    parallel_for_(Range(0, size.height), body);
 }
 
 }
diff --git a/modules/imgproc/test/test_bilateral_filter.cpp b/modules/imgproc/test/test_bilateral_filter.cpp
new file mode 100644
index 0000000000..034f9c363f
--- /dev/null
+++ b/modules/imgproc/test/test_bilateral_filter.cpp
@@ -0,0 +1,290 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+using namespace cv;
+
+namespace cvtest 
+{ 
+    class CV_BilateralFilterTest : // compares cv::bilateralFilter output against the sequential reference implementation below
+        public cvtest::BaseTest
+    {
+    public:
+        enum // bounds passed to getRandInt when choosing a random image size
+        {
+            MAX_WIDTH = 1920, MIN_WIDTH = 1,
+            MAX_HEIGHT = 1080, MIN_HEIGHT = 1 
+        };
+
+        CV_BilateralFilterTest();
+        ~CV_BilateralFilterTest();
+
+    protected:
+        virtual void run_func(); // runs the function under test on _src
+        virtual int prepare_test_case(int test_case_index); // randomizes _src and _d
+        virtual int validate_test_results(int test_case_index); // checks _parallel_dst against the reference
+        
+    private:
+        void reference_bilateral_filter(const Mat& src, Mat& dst, int d, double sigma_color, // single-threaded float filter used as ground truth
+            double sigma_space, int borderType = BORDER_DEFAULT);
+            
+        int getRandInt(RNG& rng, int min_value, int max_value) const;
+                       
+        double _sigma_color; // filter parameters of the current test case
+        double _sigma_space;
+        
+        Mat _src; // random input image of the current test case
+        Mat _parallel_dst; // output of cv::bilateralFilter for _src
+        int _d; // d argument of the current test case
+    };
+
+    CV_BilateralFilterTest::CV_BilateralFilterTest() : // value-initializes the images and _d; the sigmas are assigned in prepare_test_case
+        cvtest::BaseTest(), _src(), _parallel_dst(), _d()
+    {
+        test_case_count = 1000; // NOTE(review): presumably the number of random cases BaseTest runs -- confirm
+    }
+
+    CV_BilateralFilterTest::~CV_BilateralFilterTest() // empty body: no explicit cleanup is performed here
+    {
+    }
+
+    int CV_BilateralFilterTest::getRandInt(RNG& rng, int min_value, int max_value) const // log-uniform random int in [min_value, max_value]; needs min_value > 0
+    {
+        double rand_value = rng.uniform(log((double)min_value), log((double)max_value + 1.)); // explicit doubles avoid the ambiguous log(int) overload
+        return MAX(min_value, MIN(max_value, cvFloor(exp(rand_value)))); // floor + clamp: exp() lies in [min, max+1), so cvRound could return max_value + 1
+    }
+
+    void CV_BilateralFilterTest::reference_bilateral_filter(const Mat &src, Mat &dst, int d, // sequential float (CV_32FC1 / CV_32FC3) bilateral filter used as ground truth
+        double sigma_color, double sigma_space, int borderType)
+    {
+        int cn = src.channels();
+        int i, j, k, maxk, radius;
+        double minValSrc = -1, maxValSrc = 1;
+        const int kExpNumBinsPerChannel = 1 << 12;
+        int kExpNumBins = 0;
+        float lastExpVal = 1.f;
+        float len, scale_index;
+        Size size = src.size();
+
+        dst.create(size, src.type()); // allocate the output first, so the type/size checks below hold for dst
+
+        CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) &&
+            src.type() == dst.type() && src.size() == dst.size() &&
+            src.data != dst.data );
+
+        if( sigma_color <= 0 ) // non-positive sigmas fall back to 1
+            sigma_color = 1;
+        if( sigma_space <= 0 )
+            sigma_space = 1;
+
+        double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
+        double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
+
+        if( d <= 0 ) // no diameter given: derive the radius from sigma_space
+            radius = cvRound(sigma_space*1.5);
+        else
+            radius = d/2;
+        radius = MAX(radius, 1);
+        d = radius*2 + 1; // from here on d is the odd window width
+        // compute the min/max range for the input image (even if multichannel)
+
+        minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
+        if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) // (nearly) constant image: output equals input
+        {
+            src.copyTo(dst);
+            return;
+        }
+
+        // temporary copy of the image with borders for easy processing
+        Mat temp;
+        copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
+        patchNaNs(temp);
+
+        // allocate lookup tables
+        vector<float> _space_weight(d*d);
+        vector<int> _space_ofs(d*d);
+        float* space_weight = &_space_weight[0];
+        int* space_ofs = &_space_ofs[0];
+
+        // assign a length which is slightly more than needed
+        len = (float)(maxValSrc - minValSrc) * cn;
+        kExpNumBins = kExpNumBinsPerChannel * cn;
+        vector<float> _expLUT(kExpNumBins+2); // +2 entries so that expLUT[idx+1] can be read during interpolation
+        float* expLUT = &_expLUT[0];
+
+        scale_index = kExpNumBins/len; // converts a value difference into a (fractional) table index
+
+        // initialize the exp LUT
+        for( i = 0; i < kExpNumBins+2; i++ )
+        {
+            if( lastExpVal > 0.f ) // once the exponential underflows to 0, the remaining entries are set to 0 directly
+            {
+                double val =  i / scale_index;
+                expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
+                lastExpVal = expLUT[i];
+            }
+            else
+                expLUT[i] = 0.f;
+        }
+
+        // initialize space-related bilateral filter coefficients
+        for( i = -radius, maxk = 0; i <= radius; i++ )
+            for( j = -radius; j <= radius; j++ )
+            {
+                double r = std::sqrt((double)i*i + (double)j*j);
+                if( r > radius ) // keep only neighbours inside the circle of the given radius
+                    continue;
+                space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
+                space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); // neighbour offset counted in float elements
+            }
+
+        for( i = 0; i < size.height; i++ )
+        {
+            const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn; // row i of the image inside the bordered copy, past the left border
+            float* dptr = (float*)(dst.data + i*dst.step);
+
+            if( cn == 1 )
+            {
+                for( j = 0; j < size.width; j++ )
+                {
+                    float sum = 0, wsum = 0;
+                    float val0 = sptr[j]; // centre pixel
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        float val = sptr[j + space_ofs[k]];
+                        float alpha = (float)(std::abs(val - val0)*scale_index);
+                        int idx = cvFloor(alpha);
+                        alpha -= idx; // keep only the fractional part for interpolation
+                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); // linear interpolation between adjacent table entries
+                        sum += val*w;
+                        wsum += w;
+                    }
+                    dptr[j] = (float)(sum/wsum);
+                }
+            }
+            else
+            {
+                assert( cn == 3 );
+                for( j = 0; j < size.width*3; j += 3 )
+                {
+                    float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
+                    float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; // centre pixel channels
+                    for( k = 0; k < maxk; k++ )
+                    {
+                        const float* sptr_k = sptr + j + space_ofs[k];
+                        float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
+                        float alpha = (float)((std::abs(b - b0) + // sum of absolute channel differences, scaled to a table position
+                            std::abs(g - g0) + std::abs(r - r0))*scale_index);
+                        int idx = cvFloor(alpha);
+                        alpha -= idx; // keep only the fractional part for interpolation
+                        float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx])); // linear interpolation between adjacent table entries
+                        sum_b += b*w; sum_g += g*w; sum_r += r*w;
+                        wsum += w;
+                    }
+                    wsum = 1.f/wsum; // invert once, then normalise the three channels by multiplication
+                    b0 = sum_b*wsum;
+                    g0 = sum_g*wsum;
+                    r0 = sum_r*wsum;
+                    dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
+                }
+            }
+        }
+    }
+
+    int CV_BilateralFilterTest::prepare_test_case(int /* test_case_index */) // builds a random input image and filter parameters for one case
+    {
+        const static int types[] = { CV_32FC1, CV_32FC3, CV_8UC1, CV_8UC3 };
+        RNG& rng = ts->get_rng();
+        Size size(getRandInt(rng, MIN_WIDTH, MAX_WIDTH), getRandInt(rng, MIN_HEIGHT, MAX_HEIGHT));
+        int type = types[rng(sizeof(types) / sizeof(types[0]))]; // random entry of the types table
+    
+        _d = rng.uniform(0., 1.) > 0.5 ? 5 : 3; 
+        
+        _src.create(size, type);
+        
+        rng.fill(_src, RNG::UNIFORM, 0, 256); // uniform random pixel values with bounds 0 and 256, for every type
+        
+        _sigma_color = _sigma_space = 1.;
+        
+        return 1; // NOTE(review): presumably tells BaseTest to run this case -- confirm
+    }
+
+    int CV_BilateralFilterTest::validate_test_results(int test_case_index) // compares _parallel_dst with the reference filter output
+    {
+        static const double eps = 1; // maximum accepted norm() of the difference over the whole image
+        
+        Mat reference_dst, reference_src;
+        if (_src.depth() == CV_32F)
+            reference_bilateral_filter(_src, reference_dst, _d, _sigma_color, _sigma_space);
+        else
+        {
+            int type = _src.type(); // non-float input: run the float reference on a converted copy, then convert back
+            _src.convertTo(reference_src, CV_32F);
+            reference_bilateral_filter(reference_src, reference_dst, _d, _sigma_color, _sigma_space);
+            reference_dst.convertTo(reference_dst, type);
+        }
+        
+        double e = norm(reference_dst, _parallel_dst);
+        if (e > eps)
+        {
+            ts->printf(cvtest::TS::CONSOLE, "actual error: %g, expected: %g", e, eps);
+            ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
+        }
+        else
+            ts->set_failed_test_info(cvtest::TS::OK);
+        
+        return BaseTest::validate_test_results(test_case_index); // the return value comes from the base-class implementation
+    }
+
+    void CV_BilateralFilterTest::run_func() // runs the function under test; its output is checked in validate_test_results
+    {
+        bilateralFilter(_src, _parallel_dst, _d, _sigma_color, _sigma_space);
+    }
+
+    TEST(Imgproc_BilateralFilter, accuracy) // test entry point: runs CV_BilateralFilterTest via safe_run()
+    { 
+        CV_BilateralFilterTest test; 
+        test.safe_run(); 
+    }
+
+} // end of namespace cvtest
diff --git a/samples/python2/facerec_demo.py b/samples/python2/facerec_demo.py
index 74f2681f3e..6fd3199d8d 100644
--- a/samples/python2/facerec_demo.py
+++ b/samples/python2/facerec_demo.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # Software License Agreement (BSD License)
 #
-# Copyright (c) 2012, Philipp Wagner
+# Copyright (c) 2012, Philipp Wagner <bytefish[at]gmx[dot]de>.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/samples/python2/feature_homography.py b/samples/python2/feature_homography.py
index c8b09de4cc..d553deb977 100644
--- a/samples/python2/feature_homography.py
+++ b/samples/python2/feature_homography.py
@@ -19,11 +19,8 @@ import numpy as np
 import cv2
 import video
 import common
-from operator import attrgetter
-
-def get_size(a):
-    h, w = a.shape[:2]
-    return w, h
+from collections import namedtuple
+from common import getsize
 
     
 FLANN_INDEX_KDTREE = 1
@@ -33,12 +30,29 @@ flann_params= dict(algorithm = FLANN_INDEX_LSH,
                    key_size = 12,     # 20
                    multi_probe_level = 1) #2
 
-
 MIN_MATCH_COUNT = 10
 
+
+ar_verts = np.float32([[0, 0, 0], [0, 1, 0], [1, 1, 0], [1, 0, 0],
+                       [0, 0, 1], [0, 1, 1], [1, 1, 1], [1, 0, 1], 
+                       [0.5, 0.5, 2]])
+ar_edges = [(0, 1), (1, 2), (2, 3), (3, 0), 
+            (4, 5), (5, 6), (6, 7), (7, 4),
+            (0, 4), (1, 5), (2, 6), (3, 7), 
+            (4, 8), (5, 8), (6, 8), (7, 8)]
+
+
+
+def draw_keypoints(vis, keypoints, color = (0, 255, 255)):
+    for kp in keypoints:
+            x, y = kp.pt
+            cv2.circle(vis, (int(x), int(y)), 2, color)
+
 class App:
     def __init__(self, src):
         self.cap = video.create_capture(src)
+        self.frame = None
+        self.paused = False
         self.ref_frame  = None
 
         self.detector = cv2.ORB( nfeatures = 1000 )
@@ -47,19 +61,18 @@ class App:
         cv2.namedWindow('plane')
         self.rect_sel = common.RectSelector('plane', self.on_rect)
 
-        self.frame = None
 
     def match_frames(self):
         if len(self.frame_desc) < MIN_MATCH_COUNT or len(self.frame_desc) < MIN_MATCH_COUNT:
             return
         
-        raw_matches = self.matcher.knnMatch(self.ref_descs, trainDescriptors = self.frame_desc, k = 2)
+        raw_matches = self.matcher.knnMatch(self.frame_desc, k = 2)
         p0, p1 = [], []
         for m in raw_matches:
             if len(m) == 2 and m[0].distance < m[1].distance * 0.75:
                 m = m[0]
-                p0.append( self.ref_points[m.queryIdx].pt )
-                p1.append( self.frame_points[m.trainIdx].pt )
+                p0.append( self.ref_points[m.trainIdx].pt )  # queryIdx
+                p1.append( self.frame_points[m.queryIdx].pt )
         p0, p1 = np.float32((p0, p1))
         if len(p0) < MIN_MATCH_COUNT:
             return
@@ -72,44 +85,31 @@ class App:
         return p0, p1, H
 
 
-    def on_frame(self, frame):
-        if self.frame is None or not self.rect_sel.dragging:
-            self.frame = frame = np.fliplr(frame).copy()
-            self.frame_points, self.frame_desc = self.detector.detectAndCompute(self.frame, None)
-            if self.frame_desc is None:  # detectAndCompute returns descs=None if not keypoints found
-                self.frame_desc = []
-        else:
-            self.ref_frame = None
+    def on_frame(self, vis):
+        match = self.match_frames()
+        if match is None:
+            return
+        w, h = getsize(self.frame)
+        p0, p1, H = match
+        for (x0, y0), (x1, y1) in zip(np.int32(p0), np.int32(p1)):
+            cv2.line(vis, (x0+w, y0), (x1, y1), (0, 255, 0))
+        x0, y0, x1, y1 = self.ref_rect
+        corners0 = np.float32([[x0, y0], [x1, y0], [x1, y1], [x0, y1]])
+        img_corners = cv2.perspectiveTransform(corners0.reshape(1, -1, 2), H)
+        cv2.polylines(vis, [np.int32(img_corners)], True, (255, 255, 255), 2)
 
-        w, h = get_size(self.frame)
-        vis = np.zeros((h, w*2, 3), np.uint8)
-        vis[:h,:w] = self.frame
-        self.rect_sel.draw(vis)
-        for kp in self.frame_points:
-            x, y = kp.pt
-            cv2.circle(vis, (int(x), int(y)), 2, (0, 255, 255))
-        
-        if self.ref_frame is not None:
-            vis[:h,w:] = self.ref_frame
-            x0, y0, x1, y1 = self.ref_rect
-            cv2.rectangle(vis, (x0+w, y0), (x1+w, y1), (0, 255, 0), 2)
-
-            for kp in self.ref_points:
-                x, y = kp.pt
-                cv2.circle(vis, (int(x+w), int(y)), 2, (0, 255, 255))
-
-
-            match = self.match_frames()
-            if match is not None:
-                p0, p1, H = match
-                for (x0, y0), (x1, y1) in zip(np.int32(p0), np.int32(p1)):
-                    cv2.line(vis, (x0+w, y0), (x1, y1), (0, 255, 0))
-                x0, y0, x1, y1 = self.ref_rect
-                corners = np.float32([[x0, y0], [x1, y0], [x1, y1], [x0, y1]])
-                corners = np.int32( cv2.perspectiveTransform(corners.reshape(1, -1, 2), H) )
-                cv2.polylines(vis, [corners], True, (255, 255, 255), 2)
-        
-        cv2.imshow('plane', vis)
+        corners3d = np.hstack([corners0, np.zeros((4, 1), np.float32)])
+        fx = 0.9
+        K = np.float64([[fx*w, 0, 0.5*(w-1)],
+                        [0, fx*w, 0.5*(h-1)],
+                        [0.0,0.0,      1.0]])
+        dist_coef = np.zeros(4)
+        ret, rvec, tvec = cv2.solvePnP(corners3d, img_corners, K, dist_coef)
+        verts = ar_verts * [(x1-x0), (y1-y0), -(x1-x0)*0.3] + (x0, y0, 0)
+        verts = cv2.projectPoints(verts, rvec, tvec, K, dist_coef)[0].reshape(-1, 2)
+        for i, j in ar_edges:
+            (x0, y0), (x1, y1) = verts[i], verts[j]
+            cv2.line(vis, (int(x0), int(y0)), (int(x1), int(y1)), (255, 255, 0), 2)
 
     def on_rect(self, rect):
         x0, y0, x1, y1 = rect
@@ -123,11 +123,39 @@ class App:
                 descs.append(desc)
         self.ref_points, self.ref_descs = points, np.uint8(descs)
 
+        self.matcher.clear()
+        self.matcher.add([self.ref_descs])
+
     def run(self):
         while True:
-            ret, frame = self.cap.read()
-            self.on_frame(frame)
+            playing = not self.paused and not self.rect_sel.dragging  # frozen while paused or while a rectangle is being dragged
+            if playing or self.frame is None:  # always grab a frame when none has been stored yet
+                ret, frame = self.cap.read()
+                if not ret:  # stop the loop when the capture delivers no frame
+                    break
+                self.frame = np.fliplr(frame).copy()  # store a left-right flipped copy
+                self.frame_points, self.frame_desc = self.detector.detectAndCompute(self.frame, None)
+                if self.frame_desc is None:  # detectAndCompute returns descs=None if no keypoints are found
+                    self.frame_desc = []
+            
+            w, h = getsize(self.frame)
+            vis = np.zeros((h, w*2, 3), np.uint8)  # double-width canvas: current frame on the left, reference on the right
+            vis[:h,:w] = self.frame
+            if self.ref_frame is not None:
+                vis[:h,w:] = self.ref_frame
+                x0, y0, x1, y1 = self.ref_rect
+                cv2.rectangle(vis, (x0+w, y0), (x1+w, y1), (0, 255, 0), 2)
+                draw_keypoints(vis[:,w:], self.ref_points)  # the slice is the right half, so reference keypoints land there
+            draw_keypoints(vis, self.frame_points)
+
+            if playing and self.ref_frame is not None:  # match and draw overlays only while playing with a reference set
+                self.on_frame(vis)
+            
+            self.rect_sel.draw(vis)
+            cv2.imshow('plane', vis)
             ch = cv2.waitKey(1)
+            if ch == ord(' '):  # space bar toggles pause
+                self.paused = not self.paused
             if ch == 27:
                 break
 
@@ -136,5 +164,5 @@ if __name__ == '__main__':
 
     import sys
     try: video_src = sys.argv[1]
-    except: video_src = '0'
+    except: video_src = 0
     App(video_src).run()