diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index e731dedca7..98d62a3b2c 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -44,6 +44,7 @@ The references are:
 #include "precomp.hpp"
 #include "fast_score.hpp"
 #include "opencl_kernels_features2d.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/core/openvx/ovx_defs.hpp"
 
 #if defined _MSC_VER
@@ -58,9 +59,10 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 {
     Mat img = _img.getMat();
     const int K = patternSize/2, N = patternSize + K + 1;
-#if CV_SSE2
+#if CV_SIMD128
     const int quarterPatternSize = patternSize/4;
-    (void)quarterPatternSize;
+    v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
+    bool hasSimd = hasSIMD128();
 #endif
     int i, j, k, pixel[25];
     makeOffsets(pixel, (int)img.step, patternSize);
@@ -69,12 +71,6 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
 
     threshold = std::min(std::max(threshold, 0), 255);
 
-#if CV_SSE2
-    __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K);
-    (void)K16;
-    (void)delta;
-    (void)t;
-#endif
     uchar threshold_tab[512];
     for( i = -255; i <= 255; i++ )
         threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0);
@@ -99,66 +95,76 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
         if( i < img.rows - 3 )
         {
             j = 3;
-            #if CV_SSE2
-            if( patternSize == 16 )
+#if CV_SIMD128
+            if( hasSimd )
             {
-                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
+                if( patternSize == 16 )
                 {
-                    __m128i m0, m1;
-                    __m128i v0 = _mm_loadu_si128((const __m128i*)ptr);
-                    __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta);
-                    v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta);
-
-                    __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta);
-                    __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta);
-                    __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta);
-                    __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta);
-                    m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0));
-                    m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1));
-                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0)));
-                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2)));
-                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0)));
-                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3)));
-                    m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0)));
-                    m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0)));
-                    m0 = _mm_or_si128(m0, m1);
-                    int mask = _mm_movemask_epi8(m0);
-                    if( mask == 0 )
-                        continue;
-                    if( (mask & 255) == 0 )
+                    for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
                     {
-                        j -= 8;
-                        ptr -= 8;
-                        continue;
-                    }
-
-                    __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0;
-                    for( k = 0; k < N; k++ )
-                    {
-                        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta);
-                        m0 = _mm_cmpgt_epi8(x, v0);
-                        m1 = _mm_cmpgt_epi8(v1, x);
-
-                        c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0);
-                        c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1);
-
-                        max0 = _mm_max_epu8(max0, c0);
-                        max1 = _mm_max_epu8(max1, c1);
-                    }
-
-                    max0 = _mm_max_epu8(max0, max1);
-                    int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16));
-
-                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
-                        if(m & 1)
+                        v_uint8x16 v = v_load(ptr);
+                        v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
+                        v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
+
+                        v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
+                        v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
+                        v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
+                        v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
+
+                        v_int8x16 m0, m1;
+                        m0 = (v0 < x0) & (v0 < x1);
+                        m1 = (x0 < v1) & (x1 < v1);
+                        m0 = m0 | ((v0 < x1) & (v0 < x2));
+                        m1 = m1 | ((x1 < v1) & (x2 < v1));
+                        m0 = m0 | ((v0 < x2) & (v0 < x3));
+                        m1 = m1 | ((x2 < v1) & (x3 < v1));
+                        m0 = m0 | ((v0 < x3) & (v0 < x0));
+                        m1 = m1 | ((x3 < v1) & (x0 < v1));
+                        m0 = m0 | m1;
+
+                        int mask = v_signmask(m0);
+                        if( mask == 0 )
+                            continue;
+                        if( (mask & 255) == 0 )
                         {
-                            cornerpos[ncorners++] = j+k;
-                            if(nonmax_suppression)
-                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            j -= 8;
+                            ptr -= 8;
+                            continue;
                         }
+
+                        v_int8x16 c0 = v_setzero_s8();
+                        v_int8x16 c1 = v_setzero_s8();
+                        v_uint8x16 max0 = v_setzero_u8();
+                        v_uint8x16 max1 = v_setzero_u8();
+                        for( k = 0; k < N; k++ )
+                        {
+                            v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
+                            m0 = v0 < x;
+                            m1 = x < v1;
+
+                            c0 = v_sub_wrap(c0, m0) & m0;
+                            c1 = v_sub_wrap(c1, m1) & m1;
+
+                            max0 = v_max(max0, v_reinterpret_as_u8(c0));
+                            max1 = v_max(max1, v_reinterpret_as_u8(c1));
+                        }
+
+                        max0 = v_max(max0, max1);
+                        int m = v_signmask(K16 < max0);
+
+                        for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
+                        {
+                            if(m & 1)
+                            {
+                                cornerpos[ncorners++] = j+k;
+                                if(nonmax_suppression)
+                                    curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            }
+                        }
+                    }
                 }
             }
-            #endif
+#endif
             for( ; j < img.cols - 3; j++, ptr++ )
             {
                 int v = ptr[0];
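
Note (not part of the patch): the least obvious piece above is the run-length idiom `c = v_sub_wrap(c, m) & m`, which the old SSE2 code (`_mm_and_si128(_mm_sub_epi8(c0, m0), m0)`) and its universal-intrinsic replacement share. A mask lane is 0xFF (i.e. -1) when the circle sample passes the brighter/darker test, so subtracting the mask increments the per-lane counter, while AND-ing with the mask zeroes any lane whose run of hits was broken. The standalone sketch below demonstrates the idiom under the assumption of an OpenCV 3.3+ build with CV_SIMD128 enabled; it is an illustration, not code from the patch.

    // Sketch: count the longest run of consecutive "hits" per lane, the way
    // the FAST kernel counts contiguous circle pixels. Assumes OpenCV >= 3.3.
    #include <opencv2/core/hal/intrin.hpp>
    #include <cstdio>

    int main()
    {
        using namespace cv;
        // Simulated comparison results for 8 successive circle samples:
        // -1 (0xFF) = sample passed the threshold test, 0 = it did not.
        const schar hits[8] = { -1, -1, -1, 0, -1, -1, -1, -1 };

        v_int8x16 c = v_setzero_s8();        // current run length per lane
        v_uint8x16 longest = v_setzero_u8(); // longest run seen so far

        for( int k = 0; k < 8; k++ )
        {
            v_int8x16 m = v_setall_s8(hits[k]); // comparison mask, 0x00 or 0xFF
            c = v_sub_wrap(c, m) & m;           // mask -1: c+1; mask 0: reset to 0
            longest = v_max(longest, v_reinterpret_as_u8(c));
        }

        uchar buf[16];
        v_store(buf, longest);
        printf("longest run = %d (expected 4)\n", buf[0]);
        return 0;
    }

A pixel is declared a corner when this maximum run exceeds K = patternSize/2, which is exactly the `v_signmask(K16 < max0)` test in the patch; iterating k up to N = patternSize + K + 1 wraps past the start of the circle so runs that straddle sample 0 are still counted.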