Optimize OpenCL version of BFMatcher

2014-09-23 15:13:46 +04:00
parent 22ff1e8826
commit 3787388eac
2 changed files with 296 additions and 723 deletions
@@ -59,39 +59,71 @@
 #define MAX_DESC_LEN 64
 #endif

+#define BLOCK_SIZE_ODD          (BLOCK_SIZE + 1)
+#ifndef SHARED_MEM_SZ
+#  if (BLOCK_SIZE < MAX_DESC_LEN)
+#    define SHARED_MEM_SZ      (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
+#  else
+#    define SHARED_MEM_SZ      (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
+#  endif
+#endif
+
 #ifndef DIST_TYPE
 #define DIST_TYPE 2
 #endif

 // dirty fix for non-template support
-#if   (DIST_TYPE == 2) // L1Dist
+#if (DIST_TYPE == 2) // L1Dist
 #   ifdef T_FLOAT
-#       define DIST(x, y) fabs((x) - (y))
-        typedef float value_type;
        typedef float result_type;
+#       if (8 == kercn)
+            typedef float8 value_type;
+#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+#       elif (4 == kercn)
+            typedef float4 value_type;
+#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+#       else
+            typedef float value_type;
+#           define DIST(x, y) result += fabs((x) - (y))
+#       endif
 #   else
-#       define DIST(x, y) abs((x) - (y))
-        typedef int value_type;
        typedef int result_type;
+#       if (8 == kercn)
+            typedef int8 value_type;
+#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
+#       elif (4 == kercn)
+            typedef int4 value_type;
+#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
+#       else
+            typedef int  value_type;
+#           define DIST(x, y) result += abs((x) - (y))
+#       endif
 #   endif
-#define DIST_RES(x) (x)
+#   define DIST_RES(x) (x)
 #elif (DIST_TYPE == 4) // L2Dist
-#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
-typedef float value_type;
-typedef float result_type;
-#define DIST_RES(x) sqrt(x)
+    typedef float result_type;
+#   if (8 == kercn)
+        typedef float8 value_type;
+#       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
+#   elif (4 == kercn)
+        typedef float4      value_type;
+#       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d, d);}
+#   else
+        typedef float       value_type;
+#       define DIST(x, y)   {value_type d = ((x) - (y)); result = mad(d, d, result);}
+#   endif
+#   define DIST_RES(x) sqrt(x)
 #elif (DIST_TYPE == 6) // Hamming
-//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-inline int bit1Count(int v)
-{
-    v = v - ((v >> 1) & 0x55555555);                    // reuse input as temporary
-    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     // temp
-    return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
-}
-#define DIST(x, y) bit1Count( (x) ^ (y) )
-typedef int value_type;
-typedef int result_type;
-#define DIST_RES(x) (x)
+#   if (8 == kercn)
+        typedef int8 value_type;
+#   elif (4 == kercn)
+        typedef int4 value_type;
+#   else
+        typedef int value_type;
+#   endif
+    typedef int result_type;
+#   define DIST(x, y) result += popcount( (x) ^ (y) )
+#   define DIST_RES(x) (x)
 #endif

 inline result_type reduce_block(
@@ -105,9 +137,7 @@ inline result_type reduce_block(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
-            s_query[lidy * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
    }
    return DIST_RES(result);
 }
@@ -123,11 +153,9 @@ inline result_type reduce_block_match(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
-            s_query[lidy * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
    }
-    return (result);
+    return result;
 }

 inline result_type reduce_multi_block(
@@ -142,23 +170,16 @@ inline result_type reduce_multi_block(
    #pragma unroll
    for (int j = 0 ; j < BLOCK_SIZE ; j++)
    {
-        result += DIST(
-            s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
-            s_train[j * BLOCK_SIZE + lidx]);
+        DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
    }
    return result;
 }

-/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
-local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
-*/
-__kernel void BruteForceMatch_UnrollMatch(
+__kernel void BruteForceMatch_Match(
    __global T *query,
    __global T *train,
-    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
-    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@@ -170,17 +191,26 @@ __kernel void BruteForceMatch_UnrollMatch(
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);
+    query_cols /= kercn;

-    int queryIdx = groupidx * BLOCK_SIZE + lidy;
+    __local float sharebuffer[SHARED_MEM_SZ];
+    __local value_type *s_query = (__local value_type *)sharebuffer;
+
+#if 0 < MAX_DESC_LEN
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
    #pragma unroll
-    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
+    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
    {
-        int loadx = lidx + i * BLOCK_SIZE;
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
+#else
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;
@@ -189,12 +219,16 @@ __kernel void BruteForceMatch_UnrollMatch(
    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
    {
        result_type result = 0;
+
+        const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+        __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
        #pragma unroll
-        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
@@ -203,89 +237,18 @@ __kernel void BruteForceMatch_UnrollMatch(

            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
-        result = DIST_RES(result);
-
-        int trainIdx = t * BLOCK_SIZE + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
+#else
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
        {
-            myBestDistance = result;
-            myBestTrainIdx = trainIdx;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float*)(sharebuffer);
-    __local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
-    //find BestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
-    s_distance[lidx] = myBestDistance;
-    s_trainIdx[lidx] = myBestTrainIdx;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //reduce -- now all reduce implement in each threads.
-    #pragma unroll
-    for (int k = 0 ; k < BLOCK_SIZE; k++)
-    {
-        if (myBestDistance > s_distance[k])
-        {
-            myBestDistance = s_distance[k];
-            myBestTrainIdx = s_trainIdx[k];
-        }
-    }
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = myBestTrainIdx;
-        bestDistance[queryIdx] = myBestDistance;
-    }
-}
-
-__kernel void BruteForceMatch_Match(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-
-    float myBestDistance = MAX_FLOAT;
-    int myBestTrainIdx = -1;
-
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    // loop
-    for (int t = 0 ;  t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
-    {
-        result_type result = 0;
-        for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
-        {
-            const int loadx = lidx + i * BLOCK_SIZE;
+            const int loadx = mad24(i, BLOCK_SIZE, lidx);
            //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;

            if (loadx < query_cols)
            {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
            }

            barrier(CLK_LOCAL_MEM_FENCE);
@@ -294,10 +257,10 @@ __kernel void BruteForceMatch_Match(

            barrier(CLK_LOCAL_MEM_FENCE);
        }
-
+#endif
        result = DIST_RES(result);

-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
        {
@@ -309,17 +272,18 @@ __kernel void BruteForceMatch_Match(
    barrier(CLK_LOCAL_MEM_FENCE);

    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    //findBestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
+    s_distance += lidy * BLOCK_SIZE_ODD;
+    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //reduce -- now all reduce implement in each threads.
+    #pragma unroll
    for (int k = 0 ; k < BLOCK_SIZE; k++)
    {
        if (myBestDistance > s_distance[k])
@@ -336,76 +300,14 @@ __kernel void BruteForceMatch_Match(
    }
 }

-//radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
-    __global T *query,
-    __global T *train,
-    float maxDistance,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __global int *nMatches,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int bestTrainIdx_cols,
-    int step,
-    int ostep
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-    const int groupidy = get_group_id(1);
-
-    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
-    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
-
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    result_type result = 0;
-    for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
-    {
-        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-        const int loadx = lidx + i * BLOCK_SIZE;
-
-        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-
-        //synchronize to make sure each elem for reduceIteration in share memory is written already.
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        result += reduce_block(s_query, s_train, lidx, lidy);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (queryIdx < query_rows && trainIdx < train_rows &&
-        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
-    {
-        int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
-        if(ind < bestTrainIdx_cols)
-        {
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-        }
-    }
-}
-
 //radius_match
 __kernel void BruteForceMatch_RadiusMatch(
    __global T *query,
    __global T *train,
    float maxDistance,
-    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
-    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@@ -420,20 +322,34 @@ __kernel void BruteForceMatch_RadiusMatch(
    const int groupidx = get_group_id(0);
    const int groupidy = get_group_id(1);

-    const int queryIdx = groupidy * BLOCK_SIZE + lidy;
-    const int trainIdx = groupidx * BLOCK_SIZE + lidx;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);

+    const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
+    const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
+    __global TN *train_vec = (__global TN *)(train + trainOffset);
+
+    query_cols /= kercn;
+
+    __local float sharebuffer[SHARED_MEM_SZ];
    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;

    result_type result = 0;
    for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
    {
        //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-        const int loadx = lidx + i * BLOCK_SIZE;
+        const int loadx = mad24(BLOCK_SIZE, i, lidx);

-        s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-        s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+        s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
+
+        if (loadx < query_cols)
+        {
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
+        }

        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -442,28 +358,23 @@ __kernel void BruteForceMatch_RadiusMatch(

        barrier(CLK_LOCAL_MEM_FENCE);
    }
-
-    if (queryIdx < query_rows && trainIdx < train_rows &&
-        convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
+    if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
    {
        int ind = atom_inc(nMatches + queryIdx);

        if(ind < bestTrainIdx_cols)
        {
-            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+            bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
+            bestDistance[mad24(queryIdx, ostep, ind)] = result;
        }
    }
 }

-
-__kernel void BruteForceMatch_knnUnrollMatch(
+__kernel void BruteForceMatch_knnMatch(
    __global T *query,
    __global T *train,
-    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
-    __local float *sharebuffer,
    int query_rows,
    int query_cols,
    int train_rows,
@@ -475,31 +386,45 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
+    const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
+    const int queryOffset = min(queryIdx, query_rows - 1) * step;
+    __global TN *query_vec = (__global TN *)(query + queryOffset);
+    query_cols /= kercn;

+    __local float sharebuffer[SHARED_MEM_SZ];
+    __local value_type *s_query = (__local value_type *)sharebuffer;
+
+#if 0 < MAX_DESC_LEN
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
    // load the query into local memory.
+    #pragma unroll
    for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)
    {
-        int loadx = lidx + i * BLOCK_SIZE;
-        s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        int loadx = mad24(BLOCK_SIZE, i, lidx);
+        s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
    }
+#else
+    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
+#endif

    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

-    //loopUnrolledCached
-    for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
+    for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
    {
        result_type result = 0;
+
+        int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
+        __global TN *train_vec = (__global TN *)(train + trainOffset);
+#if 0 < MAX_DESC_LEN
+        #pragma unroll
        for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
        {
            //load a BLOCK_SIZE * BLOCK_SIZE block into local train.
-            const int loadx = lidx + i * BLOCK_SIZE;
-            s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
+            s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);
@@ -508,10 +433,30 @@ __kernel void BruteForceMatch_knnUnrollMatch(

            barrier(CLK_LOCAL_MEM_FENCE);
        }
+#else
+        for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
+        {
+            const int loadx = mad24(BLOCK_SIZE, i, lidx);
+            //load query and train into local memory
+            s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
+            s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;

+            if (loadx < query_cols)
+            {
+                s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
+                s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            result += reduce_block_match(s_query, s_train, lidx, lidy);
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+#endif
        result = DIST_RES(result);

-        const int trainIdx = t * BLOCK_SIZE + lidx;
+        const int trainIdx = mad24(BLOCK_SIZE, t, lidx);

        if (queryIdx < query_rows && trainIdx < train_rows)
        {
@@ -532,13 +477,12 @@ __kernel void BruteForceMatch_knnUnrollMatch(

    barrier(CLK_LOCAL_MEM_FENCE);

-    __local float *s_distance = (local float *)sharebuffer;
-    __local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
+    __local float *s_distance = (__local float *)sharebuffer;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);

    // find BestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
-
+    s_distance += lidy * BLOCK_SIZE_ODD;
+    s_trainIdx += lidy * BLOCK_SIZE_ODD;
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

@@ -601,189 +545,4 @@ __kernel void BruteForceMatch_knnUnrollMatch(
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
-}
-
-__kernel void BruteForceMatch_knnMatch(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global int2 *bestTrainIdx,
-    __global float2 *bestDistance,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step
-)
-{
-    const int lidx = get_local_id(0);
-    const int lidy = get_local_id(1);
-    const int groupidx = get_group_id(0);
-
-    const int queryIdx = groupidx * BLOCK_SIZE + lidy;
-    __local value_type *s_query = (__local value_type *)sharebuffer;
-    __local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
-
-    float myBestDistance1 = MAX_FLOAT;
-    float myBestDistance2 = MAX_FLOAT;
-    int myBestTrainIdx1 = -1;
-    int myBestTrainIdx2 = -1;
-
-    //loop
-    for (int  t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
-    {
-        result_type result = 0.0f;
-        for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
-        {
-            const int loadx = lidx + i * BLOCK_SIZE;
-            //load query and train into local memory
-            s_query[lidy * BLOCK_SIZE + lidx] = 0;
-            s_train[lidx * BLOCK_SIZE + lidy] = 0;
-
-            if (loadx < query_cols)
-            {
-                s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
-            }
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            result += reduce_block_match(s_query, s_train, lidx, lidy);
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        result = DIST_RES(result);
-
-        const int trainIdx = t * BLOCK_SIZE + lidx;
-
-        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
-        {
-            if (result < myBestDistance1)
-            {
-                myBestDistance2 = myBestDistance1;
-                myBestTrainIdx2 = myBestTrainIdx1;
-                myBestDistance1 = result;
-                myBestTrainIdx1 = trainIdx;
-            }
-            else if (result < myBestDistance2)
-            {
-                myBestDistance2 = result;
-                myBestTrainIdx2 = trainIdx;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    __local float *s_distance = (__local float *)sharebuffer;
-    __local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
-
-    //findBestMatch
-    s_distance += lidy * BLOCK_SIZE;
-    s_trainIdx += lidy * BLOCK_SIZE;
-
-    s_distance[lidx] = myBestDistance1;
-    s_trainIdx[lidx] = myBestTrainIdx1;
-
-    float bestDistance1 = MAX_FLOAT;
-    float bestDistance2 = MAX_FLOAT;
-    int bestTrainIdx1 = -1;
-    int bestTrainIdx2 = -1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < BLOCK_SIZE ; i++)
-        {
-            float val = s_distance[i];
-            if (val < bestDistance1)
-            {
-                bestDistance2 = bestDistance1;
-                bestTrainIdx2 = bestTrainIdx1;
-
-                bestDistance1 = val;
-                bestTrainIdx1 = s_trainIdx[i];
-            }
-            else if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    s_distance[lidx] = myBestDistance2;
-    s_trainIdx[lidx] = myBestTrainIdx2;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lidx == 0)
-    {
-        for (int i = 0 ; i < BLOCK_SIZE ; i++)
-        {
-            float val = s_distance[i];
-
-            if (val < bestDistance2)
-            {
-                bestDistance2 = val;
-                bestTrainIdx2 = s_trainIdx[i];
-            }
-        }
-    }
-
-    myBestDistance1 = bestDistance1;
-    myBestDistance2 = bestDistance2;
-
-    myBestTrainIdx1 = bestTrainIdx1;
-    myBestTrainIdx2 = bestTrainIdx2;
-
-    if (queryIdx < query_rows && lidx == 0)
-    {
-        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-    }
-}
-
-kernel void BruteForceMatch_calcDistanceUnrolled(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step)
-{
-    /* Todo */
-}
-
-kernel void BruteForceMatch_calcDistance(
-    __global T *query,
-    __global T *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step)
-{
-    /* Todo */
-}
-
-kernel void BruteForceMatch_findBestMatch(
-    __global float *allDist,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    int k
-)
-{
-    /* Todo */
-}
+}