Optimize OpenCL version of BFMatcher
This commit is contained in:
@@ -59,39 +59,71 @@
|
||||
#define MAX_DESC_LEN 64
|
||||
#endif
|
||||
|
||||
#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)
|
||||
#ifndef SHARED_MEM_SZ
|
||||
# if (BLOCK_SIZE < MAX_DESC_LEN)
|
||||
# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))
|
||||
# else
|
||||
# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef DIST_TYPE
|
||||
#define DIST_TYPE 2
|
||||
#endif
|
||||
|
||||
// dirty fix for non-template support
|
||||
#if (DIST_TYPE == 2) // L1Dist
|
||||
#if (DIST_TYPE == 2) // L1Dist
|
||||
# ifdef T_FLOAT
|
||||
# define DIST(x, y) fabs((x) - (y))
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
# if (8 == kercn)
|
||||
typedef float8 value_type;
|
||||
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
|
||||
# elif (4 == kercn)
|
||||
typedef float4 value_type;
|
||||
# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
|
||||
# else
|
||||
typedef float value_type;
|
||||
# define DIST(x, y) result += fabs((x) - (y))
|
||||
# endif
|
||||
# else
|
||||
# define DIST(x, y) abs((x) - (y))
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
# if (8 == kercn)
|
||||
typedef int8 value_type;
|
||||
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}
|
||||
# elif (4 == kercn)
|
||||
typedef int4 value_type;
|
||||
# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}
|
||||
# else
|
||||
typedef int value_type;
|
||||
# define DIST(x, y) result += abs((x) - (y))
|
||||
# endif
|
||||
# endif
|
||||
#define DIST_RES(x) (x)
|
||||
# define DIST_RES(x) (x)
|
||||
#elif (DIST_TYPE == 4) // L2Dist
|
||||
#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
#define DIST_RES(x) sqrt(x)
|
||||
typedef float result_type;
|
||||
# if (8 == kercn)
|
||||
typedef float8 value_type;
|
||||
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}
|
||||
# elif (4 == kercn)
|
||||
typedef float4 value_type;
|
||||
# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}
|
||||
# else
|
||||
typedef float value_type;
|
||||
# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}
|
||||
# endif
|
||||
# define DIST_RES(x) sqrt(x)
|
||||
#elif (DIST_TYPE == 6) // Hamming
|
||||
//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
inline int bit1Count(int v)
|
||||
{
|
||||
v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
|
||||
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
|
||||
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
|
||||
}
|
||||
#define DIST(x, y) bit1Count( (x) ^ (y) )
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
#define DIST_RES(x) (x)
|
||||
# if (8 == kercn)
|
||||
typedef int8 value_type;
|
||||
# elif (4 == kercn)
|
||||
typedef int4 value_type;
|
||||
# else
|
||||
typedef int value_type;
|
||||
# endif
|
||||
typedef int result_type;
|
||||
# define DIST(x, y) result += popcount( (x) ^ (y) )
|
||||
# define DIST_RES(x) (x)
|
||||
#endif
|
||||
|
||||
inline result_type reduce_block(
|
||||
@@ -105,9 +137,7 @@ inline result_type reduce_block(
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(
|
||||
s_query[lidy * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
|
||||
}
|
||||
return DIST_RES(result);
|
||||
}
|
||||
@@ -123,11 +153,9 @@ inline result_type reduce_block_match(
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(
|
||||
s_query[lidy * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);
|
||||
}
|
||||
return (result);
|
||||
return result;
|
||||
}
|
||||
|
||||
inline result_type reduce_multi_block(
|
||||
@@ -142,23 +170,16 @@ inline result_type reduce_multi_block(
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(
|
||||
s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
|
||||
local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@@ -170,17 +191,26 @@ __kernel void BruteForceMatch_UnrollMatch(
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
|
||||
const int queryOffset = min(queryIdx, query_rows - 1) * step;
|
||||
__global TN *query_vec = (__global TN *)(query + queryOffset);
|
||||
query_cols /= kercn;
|
||||
|
||||
int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
__local float sharebuffer[SHARED_MEM_SZ];
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
|
||||
#if 0 < MAX_DESC_LEN
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
// load the query into local memory.
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
|
||||
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
|
||||
{
|
||||
int loadx = lidx + i * BLOCK_SIZE;
|
||||
s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
const int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
|
||||
}
|
||||
#else
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
|
||||
#endif
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
@@ -189,12 +219,16 @@ __kernel void BruteForceMatch_UnrollMatch(
|
||||
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
|
||||
{
|
||||
result_type result = 0;
|
||||
|
||||
const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
|
||||
__global TN *train_vec = (__global TN *)(train + trainOffset);
|
||||
#if 0 < MAX_DESC_LEN
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
const int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -203,89 +237,18 @@ __kernel void BruteForceMatch_UnrollMatch(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
#else
|
||||
for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)
|
||||
{
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
__local float *s_distance = (__local float*)(sharebuffer);
|
||||
__local int* s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
//find BestMatch
|
||||
s_distance += lidy * BLOCK_SIZE;
|
||||
s_trainIdx += lidy * BLOCK_SIZE;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
#pragma unroll
|
||||
for (int k = 0 ; k < BLOCK_SIZE; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
const int loadx = mad24(i, BLOCK_SIZE, lidx);
|
||||
//load query and train into local memory
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = 0;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = 0;
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -294,10 +257,10 @@ __kernel void BruteForceMatch_Match(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
#endif
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
@@ -309,17 +272,18 @@ __kernel void BruteForceMatch_Match(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * BLOCK_SIZE;
|
||||
s_trainIdx += lidy * BLOCK_SIZE;
|
||||
s_distance += lidy * BLOCK_SIZE_ODD;
|
||||
s_trainIdx += lidy * BLOCK_SIZE_ODD;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
#pragma unroll
|
||||
for (int k = 0 ; k < BLOCK_SIZE; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
@@ -336,76 +300,14 @@ __kernel void BruteForceMatch_Match(
|
||||
}
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if(ind < bestTrainIdx_cols)
|
||||
{
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@@ -420,20 +322,34 @@ __kernel void BruteForceMatch_RadiusMatch(
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||
const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);
|
||||
const int queryOffset = min(queryIdx, query_rows - 1) * step;
|
||||
__global TN *query_vec = (__global TN *)(query + queryOffset);
|
||||
|
||||
const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);
|
||||
const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;
|
||||
__global TN *train_vec = (__global TN *)(train + trainOffset);
|
||||
|
||||
query_cols /= kercn;
|
||||
|
||||
__local float sharebuffer[SHARED_MEM_SZ];
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
|
||||
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
const int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = loadx < query_cols ? train[min(groupidx * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
|
||||
}
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -442,28 +358,23 @@ __kernel void BruteForceMatch_RadiusMatch(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)
|
||||
{
|
||||
int ind = atom_inc(nMatches + queryIdx);
|
||||
|
||||
if(ind < bestTrainIdx_cols)
|
||||
{
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;
|
||||
bestDistance[mad24(queryIdx, ostep, ind)] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@@ -475,31 +386,45 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);
|
||||
const int queryOffset = min(queryIdx, query_rows - 1) * step;
|
||||
__global TN *query_vec = (__global TN *)(query + queryOffset);
|
||||
query_cols /= kercn;
|
||||
|
||||
__local float sharebuffer[SHARED_MEM_SZ];
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
|
||||
#if 0 < MAX_DESC_LEN
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
// load the query into local memory.
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
|
||||
{
|
||||
int loadx = lidx + i * BLOCK_SIZE;
|
||||
s_query[lidy * MAX_DESC_LEN + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;
|
||||
}
|
||||
#else
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;
|
||||
#endif
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)
|
||||
{
|
||||
result_type result = 0;
|
||||
|
||||
int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;
|
||||
__global TN *train_vec = (__global TN *)(train + trainOffset);
|
||||
#if 0 < MAX_DESC_LEN
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
const int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -508,10 +433,30 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
#else
|
||||
for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)
|
||||
{
|
||||
const int loadx = mad24(BLOCK_SIZE, i, lidx);
|
||||
//load query and train into local memory
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = 0;
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[mad24(BLOCK_SIZE_ODD, lidy, lidx)] = query_vec[loadx];
|
||||
s_train[mad24(BLOCK_SIZE_ODD, lidx, lidy)] = train_vec[loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block_match(s_query, s_train, lidx, lidy);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
#endif
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
const int trainIdx = mad24(BLOCK_SIZE, t, lidx);
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
{
|
||||
@@ -532,13 +477,12 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * BLOCK_SIZE;
|
||||
s_trainIdx += lidy * BLOCK_SIZE;
|
||||
|
||||
s_distance += lidy * BLOCK_SIZE_ODD;
|
||||
s_trainIdx += lidy * BLOCK_SIZE_ODD;
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
@@ -601,189 +545,4 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
result_type result = 0.0f;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = 0;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * BLOCK_SIZE + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block_match(s_query, s_train, lidx, lidy);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * BLOCK_SIZE;
|
||||
s_trainIdx += lidy * BLOCK_SIZE;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < BLOCK_SIZE ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user