Merge pull request #15623 from ChipKerchner:optimizeHOGpipeline
* Use circular LUT history buffer in computeGradient of HOG * Initialize prefetch data outside main loop. Avoid code duplication.
This commit is contained in:
parent
732657cc46
commit
a71ff50130
@ -299,6 +299,11 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
|
||||
Mat Dy(1, width, CV_32F, dbuf + width);
|
||||
Mat Mag(1, width, CV_32F, dbuf + width*2);
|
||||
Mat Angle(1, width, CV_32F, dbuf + width*3);
|
||||
#if CV_SIMD128
|
||||
int widthP2 = width+2;
|
||||
AutoBuffer<float> _lutBuf(9*widthP2);
|
||||
float* const lutBuf = _lutBuf.data();
|
||||
#endif
|
||||
|
||||
if (cn == 3)
|
||||
{
|
||||
@ -317,6 +322,63 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
|
||||
xmap += 1;
|
||||
}
|
||||
|
||||
#if CV_SIMD128
|
||||
typedef const uchar* const T;
|
||||
float *lutPrev, *lutCurr, *lutNext;
|
||||
{
|
||||
y = 0;
|
||||
const uchar* imgPtr = img.ptr(ymap[y]);
|
||||
const uchar* prevPtr = img.data + img.step*ymap[y-1];
|
||||
|
||||
lutPrev = lutBuf+widthP2*0;
|
||||
lutCurr = lutBuf+widthP2*3;
|
||||
|
||||
{
|
||||
int x0 = xmap[-1], x1 = xmap[0];
|
||||
T p02 = imgPtr + x0, p12 = imgPtr + x1;
|
||||
|
||||
lutPrev[0+widthP2*0] = lut[prevPtr[x0+0]];
|
||||
lutPrev[0+widthP2*1] = lut[prevPtr[x0+1]];
|
||||
lutPrev[0+widthP2*2] = lut[prevPtr[x0+2]];
|
||||
lutCurr[0+widthP2*0] = lut[p02[0]]; lutCurr[1+widthP2*0] = lut[p12[0]];
|
||||
lutCurr[0+widthP2*1] = lut[p02[1]]; lutCurr[1+widthP2*1] = lut[p12[1]];
|
||||
lutCurr[0+widthP2*2] = lut[p02[2]]; lutCurr[1+widthP2*2] = lut[p12[2]];
|
||||
}
|
||||
|
||||
for( x = 0; x <= width - 4; x += 4 )
|
||||
{
|
||||
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
|
||||
T p02 = imgPtr + xmap[x+1];
|
||||
T p12 = imgPtr + xmap[x+2];
|
||||
T p22 = imgPtr + xmap[x+3];
|
||||
T p32 = imgPtr + xmap[x+4];
|
||||
|
||||
v_float32x4 _dx00 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]);
|
||||
v_float32x4 _dx10 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]);
|
||||
v_float32x4 _dx20 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]);
|
||||
|
||||
v_store(lutCurr+x+widthP2*0+2, _dx00);
|
||||
v_store(lutCurr+x+widthP2*1+2, _dx10);
|
||||
v_store(lutCurr+x+widthP2*2+2, _dx20);
|
||||
|
||||
v_float32x4 _dy00 = v_float32x4(lut[prevPtr[x0+0]], lut[prevPtr[x1+0]], lut[prevPtr[x2+0]], lut[prevPtr[x3+0]]);
|
||||
v_float32x4 _dy10 = v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
|
||||
v_float32x4 _dy20 = v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
|
||||
|
||||
v_store(lutPrev+x+widthP2*0+1, _dy00);
|
||||
v_store(lutPrev+x+widthP2*1+1, _dy10);
|
||||
v_store(lutPrev+x+widthP2*2+1, _dy20);
|
||||
}
|
||||
{
|
||||
int x0 = xmap[x];
|
||||
|
||||
lutPrev[x+widthP2*0+1] = lut[prevPtr[x0+0]];
|
||||
lutPrev[x+widthP2*1+1] = lut[prevPtr[x0+1]];
|
||||
lutPrev[x+widthP2*2+1] = lut[prevPtr[x0+2]];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
float angleScale = signedGradient ? (float)(nbins/(2.0*CV_PI)) : (float)(nbins/CV_PI);
|
||||
for( y = 0; y < gradsize.height; y++ )
|
||||
{
|
||||
@ -342,28 +404,57 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
|
||||
{
|
||||
x = 0;
|
||||
#if CV_SIMD128
|
||||
int yMod = y%3;
|
||||
|
||||
// Circular lut history buffer
|
||||
if (yMod == 0)
|
||||
{
|
||||
lutPrev = lutBuf+widthP2*0;
|
||||
lutCurr = lutBuf+widthP2*3;
|
||||
lutNext = lutBuf+widthP2*6;
|
||||
}
|
||||
else if (yMod == 1)
|
||||
{
|
||||
lutPrev = lutBuf+widthP2*3;
|
||||
lutCurr = lutBuf+widthP2*6;
|
||||
lutNext = lutBuf+widthP2*0;
|
||||
}
|
||||
else
|
||||
{
|
||||
lutPrev = lutBuf+widthP2*6;
|
||||
lutCurr = lutBuf+widthP2*0;
|
||||
lutNext = lutBuf+widthP2*3;
|
||||
}
|
||||
|
||||
{
|
||||
int x0 = xmap[-1];
|
||||
|
||||
lutNext[0+widthP2*0] = lut[nextPtr[x0+0]];
|
||||
lutNext[0+widthP2*1] = lut[nextPtr[x0+1]];
|
||||
lutNext[0+widthP2*2] = lut[nextPtr[x0+2]];
|
||||
}
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
|
||||
typedef const uchar* const T;
|
||||
T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
|
||||
T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
|
||||
T p22 = imgPtr + xmap[x+3], p20 = p02;
|
||||
T p32 = imgPtr + xmap[x+4], p30 = p12;
|
||||
|
||||
v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
|
||||
v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
|
||||
v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
|
||||
v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
|
||||
v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
|
||||
v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);
|
||||
v_float32x4 _dx0 = v_load(lutCurr+x+widthP2*0+2) - v_load(lutCurr+x+widthP2*0);
|
||||
v_float32x4 _dx1 = v_load(lutCurr+x+widthP2*1+2) - v_load(lutCurr+x+widthP2*1);
|
||||
v_float32x4 _dx2 = v_load(lutCurr+x+widthP2*2+2) - v_load(lutCurr+x+widthP2*2);
|
||||
|
||||
v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
|
||||
v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
|
||||
v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
|
||||
v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
|
||||
v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
|
||||
v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
|
||||
v_float32x4 _dy00 = v_float32x4(lut[nextPtr[x0+0]], lut[nextPtr[x1+0]], lut[nextPtr[x2+0]], lut[nextPtr[x3+0]]);
|
||||
v_float32x4 _dy0 = _dy00 - v_load(lutPrev+x+widthP2*0+1);
|
||||
|
||||
v_store(lutNext+x+widthP2*0+1, _dy00);
|
||||
|
||||
v_float32x4 _dy10 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]);
|
||||
v_float32x4 _dy1 = _dy10 - v_load(lutPrev+x+widthP2*1+1);
|
||||
|
||||
v_store(lutNext+x+widthP2*1+1, _dy10);
|
||||
|
||||
v_float32x4 _dy20 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]);
|
||||
v_float32x4 _dy2 = _dy20 - v_load(lutPrev+x+widthP2*2+1);
|
||||
|
||||
v_store(lutNext+x+widthP2*2+1, _dy20);
|
||||
|
||||
v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
|
||||
v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
|
||||
@ -380,6 +471,13 @@ void HOGDescriptor::computeGradient(const Mat& img, Mat& grad, Mat& qangle,
|
||||
v_store(dbuf + x, _dx2);
|
||||
v_store(dbuf + x + width, _dy2);
|
||||
}
|
||||
{
|
||||
int x0 = xmap[x];
|
||||
|
||||
lutNext[x+widthP2*0+1] = lut[nextPtr[x0+0]];
|
||||
lutNext[x+widthP2*1+1] = lut[nextPtr[x0+1]];
|
||||
lutNext[x+widthP2*2+1] = lut[nextPtr[x0+2]];
|
||||
}
|
||||
#endif
|
||||
for( ; x < width; x++ )
|
||||
{
|
||||
|
||||
Loading…
Reference in New Issue
Block a user