cv::resize (INTER_LINEAR && INTER_CUBIC)
parent 00b8f0dec1
commit 183e378bd0
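For context: the VResize* functors touched below implement the vertical (row-blending) pass of cv::resize, so the new NEON paths are reached through the ordinary public API. A minimal sketch of how the affected INTER_LINEAR and INTER_CUBIC paths might be exercised on a 32-bit float image (image size and scale factors are arbitrary placeholders):

#include <opencv2/imgproc.hpp>

int main()
{
    // Single-channel float input; the vectorized VResize* functors handle
    // the vertical row-blending step for CV_32F data.
    cv::Mat src(480, 640, CV_32FC1, cv::Scalar(0.5f)), dst_linear, dst_cubic;

    // Bilinear and bicubic resize; the scale factors are placeholders.
    cv::resize(src, dst_linear, cv::Size(), 0.75, 0.75, cv::INTER_LINEAR);
    cv::resize(src, dst_cubic,  cv::Size(), 1.5,  1.5,  cv::INTER_CUBIC);
    return 0;
}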
@@ -49,8 +49,6 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 
-#include <iostream>
-
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 static IppStatus sts = ippInit();
 #endif
@@ -896,6 +894,183 @@ struct VResizeCubicVec_32f
     }
 };
 
+#elif CV_NEON
+
+typedef VResizeNoVec VResizeLinearVec_32s8u;
+
+struct VResizeLinearVec_32f16u
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1];
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
+            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
+
+            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
+            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
+
+            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
+                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
+        }
+
+        return x;
+    }
+};
+
+struct VResizeLinearVec_32f16s
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1];
+        short* dst = (short*)_dst;
+        int x = 0;
+
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
+            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
+
+            float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
+            float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
+                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
+        }
+
+        return x;
+    }
+};
+
+struct VResizeLinearVec_32f
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1];
+        float* dst = (float*)_dst;
+        int x = 0;
+
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
+            float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
+
+            vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
+            vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
+        }
+
+        return x;
+    }
+};
+
+typedef VResizeNoVec VResizeCubicVec_32s8u;
+
+struct VResizeCubicVec_32f16u
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+        ushort* dst = (ushort*)_dst;
+        int x = 0;
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
+                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
+                                                               v_b1, vld1q_f32(S1 + x)),
+                                                     v_b2, vld1q_f32(S2 + x)),
+                                           v_b3, vld1q_f32(S3 + x));
+            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
+                                                               v_b1, vld1q_f32(S1 + x + 4)),
+                                                     v_b2, vld1q_f32(S2 + x + 4)),
+                                           v_b3, vld1q_f32(S3 + x + 4));
+
+            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
+                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
+        }
+
+        return x;
+    }
+};
+
+struct VResizeCubicVec_32f16s
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+        short* dst = (short*)_dst;
+        int x = 0;
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
+                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
+                                                               v_b1, vld1q_f32(S1 + x)),
+                                                     v_b2, vld1q_f32(S2 + x)),
+                                           v_b3, vld1q_f32(S3 + x));
+            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
+                                                               v_b1, vld1q_f32(S1 + x + 4)),
+                                                     v_b2, vld1q_f32(S2 + x + 4)),
+                                           v_b3, vld1q_f32(S3 + x + 4));
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
+                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
+        }
+
+        return x;
+    }
+};
+
+struct VResizeCubicVec_32f
+{
+    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
+    {
+        const float** src = (const float**)_src;
+        const float* beta = (const float*)_beta;
+        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
+        float* dst = (float*)_dst;
+        int x = 0;
+        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
+                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
+
+        for( ; x <= width - 8; x += 8 )
+        {
+            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
+                                                             v_b1, vld1q_f32(S1 + x)),
+                                                   v_b2, vld1q_f32(S2 + x)),
+                                         v_b3, vld1q_f32(S3 + x)));
+            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
+                                                                 v_b1, vld1q_f32(S1 + x + 4)),
+                                                       v_b2, vld1q_f32(S2 + x + 4)),
+                                             v_b3, vld1q_f32(S3 + x + 4)));
+        }
+
+        return x;
+    }
+};
+
 #else
 
 typedef VResizeNoVec VResizeLinearVec_32s8u;
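As a reading aid (not part of the commit): each VResize*Vec functor above blends two (linear) or four (cubic) source rows with the beta weights and returns x, the number of columns it handled with NEON, so the caller's scalar code can finish any tail where width is not a multiple of 8. A rough scalar equivalent of the 32f cases, with illustrative function names:

// Illustration only: scalar form of the vertical row blending that
// VResizeLinearVec_32f / VResizeCubicVec_32f vectorize 8 floats at a time.
static void vresize_linear_scalar(const float* S0, const float* S1,
                                  float* dst, const float* beta, int width)
{
    const float b0 = beta[0], b1 = beta[1];
    for (int x = 0; x < width; x++)
        dst[x] = S0[x] * b0 + S1[x] * b1;                            // two-row weighted sum
}

static void vresize_cubic_scalar(const float* S0, const float* S1,
                                 const float* S2, const float* S3,
                                 float* dst, const float* beta, int width)
{
    const float b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
    for (int x = 0; x < width; x++)
        dst[x] = S0[x] * b0 + S1[x] * b1 + S2[x] * b2 + S3[x] * b3;  // four-row weighted sum
}

The 16u and 16s variants compute the same sums and then round and saturate the result (cv_vrndq_*_f32 followed by vqmovn_*) before storing.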