diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index fe157f8684..b7f02c5252 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1987,6 +1987,238 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) namespace cv { +template +struct Mul_SIMD +{ + int operator() (const T *, const T *, T *, int, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Mul_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, 
float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + 
vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = 
vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + v_dst1 = vmulq_f32(v_dst1, v_scale); + + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + } + + return x; + } +}; + +#endif + template static void mul_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, WT scale ) @@ -1995,11 +2227,13 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + Mul_SIMD vop; + if( scale == (WT)1. 
) { for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i=0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for(; i <= size.width - 4; i += 4 ) { @@ -2024,7 +2258,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, { for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i = 0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for(; i <= size.width - 4; i += 4 ) { @@ -2367,6 +2601,114 @@ void cv::divide(double scale, InputArray src2, namespace cv { +template +struct AddWeighted_SIMD +{ + int operator() (const T *, const T *, T *, int, WT, WT, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct AddWeighted_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + int8x8_t in1 = vld1_s8(src1 + x); + int16x8_t in1_16 = vmovl_s8(in1); + float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); + + int8x8_t in2 = vld1_s8(src2+x); + int16x8_t in2_16 = vmovl_s8(in2); + float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); + + int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); + int8x8_t out = vqmovn_s16(out_16); + + vst1_s8(dst + x, out); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() 
(const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); + uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); + uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); + int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); + int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); + } + + return x; + } +}; + +#endif + template static void addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, void* _scalars ) @@ -2377,9 +2719,11 @@ addWeighted_( 
const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + AddWeighted_SIMD vop; + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int x = 0; + int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { @@ -2457,8 +2801,8 @@ addWeighted8u( const uchar* src1, size_t step1, out_f_l = vaddq_f32(out_f_l, g); out_f_h = vaddq_f32(out_f_h, g); - uint16x4_t out_16_l = vqmovun_s32(vcvtq_s32_f32(out_f_l)); - uint16x4_t out_16_h = vqmovun_s32(vcvtq_s32_f32(out_f_h)); + uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); + uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); uint8x8_t out = vqmovn_u16(out_16); @@ -2557,6 +2901,213 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2, namespace cv { +template +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int) + { + } + + int operator () (const T *, const T *, uchar *, int) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdupq_n_u8(255); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); + + return x; + } + + int code; + uint8x16_t 
v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, 
vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const float * src1, const float * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, 
vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +#endif + template static void cmp_(const T* src1, size_t step1, const T* src2, size_t step2, uchar* dst, size_t step, Size size, int code) @@ -2570,12 +3121,14 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, code = code == CMP_GE ? CMP_LE : CMP_GT; } + Cmp_SIMD vop(code); + if( code == CMP_GT || code == CMP_LE ) { int m = code == CMP_GT ? 0 : 255; for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int x = 0; + int x = vop(src1, src2, dst, size.width); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { @@ -2590,7 +3143,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, #endif for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } + } } else if( code == CMP_EQ || code == CMP_NE ) { diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 1eeb0ecdba..6a7b1ef52e 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1480,6 +1480,724 @@ cvtScaleAbs_( const T* src, size_t sstep, } } +template +struct cvtScale_SIMD +{ + int operator () (const T *, DT *, int, WT, WT) const + { + return 0; + } +}; + +#if CV_NEON + +// from uchar + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), 
v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from schar + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 
8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, short * dst, int width, float scale, 
float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from ushort + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t 
v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int 
operator () (const ushort * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from short + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, uchar * dst, int width, float scale, float 
shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template 
<> +struct cvtScale_SIMD +{ + int operator () (const short * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from int + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + 
float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +// from float + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; 
x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 4; x += 4) + vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int 
operator () (const float * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 4; x += 4) + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); + + return x; + } +}; + +#endif + template static void cvtScale_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size, @@ -1488,9 +2206,11 @@ cvtScale_( const T* src, size_t sstep, sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); + cvtScale_SIMD vop; + for( ; size.height--; src += sstep, dst += dstep ) { - int x = 0; + int x = vop(src, dst, size.width, scale, shift); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) @@ -1755,6 +2475,25 @@ struct Cvt_SIMD } }; +template <> +struct Cvt_SIMD +{ + int operator() (const schar * src, ushort * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))), + vqmovun_s32(vmovl_s16(vget_high_s16(v_src))))); + } + + return x; + } +}; + + template <> struct Cvt_SIMD { @@ -1810,6 +2549,49 @@ struct Cvt_SIMD } }; +template <> +struct Cvt_SIMD +{ + int operator() (const ushort * src, schar * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 16; x += 16) + { + uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8); + int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1))); + int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1))); + int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2))); + int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2))); + + vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))), + vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21))))); + } + + return x; + } +}; + +template <> +struct 
Cvt_SIMD +{ + int operator() (const ushort * src, short * dst, int width) const + { + int x = 0; + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))); + int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); + } + + return x; + } +}; + template <> struct Cvt_SIMD { diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index c5ce6d0fbf..2501fb2d0c 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2804,7 +2804,8 @@ dotProd_(const T* src1, const T* src2, int len) { int i = 0; double result = 0; - #if CV_ENABLE_UNROLLED + + #if CV_ENABLE_UNROLLED for( ; i <= len - 4; i += 4 ) result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] + (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; @@ -2833,10 +2834,12 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) { int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; __m128i z = _mm_setzero_si128(); + CV_DECL_ALIGNED(16) int buf[4]; + while( i < len0 ) { blockSize = std::min(len0 - i, blockSize0); - __m128i s = _mm_setzero_si128(); + __m128i s = z; j = 0; for( ; j <= blockSize - 16; j += 16 ) { @@ -2860,7 +2863,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) s0 = _mm_madd_epi16(s0, s1); s = _mm_add_epi32(s, s0); } - CV_DECL_ALIGNED(16) int buf[4]; + _mm_store_si128((__m128i*)buf, s); r += buf[0] + buf[1] + buf[2] + buf[3]; @@ -2869,6 +2872,45 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) i += blockSize; } } +#elif CV_NEON + int len0 = len & -8, blockSize0 = (1 << 15), blockSize; + uint32x4_t v_zero = vdupq_n_u32(0u); + CV_DECL_ALIGNED(16) uint buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + uint32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= 
blockSize - 16; j += 16 ) + { + uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j); + + uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); + + v_src10 = vmovl_u8(vget_high_u8(v_src1)); + v_src20 = vmovl_u8(vget_high_u8(v_src2)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); + } + + for( ; j <= blockSize - 8; j += 8 ) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2)); + } + + vst1q_u32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } #endif return r + dotProd_(src1, src2, len - i); } @@ -2876,7 +2918,51 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) static double dotProd_8s(const schar* src1, const schar* src2, int len) { - return dotProd_(src1, src2, len); + int i = 0; + double r = 0.0; + +#if CV_NEON + int len0 = len & -8, blockSize0 = (1 << 14), blockSize; + int32x4_t v_zero = vdupq_n_s32(0); + CV_DECL_ALIGNED(16) int buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + int32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= blockSize - 16; j += 16 ) + { + int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j); + + int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); + + v_src10 = vmovl_s8(vget_high_s8(v_src1)); + v_src20 = 
vmovl_s8(vget_high_s8(v_src2)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); + } + + for( ; j <= blockSize - 8; j += 8 ) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2)); + } + + vst1q_s32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } +#endif + + return r + dotProd_(src1, src2, len - i); } static double dotProd_16u(const ushort* src1, const ushort* src2, int len) @@ -2914,13 +3000,36 @@ static double dotProd_32s(const int* src1, const int* src2, int len) static double dotProd_32f(const float* src1, const float* src2, int len) { + double r = 0.0; + int i = 0; + #if (ARITHM_USE_IPP == 1) - double r = 0; if (0 <= ippsDotProd_32f64f(src1, src2, len, &r)) return r; setIppErrorStatus(); +#elif CV_NEON + int len0 = len & -4, blockSize0 = (1 << 13), blockSize; + float32x4_t v_zero = vdupq_n_f32(0.0f); + CV_DECL_ALIGNED(16) float buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + float32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= blockSize - 4; j += 4 ) + v_sum = vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j)); + + vst1q_f32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } #endif - return dotProd_(src1, src2, len); + return r + dotProd_(src1, src2, len - i); } static double dotProd_64f(const double* src1, const double* src2, int len) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 2cc3c8d348..1abb316186 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -90,20 +90,20 @@ struct Sum_SIMD uint8x16_t v_src = vld1q_u8(src0 + x); uint16x8_t v_half = 
vmovl_u8(vget_low_u8(v_src)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); v_half = vmovl_u8(vget_high_u8(v_src)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); } for ( ; x <= len - 8; x += 8) { uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); } unsigned int CV_DECL_ALIGNED(16) ar[4]; @@ -133,20 +133,20 @@ struct Sum_SIMD int8x16_t v_src = vld1q_s8(src0 + x); int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); v_half = vmovl_s8(vget_high_s8(v_src)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); } for ( ; x <= len - 8; x += 8) { int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); } int CV_DECL_ALIGNED(16) ar[4]; @@ -175,13 +175,13 @@ struct Sum_SIMD { uint16x8_t v_src = vld1q_u16(src0 + x); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src))); - v_sum = vaddq_u32(v_sum, 
vmovl_u16(vget_high_u16(v_src))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); } for ( ; x <= len - 4; x += 4) - v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x))); - + v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x)); + unsigned int CV_DECL_ALIGNED(16) ar[4]; vst1q_u32(ar, v_sum); @@ -208,13 +208,13 @@ struct Sum_SIMD { int16x8_t v_src = vld1q_s16(src0 + x); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); } for ( ; x <= len - 4; x += 4) - v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x))); - + v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x)); + int CV_DECL_ALIGNED(16) ar[4]; vst1q_s32(ar, v_sum); @@ -426,6 +426,38 @@ static int countNonZero8u( const uchar* src, int len ) nz += tab[val & 255] + tab[val >> 8]; } } +#elif CV_NEON + int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1); + const uchar * src0 = src; + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint8x16_t v_pz = v_zero; + + for( ; k <= blockSizej - 16; k += 16 ) + v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1)); + + uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz)); + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz); + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz); + + src0 += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); #endif for( ; i < len; i++ ) nz += src[i] != 0; @@ 
-433,13 +465,116 @@ static int countNonZero8u( const uchar* src, int len ) } static int countNonZero16u( const ushort* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zero; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero32s( const int* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + int32x4_t v_zero = vdupq_n_s32(0.0f); + uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zerou; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)), + vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; 
+ vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero32f( const float* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + float32x4_t v_zero = vdupq_n_f32(0.0f); + uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zerou; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)), + vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero64f( const double* src, int len ) { return countNonZero_(src, len); } @@ -1956,6 +2091,14 @@ float normL1_(const float* a, const float* b, int n) d = buf[0] + buf[1] + buf[2] + buf[3]; } else +#elif CV_NEON + float32x4_t v_sum = vdupq_n_f32(0.0f); + for ( ; j <= n - 4; j += 4) + v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); + + float CV_DECL_ALIGNED(16) buf[4]; + vst1q_f32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) @@ -1996,6 +2139,19 @@ int normL1_(const uchar* a, const uchar* b, int n) d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); } else +#elif CV_NEON + uint32x4_t v_sum = vdupq_n_u32(0.0f); + for ( ; j <= n - 16; j += 16) + { + 
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); + uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); + } + + uint CV_DECL_ALIGNED(16) buf[4]; + vst1q_u32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index a6e31d3a8d..3fc953ef18 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -98,6 +98,11 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, Size size = get<0>(GetParam()); int type = get<1>(GetParam()); BorderType btype = get<2>(GetParam()); + double eps = 1e-3; + +#if CV_NEON + eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : eps; +#endif Mat src(size, type); Mat dst(size, type); @@ -106,7 +111,7 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, TEST_CYCLE() blur(src, dst, Size(16,16), Point(-1,-1), btype); - SANITY_CHECK(dst, 1e-3); + SANITY_CHECK(dst, eps); } PERF_TEST_P(Size_MatType_BorderType3x3, box3x3, diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index 9f23d3443b..b8906a1caa 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -46,10 +46,433 @@ namespace cv { +template +struct Acc_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccSqr_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccProd_SIMD +{ + int operator() (const T *, const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccW_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int, AT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Acc_SIMD +{ + int operator() 
(const uchar * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vld1q_u8(src + x); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct Acc_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x 
+ 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct Acc_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vld1q_f32(src + x))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src + x + 4))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vld1q_u8(src + x); + uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); + uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); + uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src); + uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + else if (cn == 1) + { + uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0); + + for ( ; x <= len - 8; x += 8) + { + uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); + uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); + uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])), + v_src = vandq_u16(vld1q_u16(src + x), v_mask); + + uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src); + uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + float32x4_t v_src = vld1q_f32(src + x); + vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), v_src, v_src)); + + v_src = vld1q_f32(src + x + 4); + vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), v_src, v_src)); + } + } + + return x; + } 
+}; + +template <> +struct AccProd_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_1src = vld1q_u8(src1 + x), v_2src = vld1q_u8(src2 + x); + uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), + v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)); + uint8x16_t v_1src = vandq_u8(vld1q_u8(src1 + x), v_mask), v_2src = vandq_u8(vld1q_u8(src2 + x), v_mask); + uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), + v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct AccProd_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len 
*= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_1src = vld1q_u16(src1 + x), v_2src = vld1q_u16(src2 + x); + uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)), + v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + else if (cn == 1) + { + uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0); + + for ( ; x <= len - 8; x += 8) + { + uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); + uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); + uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])), + v_1src = vandq_u16(vld1q_u16(src1 + x), v_mask), + v_2src = vandq_u16(vld1q_u16(src2 + x), v_mask); + + uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)), + v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct AccProd_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src1 + x), vld1q_f32(src2 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4))); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { 
+ uint8x16_t v_src = vld1q_u8(src + x); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha)); + vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha)); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha)); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vld1q_f32(src + x), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vld1q_f32(src + x + 4), v_alpha)); + } + } + + return x; + } +}; + +#endif + template void acc_( const 
T* src, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = Acc_SIMD()(src, dst, mask, len, cn); if( !mask ) { @@ -107,7 +530,7 @@ acc_( const T* src, AT* dst, const uchar* mask, int len, int cn ) template void accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = AccSqr_SIMD()(src, dst, mask, len, cn); if( !mask ) { @@ -165,7 +588,7 @@ accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn ) template void accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = AccProd_SIMD()(src1, src2, dst, mask, len, cn); if( !mask ) { @@ -224,7 +647,7 @@ template void accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha ) { AT a = (AT)alpha, b = 1 - a; - int i = 0; + int i = AccW_SIMD()(src, dst, mask, len, cn, a); if( !mask ) { diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index fa751c9108..e9e64c571f 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -361,6 +361,15 @@ void cv::Canny( InputArray _src, OutputArray _dst, _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm); } } +#elif CV_NEON + for ( ; j <= width - 8; j += 8) + { + int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); + vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_low_s16(v_dy))))); + vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_high_s16(v_dy))))); + } #endif for ( ; j < width; ++j) _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j])); @@ -386,6 +395,18 @@ void cv::Canny( InputArray _src, OutputArray _dst, _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm); } } +#elif CV_NEON + for ( ; j <= width - 8; j += 8) + { + int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); + int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy); + int32x4_t v_dst = 
vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); + vst1q_s32(_norm + j, v_dst); + + v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy); + v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); + vst1q_s32(_norm + j + 4, v_dst); + } #endif for ( ; j < width; ++j) _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j]; diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index c329148f24..1e5ecc3a7e 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -233,6 +233,31 @@ namespace CLAHE_Interpolation_Body(const cv::Mat& src, const cv::Mat& dst, const cv::Mat& lut, const cv::Size& tileSize, const int& tilesX, const int& tilesY) : src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY) { + buf.allocate(src.cols << 2); + ind1_p = (int *)buf; + ind2_p = ind1_p + src.cols; + xa_p = (float *)(ind2_p + src.cols); + xa1_p = xa_p + src.cols; + + int lut_step = static_cast(lut_.step / sizeof(T)); + float inv_tw = 1.0f / tileSize_.width; + + for (int x = 0; x < src.cols; ++x) + { + float txf = x * inv_tw - 0.5f; + + int tx1 = cvFloor(txf); + int tx2 = tx1 + 1; + + xa_p[x] = txf - tx1; + xa1_p[x] = 1.0f - xa_p[x]; + + tx1 = std::max(tx1, 0); + tx2 = std::min(tx2, tilesX_ - 1); + + ind1_p[x] = tx1 * lut_step; + ind2_p[x] = tx2 * lut_step; + } } void operator ()(const cv::Range& range) const; @@ -245,24 +270,28 @@ namespace cv::Size tileSize_; int tilesX_; int tilesY_; + + cv::AutoBuffer buf; + int * ind1_p, * ind2_p; + float * xa_p, * xa1_p; }; template void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const { - const size_t lut_step = lut_.step / sizeof(T); + float inv_th = 1.0f / tileSize_.height; for (int y = range.start; y < range.end; ++y) { const T* srcRow = src_.ptr(y); T* dstRow = dst_.ptr(y); - const float tyf = (static_cast(y) / tileSize_.height) - 0.5f; + float tyf = y * inv_th - 0.5f; int ty1 = cvFloor(tyf); int ty2 = ty1 + 1; - const float ya = tyf - ty1; + float ya 
= tyf - ty1, ya1 = 1.0f - ya; ty1 = std::max(ty1, 0); ty2 = std::min(ty2, tilesY_ - 1); @@ -272,27 +301,13 @@ namespace for (int x = 0; x < src_.cols; ++x) { - const float txf = (static_cast(x) / tileSize_.width) - 0.5f; + int srcVal = srcRow[x]; - int tx1 = cvFloor(txf); - int tx2 = tx1 + 1; + int ind1 = ind1_p[x] + srcVal; + int ind2 = ind2_p[x] + srcVal; - const float xa = txf - tx1; - - tx1 = std::max(tx1, 0); - tx2 = std::min(tx2, tilesX_ - 1); - - const int srcVal = srcRow[x]; - - const size_t ind1 = tx1 * lut_step + srcVal; - const size_t ind2 = tx2 * lut_step + srcVal; - - float res = 0; - - res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya)); - res += lutPlane1[ind2] * ((xa) * (1.0f - ya)); - res += lutPlane2[ind1] * ((1.0f - xa) * (ya)); - res += lutPlane2[ind2] * ((xa) * (ya)); + float res = (lutPlane1[ind1] * xa1_p[x] + lutPlane1[ind2] * xa_p[x]) * ya1 + + (lutPlane2[ind1] * xa1_p[x] + lutPlane2[ind2] * xa_p[x]) * ya; dstRow[x] = cv::saturate_cast(res); } @@ -403,7 +418,9 @@ namespace calcLutBody = cv::makePtr >(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale); else if (_src.type() == CV_16UC1) calcLutBody = cv::makePtr >(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale); - CV_Assert(!calcLutBody.empty()); + else + CV_Error( CV_StsBadArg, "Unsupported type" ); + cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), *calcLutBody); cv::Ptr interpolationBody; @@ -411,7 +428,7 @@ namespace interpolationBody = cv::makePtr >(src, dst, lut_, tileSize, tilesX_, tilesY_); else if (_src.type() == CV_16UC1) interpolationBody = cv::makePtr >(src, dst, lut_, tileSize, tilesX_, tilesY_); - CV_Assert(!interpolationBody.empty()); + cv::parallel_for_(cv::Range(0, src.rows), *interpolationBody); } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 78236e03cf..9a00701731 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -580,6 +580,143 @@ template struct RGB2RGB int srccn, dstcn, blueIdx; }; +#if 
CV_NEON + +template<> struct RGB2RGB +{ + typedef uchar channel_type; + + RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : + srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) + { + v_alpha = vdupq_n_u8(ColorChannel::max()); + v_alpha2 = vget_low_u8(v_alpha); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0; + if (dcn == 3) + { + n *= 3; + if (scn == 3) + { + for ( ; i <= n - 48; i += 48, src += 48 ) + { + uint8x16x3_t v_src = vld3q_u8(src), v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3q_u8(dst + i, v_dst); + } + for ( ; i <= n - 24; i += 24, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src), v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3_u8(dst + i, v_dst); + } + for ( ; i < n; i += 3, src += 3 ) + { + uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2]; + dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2; + } + } + else + { + for ( ; i <= n - 48; i += 48, src += 64 ) + { + uint8x16x4_t v_src = vld4q_u8(src); + uint8x16x3_t v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3q_u8(dst + i, v_dst); + } + for ( ; i <= n - 24; i += 24, src += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src); + uint8x8x3_t v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3_u8(dst + i, v_dst); + } + for ( ; i < n; i += 3, src += 4 ) + { + uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2]; + dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2; + } + } + } + else if (scn == 3) + { + n *= 3; + for ( ; i <= n - 48; i += 48, dst += 64 ) + { + uint8x16x3_t v_src = vld3q_u8(src + i); + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[bidx ^ 2] = v_src.val[2]; + v_dst.val[3] = v_alpha; + vst4q_u8(dst, v_dst); + } + for ( ; i 
<= n - 24; i += 24, dst += 32 ) + { + uint8x8x3_t v_src = vld3_u8(src + i); + uint8x8x4_t v_dst; + v_dst.val[bidx] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[bidx ^ 2] = v_src.val[2]; + v_dst.val[3] = v_alpha2; + vst4_u8(dst, v_dst); + } + uchar alpha = ColorChannel::max(); + for (; i < n; i += 3, dst += 4 ) + { + uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2]; + dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha; + } + } + else + { + n *= 4; + for ( ; i <= n - 64; i += 64 ) + { + uint8x16x4_t v_src = vld4q_u8(src + i), v_dst; + v_dst.val[0] = v_src.val[2]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[0]; + v_dst.val[3] = v_src.val[3]; + vst4q_u8(dst + i, v_dst); + } + for ( ; i <= n - 32; i += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src + i), v_dst; + v_dst.val[0] = v_src.val[2]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[0]; + v_dst.val[3] = v_src.val[3]; + vst4_u8(dst + i, v_dst); + } + for ( ; i < n; i += 4) + { + uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3]; + dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3; + } + } + } + + int srccn, dstcn, blueIdx; + + uint8x16_t v_alpha; + uint8x8_t v_alpha2; +}; + +#endif + /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// struct RGB5x52RGB @@ -587,13 +724,51 @@ struct RGB5x52RGB typedef uchar channel_type; RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) - : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) {} + : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) + { + #if CV_NEON + v_n3 = vdupq_n_u16(~3); + v_n7 = vdupq_n_u16(~7); + v_255 = vdupq_n_u8(255); + v_0 = vdupq_n_u8(0); + v_mask = vdupq_n_u16(0x8000); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { - int dcn = dstcn, bidx = blueIdx; + int dcn = dstcn, bidx = blueIdx, i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++, dst += dcn ) + { + #if CV_NEON + for ( ; i <= n - 16; i += 16, dst += dcn * 
16) + { + uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); + uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); + uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3))); + uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7))); + if (dcn == 3) + { + uint8x16x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u8(dst, v_dst); + } + else + { + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_255; + vst4q_u8(dst, v_dst); + } + } + #endif + for( ; i < n; i++, dst += dcn ) { unsigned t = ((const ushort*)src)[i]; dst[bidx] = (uchar)(t << 3); @@ -602,8 +777,39 @@ struct RGB5x52RGB if( dcn == 4 ) dst[3] = 255; } + } else - for( int i = 0; i < n; i++, dst += dcn ) + { + #if CV_NEON + for ( ; i <= n - 16; i += 16, dst += dcn * 16) + { + uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); + uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); + uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7))); + uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7))); + if (dcn == 3) + { + uint8x16x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u8(dst, v_dst); + } + else + { + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)), + vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0); + vst4q_u8(dst, v_dst); + 
} + } + #endif + for( ; i < n; i++, dst += dcn ) { unsigned t = ((const ushort*)src)[i]; dst[bidx] = (uchar)(t << 3); @@ -612,9 +818,14 @@ struct RGB5x52RGB if( dcn == 4 ) dst[3] = t & 0x8000 ? 255 : 0; } + } } int dstcn, blueIdx, greenBits; + #if CV_NEON + uint16x8_t v_n3, v_n7, v_mask; + uint8x16_t v_255, v_0; + #endif }; @@ -623,30 +834,92 @@ struct RGB2RGB5x5 typedef uchar channel_type; RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) - : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) {} + : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) + { + #if CV_NEON + v_n3 = vdup_n_u8(~3); + v_n7 = vdup_n_u8(~7); + v_mask = vdupq_n_u16(0x8000); + v_0 = vdupq_n_u16(0); + v_full = vdupq_n_u16(0xffff); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { - int scn = srccn, bidx = blueIdx; - if( greenBits == 6 ) - for( int i = 0; i < n; i++, src += scn ) + int scn = srccn, bidx = blueIdx, i = 0; + if (greenBits == 6) + { + if (scn == 3) { - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 3 ) + ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); } - else if( scn == 3 ) - for( int i = 0; i < n; i++, src += 3 ) + else { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); + 
vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 4 ) + ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + } + } + else if (scn == 3) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 3 ) ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7)); - } + } else - for( int i = 0; i < n; i++, src += 4 ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 32 ) { + uint8x8x4_t v_src = vld4_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); + v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7), + vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0))); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 4 ) ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)| ((src[bidx^2]&~7) << 7)|(src[3] ? 
0x8000 : 0)); - } + } } int srccn, blueIdx, greenBits; + #if CV_NEON + uint8x8_t v_n3, v_n7; + uint16x8_t v_mask, v_0, v_full; + #endif }; ///////////////////////////////// Color to/from Grayscale //////////////////////////////// @@ -683,23 +956,57 @@ struct Gray2RGB5x5 { typedef uchar channel_type; - Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) {} + Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) + { + #if CV_NEON + v_n7 = vdup_n_u8(~7); + v_n3 = vdup_n_u8(~3); + #endif + } + void operator()(const uchar* src, uchar* dst, int n) const { + int i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8 ) + { + uint8x8_t v_src = vld1_u8(src + i); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++ ) { int t = src[i]; ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8)); } + } else - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8 ) + { + uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3)); + uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for( ; i < n; i++ ) { int t = src[i] >> 3; ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10)); } + } } int greenBits; + + #if CV_NEON + uint8x8_t v_n7, v_n3; + #endif }; @@ -722,27 +1029,85 @@ struct RGB5x52Gray { typedef uchar channel_type; - RGB5x52Gray(int _greenBits) : greenBits(_greenBits) {} + RGB5x52Gray(int _greenBits) : greenBits(_greenBits) + { + #if CV_NEON + v_b2y = vdup_n_u16(B2Y); + v_g2y = vdup_n_u16(G2Y); + v_r2y = vdup_n_u16(R2Y); + v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); + v_f8 = vdupq_n_u16(0xf8); + v_fc = vdupq_n_u16(0xfc); + #endif + } + void operator()(const uchar* src, uchar* 
dst, int n) const { + int i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8) + { + uint16x8_t v_src = vld1q_u16((ushort *)src + i); + uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), + v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc), + v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8); + + uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), + vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); + uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), + vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); + + vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); + } + #endif + for ( ; i < n; i++) { int t = ((ushort*)src)[i]; dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + ((t >> 3) & 0xfc)*G2Y + ((t >> 8) & 0xf8)*R2Y, yuv_shift); } + } else - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8) + { + uint16x8_t v_src = vld1q_u16((ushort *)src + i); + uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), + v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8), + v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8); + + uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), + vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); + uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), + vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); + + vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); + } + #endif + for ( ; i < n; i++) { int t = ((ushort*)src)[i]; dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + ((t >> 2) & 0xf8)*G2Y + ((t >> 7) & 0xf8)*R2Y, yuv_shift); } + } } int 
greenBits; + + #if CV_NEON + uint16x4_t v_b2y, v_g2y, v_r2y; + uint32x4_t v_delta; + uint16x8_t v_f8, v_fc; + #endif }; @@ -769,7 +1134,6 @@ template struct RGB2Gray float coeffs[3]; }; - template<> struct RGB2Gray { typedef uchar channel_type; @@ -800,6 +1164,166 @@ template<> struct RGB2Gray int tab[256*3]; }; +#if CV_NEON + +template <> +struct RGB2Gray +{ + typedef ushort channel_type; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : + srccn(_srccn) + { + static const int coeffs0[] = { R2Y, G2Y, B2Y }; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); + if( blueIdx == 0 ) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdup_n_u16(coeffs[0]); + v_cg = vdup_n_u16(coeffs[1]); + v_cr = vdup_n_u16(coeffs[2]); + v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + uint16x8_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x8x3_t v_src = vld3q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { + uint16x8x4_t v_src = vld4q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_low_u16(v_b), v_cb), + vget_low_u16(v_g), v_cg), + vget_low_u16(v_r), v_cr); + uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_high_u16(v_b), v_cb), + vget_high_u16(v_g), v_cg), + vget_high_u16(v_r), v_cr); + + uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift)); + uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift)); + + vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + uint16x4_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { 
+ uint16x4x4_t v_src = vld4_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst = vmlal_u16(vmlal_u16( + vmull_u16(v_b, v_cb), + v_g, v_cg), + v_r, v_cr); + + vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift))); + } + + for( ; i < n; i++, src += scn) + dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); + } + + int srccn, coeffs[3]; + uint16x4_t v_cb, v_cg, v_cr; + uint32x4_t v_delta; +}; + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; + memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdupq_n_f32(coeffs[0]); + v_cg = vdupq_n_f32(coeffs[1]); + v_cr = vdupq_n_f32(coeffs[2]); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + + if (scn == 3) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld3q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + else + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld4q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, 
vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + + for ( ; i < n; i++, src += scn) + dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; + float32x4_t v_cb, v_cg, v_cr; +}; + +#else template<> struct RGB2Gray { @@ -823,6 +1347,7 @@ template<> struct RGB2Gray int coeffs[3]; }; +#endif ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// @@ -855,6 +1380,72 @@ template struct RGB2YCrCb_f float coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : + srccn(_srccn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if(blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_delta = vdupq_n_f32(ColorChannel::half()); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb; + } + } + int srccn, blueIdx; + float coeffs[5]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; +}; + +#endif template struct RGB2YCrCb_i { @@ -887,6 +1478,224 @@ template struct RGB2YCrCb_i int coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* 
_coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + int16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + + int16x4x3_t v_src0; + v_src0.val[0] = vget_low_s16(v_src16.val[0]); + v_src0.val[1] = vget_low_s16(v_src16.val[1]); + v_src0.val[2] = vget_low_s16(v_src16.val[2]); + + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4); + v_Cb0 = 
vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vget_high_s16(v_src16.val[0]); + v_src0.val[1] = vget_high_s16(v_src16.val[1]); + v_src0.val[2] = vget_high_s16(v_src16.val[2]); + + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1))); + v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int16x4_t v_c0, v_c1, v_c2; + int32x4_t v_c3, v_c4, v_delta, v_delta2; +}; + +template <> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src_ = vld4q_u16(src); + v_src.val[0] = v_src_.val[0]; + v_src.val[1] = v_src_.val[1]; + v_src.val[2] = v_src_.val[2]; + } + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4); + v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), 
v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1)); + v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift); + int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3); + v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift); + int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4); + v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s32(v_Y); + v_dst.val[1] = vqmovun_s32(v_Cr); + v_dst.val[2] = vqmovun_s32(v_Cb); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, 
yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; +}; + +#endif template struct YCrCb2RGB_f { @@ -923,6 +1732,80 @@ template struct YCrCb2RGB_f float coeffs[4]; }; +#if CV_NEON + +template <> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_delta = vdupq_n_f32(ColorChannel::half()); + v_alpha = vdupq_n_f32(ColorChannel::max()); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (dcn == 3) + for ( ; i <= n - 12; i += 12, dst += 12) + { + float32x4x3_t v_src = vld3q_f32(src + i), v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + + vst3q_f32(dst, v_dst); + } + else + for ( ; i <= n - 12; i += 12, dst += 16) + { + float32x4x3_t v_src = vld3q_f32(src + i); + float32x4x4_t v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = 
vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + v_dst.val[3] = v_alpha; + + vst4q_f32(dst, v_dst); + } + + for ( ; i < n; i += 3, dst += dcn) + { + float Y = src[i], Cr = src[i+1], Cb = src[i+2]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[4]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; +}; + +#endif template struct YCrCb2RGB_i { @@ -962,6 +1845,254 @@ template struct YCrCb2RGB_i int coeffs[4]; }; +#if CV_NEON + +template <> +struct YCrCb2RGB_i +{ + typedef uchar channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdup_n_s16(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdup_n_u8(ColorChannel::max()); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_Y = vget_low_s16(v_src16.val[0]), + v_Cr = vget_low_s16(v_src16.val[1]), + v_Cb = vget_low_s16(v_src16.val[2]); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vget_high_s16(v_src16.val[0]); + v_Cr = vget_high_s16(v_src16.val[1]); + v_Cb = vget_high_s16(v_src16.val[2]); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r1 = 
vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + uchar Y = src[i]; + uchar Cr = src[i+1]; + uchar Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2; + int16x4_t v_delta; + uint8x8_t v_alpha; +}; + +template <> +struct YCrCb2RGB_i +{ + typedef ushort channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdupq_n_s32(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); + v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, 
v_delta)); + v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u16(dst, v_dst); + } + else + { + uint16x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y); + int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y); + int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0); + v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y); + + uint16x4_t v_bd = vqmovun_s32(v_b); + uint16x4_t v_gd = vqmovun_s32(v_g); + uint16x4_t v_rd = vqmovun_s32(v_r); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + v_dst.val[bidx^2] = v_rd; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + v_dst.val[bidx^2] = v_rd; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + ushort Y = src[i]; + ushort Cr = src[i+1]; + ushort Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - 
delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta; + uint16x8_t v_alpha; + uint16x4_t v_alpha2; +}; + +#endif ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// @@ -1013,6 +2144,78 @@ template struct RGB2XYZ_f float coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_f +{ + typedef float channel_type; + + RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_c5 = vdupq_n_f32(coeffs[5]); + v_c6 = vdupq_n_f32(coeffs[6]); + v_c7 = vdupq_n_f32(coeffs[7]); + v_c8 = vdupq_n_f32(coeffs[8]); + } + + void operator()(const float* src, float* dst, int n) const + { + int scn = srccn, i = 0; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + 
float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float X = saturate_cast(src[0]*C0 + src[1]*C1 + src[2]*C2); + float Y = saturate_cast(src[0]*C3 + src[1]*C4 + src[2]*C5); + float Z = saturate_cast(src[0]*C6 + src[1]*C7 + src[2]*C8); + dst[i] = X; dst[i+1] = Y; dst[i+2] = Z; + } + } + + int srccn; + float coeffs[9]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; +}; + +#endif template struct RGB2XYZ_i { @@ -1055,6 +2258,247 @@ template struct RGB2XYZ_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_i +{ + typedef uchar channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? 
cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + uint16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + + uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]), + v_s1 = vget_low_u16(v_src16.val[1]), + v_s2 = vget_low_u16(v_src16.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src16.val[0]), + v_s1 = vget_high_u16(v_src16.val[1]), + v_s2 = vget_high_u16(v_src16.val[2]); + + uint32x4_t v_X1 = 
vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +template <> +struct RGB2XYZ_i +{ + typedef ushort channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? 
cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src4 = vld4q_u16(src); + v_src.val[0] = v_src4.val[0]; + v_src.val[1] = v_src4.val[1]; + v_src.val[2] = v_src4.val[2]; + } + + uint16x4_t v_s0 = vget_low_u16(v_src.val[0]), + v_s1 = vget_low_u16(v_src.val[1]), + v_s2 = vget_low_u16(v_src.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src.val[0]), + v_s1 = vget_high_u16(v_src.val[1]), + v_s2 = vget_high_u16(v_src.val[2]); + + uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, 
v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1)); + v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1)); + v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + uint16x4_t v_s0, v_s1, v_s2; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + + uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + + v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift)); + v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift)); + v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift)); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +#endif template struct XYZ2RGB_f { @@ -1141,6 +2585,278 @@ template struct XYZ2RGB_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct XYZ2RGB_i +{ + typedef uchar channel_type; + + 
XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdup_n_s16(coeffs[3]); + v_c4 = vdup_n_s16(coeffs[4]); + v_c5 = vdup_n_s16(coeffs[5]); + v_c6 = vdup_n_s16(coeffs[6]); + v_c7 = vdup_n_s16(coeffs[7]); + v_c8 = vdup_n_s16(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel::max())); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, i = 0; + uchar alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_s0 = vget_low_s16(v_src16.val[0]), + v_s1 = vget_low_s16(v_src16.val[1]), + v_s2 = vget_low_s16(v_src16.val[2]); + + int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, 
v_delta), xyz_shift); + + v_s0 = vget_high_s16(v_src16.val[0]), + v_s1 = vget_high_s16(v_src16.val[1]), + v_s2 = vget_high_s16(v_src16.val[2]); + + int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint8x8_t v_alpha; + int32x4_t v_delta; +}; + +template <> +struct XYZ2RGB_i +{ + typedef ushort channel_type; + + XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? 
cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_c5 = vdupq_n_s32(coeffs[5]); + v_c6 = vdupq_n_s32(coeffs[6]); + v_c7 = vdupq_n_s32(coeffs[7]); + v_c8 = vdupq_n_s32(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, i = 0; + ushort alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift); + + v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t 
v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3q_u16(dst, v_dst); + } + else + { + uint16x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift); + v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift); + + uint16x4_t v_b = vqmovun_s32(v_X); + uint16x4_t v_g = vqmovun_s32(v_Y); + uint16x4_t v_r = vqmovun_s32(v_Z); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, 
dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta; + uint16x4_t v_alpha2; + uint16x8_t v_alpha; +}; + +#endif ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// @@ -1331,7 +3047,13 @@ struct HSV2RGB_b HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange) : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - {} + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -1342,8 +3064,30 @@ struct HSV2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]; 
buf[j+1] = src[j+1]*(1.f/255.f); @@ -1351,7 +3095,39 @@ struct HSV2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -1364,6 +3140,10 @@ struct HSV2RGB_b int dstcn; HSV2RGB_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; @@ -1428,7 +3208,14 @@ struct RGB2HLS_b typedef uchar channel_type; RGB2HLS_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) {} + : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -1438,8 +3225,41 @@ struct RGB2HLS_b for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); 
+ j = 0; - for( j = 0; j < dn*3; j += 3, src += scn ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn) + { + uint16x8_t v_t0, v_t1, v_t2; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3, src += scn ) { buf[j] = src[0]*(1.f/255.f); buf[j+1] = src[1]*(1.f/255.f); @@ -1447,7 +3267,23 @@ struct RGB2HLS_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3 ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + + uint8x8x3_t v_dst; + v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])), + vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0])))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + vst3_u8(dst + j, v_dst); + } + #endif + for( ; j < 
dn*3; j += 3 ) { dst[j] = saturate_cast(buf[j]); dst[j+1] = saturate_cast(buf[j+1]*255.f); @@ -1458,6 +3294,10 @@ struct RGB2HLS_b int srccn; RGB2HLS_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; @@ -1531,7 +3371,13 @@ struct HLS2RGB_b HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange) : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - {} + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -1542,8 +3388,29 @@ struct HLS2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]; buf[j+1] = src[j+1]*(1.f/255.f); @@ -1551,7 +3418,38 @@ struct HLS2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = 
vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -1564,6 +3462,10 @@ struct HLS2RGB_b int dstcn; HLS2RGB_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; @@ -1871,7 +3773,15 @@ struct Lab2RGB_b Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {} + : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(100.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + v_128 = vdupq_n_f32(128.0f); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -1882,16 +3792,70 @@ struct Lab2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + 
+ float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128); + v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128); + v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]*(100.f/255.f); buf[j+1] = (float)(src[j+1] - 128); buf[j+2] = (float)(src[j+2] - 128); } cvt(buf, buf, dn); + j = 0; - for( j = 0; j < dn*3; j += 3, dst += dcn ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -1904,6 +3868,11 @@ struct Lab2RGB_b int dstcn; 
Lab2RGB_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_128; + uint8x8_t v_alpha; + #endif }; @@ -2067,7 +4036,18 @@ struct RGB2Luv_b RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) {} + : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(2.55f); + v_coeff1 = vdupq_n_f32(0.72033898305084743f); + v_coeff2 = vdupq_n_f32(96.525423728813564f); + v_coeff3 = vdupq_n_f32(0.9732824427480916f); + v_coeff4 = vdupq_n_f32(136.259541984732824f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -2077,8 +4057,41 @@ struct RGB2Luv_b for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3, src += scn ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn) + { + uint16x8_t v_t0, v_t1, v_t2; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + 
#endif + for( ; j < dn*3; j += 3, src += scn ) { buf[j] = src[0]*(1.f/255.f); buf[j+1] = (float)(src[1]*(1.f/255.f)); @@ -2086,7 +4099,25 @@ struct RGB2Luv_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3 ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + + uint8x8x3_t v_dst; + v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))), + vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2))))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))), + vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4))))); + + vst3_u8(dst + j, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { dst[j] = saturate_cast(buf[j]*2.55f); dst[j+1] = saturate_cast(buf[j+1]*0.72033898305084743f + 96.525423728813564f); @@ -2097,6 +4128,11 @@ struct RGB2Luv_b int srccn; RGB2Luv_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; + uint8x8_t v_alpha; + #endif }; @@ -2106,7 +4142,18 @@ struct Luv2RGB_b Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {} + : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(100.f/255.f); + v_coeff1 = vdupq_n_f32(1.388235294117647f); + v_coeff2 = vdupq_n_f32(1.027450980392157f); + v_134 = vdupq_n_f32(134.f); + v_140 = vdupq_n_f32(140.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ 
-2117,8 +4164,29 @@ struct Luv2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134); + v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134); + v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]*(100.f/255.f); buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f); @@ -2126,7 +4194,39 @@ struct Luv2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + 
+ if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -2139,6 +4239,11 @@ struct Luv2RGB_b int dstcn; Luv2RGB_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; + uint8x8_t v_alpha; + #endif }; diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 6318784278..096997a609 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -126,7 +126,7 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) if( simd ) { __m128 k4 = _mm_set1_ps((float)k); - for( ; j <= size.width - 5; j += 4 ) + for( ; j <= size.width - 4; j += 4 ) { __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x @@ -146,6 +146,17 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) _mm_storeu_ps(dst + j, a); } } + #elif CV_NEON + float32x4_t v_k = vdupq_n_f32((float)k); + + for( ; j <= size.width - 4; j += 4 ) + { + float32x4x3_t v_src = vld3q_f32(cov + j * 3); + float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; + float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b); + float32x4_t v_ac = vaddq_f32(v_a, v_c); + vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac))); + } #endif for( ; j < size.width; j++ ) @@ -607,10 +618,13 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); +#if CV_NEON || CV_SSE2 + float factor_f = (float)factor; +#endif #if CV_SSE2 volatile bool haveSSE2 = 
cv::checkHardwareSupport(CV_CPU_SSE2); - __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f); + __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f); #endif Size size = src.size(); @@ -641,6 +655,15 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord _mm_storeu_ps(dstdata + j, v_s1); } } +#elif CV_NEON + for( ; j <= size.width - 4; j += 4 ) + { + float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j); + float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); + v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy)); + v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2)); + vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f)); + } #endif for( ; j < size.width; j++ ) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index fd5f19172e..a0b19dfb67 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -894,6 +894,183 @@ struct VResizeCubicVec_32f } }; +#elif CV_NEON + +typedef VResizeNoVec VResizeLinearVec_32s8u; + +struct VResizeLinearVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct 
VResizeLinearVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + short* dst = (short*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); + } + + return x; + } +}; + +typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + float32x4_t v_b0 = 
vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + short* dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 
= vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4))); + } + + return x; + } +}; + #else typedef VResizeNoVec VResizeLinearVec_32s8u; @@ -1322,7 +1499,120 @@ struct ResizeAreaFastNoVec { return 0; } }; -#if CV_SSE2 +#if CV_NEON + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const uchar* S, uchar* D, int w) const + { + int dx = 0; + const uchar* S0 = S, * S1 = S0 + step; + + uint16x8_t v_2 = vdupq_n_u16(2); + + if (cn == 1) + { + for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) + { + uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); + + uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); + v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); + v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); + + uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); + v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); + v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); + + vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); + + uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); + uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); + uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); + 
uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); + + uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), + vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); + uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), + vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); + uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); + + vst1_u8(D, vmovn_u16(v_dst)); + } + } + + return dx; + } + +private: + int cn, step; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const ushort * S, ushort * D, int w) const + { + int dx = 0; + const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); + + uint32x4_t v_2 = vdupq_n_u32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); + + uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); + v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); + + uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); + v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); + + vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); + uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), + vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); + vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; + +#elif CV_SSE2 + 
class ResizeAreaFastVec_SIMD_8u { public: @@ -3489,7 +3779,15 @@ public: bufxy = (*m1)(Rect(x, y, bcols, brows)); const ushort* sA = m2->ptr(y+y1) + x; - for( x1 = 0; x1 < bcols; x1++ ) + x1 = 0; + + #if CV_NEON + uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); + for ( ; x1 <= bcols - 8; x1 += 8) + vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); + #endif + + for( ; x1 < bcols; x1++ ) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } else if( planar_input ) @@ -3534,6 +3832,22 @@ public: _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); + int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); + + for( ; x1 <= bcols - 4; x1 += 4 ) + { + int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), + v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); + int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, + vandq_s32(v_sy, v_scale2)); + vst1_u16(A + x1, vqmovun_s32(v_v)); + + int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), + vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); + vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + } #endif for( ; x1 < bcols; x1++ ) @@ -3549,6 +3863,26 @@ public: else { const float* sXY = m1->ptr(y+y1) + x*2; + x1 = 0; + + #if CV_NEON + float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); + int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); + + for( ; x1 <= bcols - 4; x1 += 4 ) + { + float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); + int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); + int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); + int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, + vandq_s32(v_sy, v_scale2)); + vst1_u16(A + x1, vqmovun_s32(v_v)); + + int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), + 
vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); + vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + } + #endif - for( x1 = 0; x1 < bcols; x1++ ) + for( ; x1 < bcols; x1++ ) /* scalar tail: continue from where the NEON loop stopped instead of recomputing from 0 */ { diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index d68248c36c..b292d99821 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -203,7 +203,7 @@ static Moments contourMoments( const Mat& contour ) \****************************************************************************************/ template -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { int operator() (const T *, int, WT &, WT &, WT &, MT &) { @@ -214,9 +214,9 @@ struct MomentsInTile_SSE #if CV_SSE2 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE2); } @@ -234,17 +234,16 @@ struct MomentsInTile_SSE for( ; x <= len - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); - qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); - __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); + + qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); - qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); + qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); qx = _mm_add_epi16(qx, dx); } - int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); @@ -258,17 +257,84 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; bool useSIMD; }; +#elif CV_NEON + +template <> +struct MomentsInTile_SIMD +{ + MomentsInTile_SIMD() + { + ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 }; + qx_init = vld1_u16(init); + v_step = vdup_n_u16(4); + } + + int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3) + { + int x = 
0; + + uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z, + v_x2 = v_z, v_x3 = v_z; + uint16x4_t qx = qx_init; + + for( ; x <= len - 8; x += 8 ) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x)); + + // first part + uint32x4_t v_qx = vmovl_u16(qx); + uint16x4_t v_p = vget_low_u16(v_src); + uint32x4_t v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + qx = vadd_u16(qx, v_step); + + // second part + v_qx = vmovl_u16(qx); + v_p = vget_high_u16(v_src); + v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + + qx = vadd_u16(qx, v_step); + } + + vst1q_u32(buf, v_x0); + x0 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x1); + x1 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x2); + x2 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x3); + x3 = buf[0] + buf[1] + buf[2] + buf[3]; + + return x; + } + + uint CV_DECL_ALIGNED(16) buf[4]; + uint16x4_t qx_init, v_step; +}; + #endif #if CV_SSE4_1 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } @@ -302,9 +368,6 @@ struct MomentsInTile_SSE v_ix1 = _mm_add_epi32(v_ix1, v_delta); } - int CV_DECL_ALIGNED(16) buf[4]; - int64 CV_DECL_ALIGNED(16) buf64[2]; - _mm_store_si128((__m128i*)buf, v_x0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, v_x1); @@ -319,6 +382,8 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; + int64 CV_DECL_ALIGNED(16) buf64[2]; bool useSIMD; }; @@ -334,7 +399,7 @@ static void momentsInTile( const Mat& img, double* moments ) Size size = img.size(); int x, y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; - MomentsInTile_SSE vop; + 
MomentsInTile_SIMD vop; for( y = 0; y < size.height; y++ ) { diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 8eab602437..8a8515e7a3 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -178,11 +178,190 @@ struct PyrDownVec_32f } }; +typedef NoVec PyrDownVec_32s16u; +typedef NoVec PyrDownVec_32s16s; + +typedef NoVec PyrUpVec_32f; + +#elif CV_NEON + +struct PyrDownVec_32s8u +{ + int operator()(int** src, uchar* dst, int, int width) const + { + int x = 0; + const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1], + *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3], + *row4 = (unsigned int*)src[4]; + uint16x8_t v_delta = vdupq_n_u16(128); + + for( ; x <= width - 16; x += 16 ) + { + uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); + uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4))); + uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); + uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4))); + uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4))); + + v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2)); + v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3); + uint16x8_t v_dst0 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); + + v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); + v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); + v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); + v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12))); + v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12))); + + v_r0 
= vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2)); + v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3); + uint16x8_t v_dst1 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); + + vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)), + vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8)))); + } + + return x; + } +}; + +struct PyrDownVec_32s16u +{ + int operator()(int** src, ushort* dst, int, int width) const + { + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + int32x4_t v_delta = vdupq_n_s32(128); + + for( ; x <= width - 8; x += 8 ) + { + int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); + int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); + int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); + int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); + int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); + + v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20)); + v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30); + int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8); + + v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21)); + v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31); + int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8); + + vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1))); + } + + return x; + } +}; + +struct PyrDownVec_32s16s +{ + int operator()(int** src, short* dst, int, int width) const + { + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + int32x4_t v_delta = vdupq_n_s32(128); + + for( ; x <= width - 8; x += 8 ) + { + int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); + int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); + 
int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); + int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); + int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); + + v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20)); + v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30); + int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8); + + v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21)); + v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31); + int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); + } + + return x; + } +}; + +struct PyrDownVec_32f +{ + int operator()(float** src, float* dst, int, int width) const + { + int x = 0; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_r0 = vld1q_f32(row0 + x); + float32x4_t v_r1 = vld1q_f32(row1 + x); + float32x4_t v_r2 = vld1q_f32(row2 + x); + float32x4_t v_r3 = vld1q_f32(row3 + x); + float32x4_t v_r4 = vld1q_f32(row4 + x); + + v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); + v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); + vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); + + v_r0 = vld1q_f32(row0 + x + 4); + v_r1 = vld1q_f32(row1 + x + 4); + v_r2 = vld1q_f32(row2 + x + 4); + v_r3 = vld1q_f32(row3 + x + 4); + v_r4 = vld1q_f32(row4 + x + 4); + + v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); + v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); + vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); + } + + return x; + } +}; + +struct PyrUpVec_32f +{ + int operator()(float** src, float* dst, int, int width) const + { + int x = 0; + float 
** dsts = (float **)dst; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; + float *dst0 = dsts[0], *dst1 = dsts[1]; + float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_r0 = vld1q_f32(row0 + x); + float32x4_t v_r1 = vld1q_f32(row1 + x); + float32x4_t v_r2 = vld1q_f32(row2 + x); + + vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); + vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + + v_r0 = vld1q_f32(row0 + x + 4); + v_r1 = vld1q_f32(row1 + x + 4); + v_r2 = vld1q_f32(row2 + x + 4); + + vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); + vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + } + + return x; + } +}; + #else typedef NoVec PyrDownVec_32s8u; +typedef NoVec PyrDownVec_32s16u; +typedef NoVec PyrDownVec_32s16s; typedef NoVec PyrDownVec_32f; +typedef NoVec PyrUpVec_32f; + #endif template void @@ -325,6 +504,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int) AutoBuffer _dtab(ssize.width*cn); int* dtab = _dtab; WT* rows[PU_SZ]; + T* dsts[2]; CastOp castOp; VecOp vecOp; @@ -385,8 +565,9 @@ pyrUp_( const Mat& _src, Mat& _dst, int) for( k = 0; k < PU_SZ; k++ ) rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep; row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; + dsts[0] = dst0; dsts[1] = dst1; - x = vecOp(rows, dst0, (int)_dst.step, dsize.width); + x = vecOp(rows, (T*)dsts, (int)_dst.step, dsize.width); for( ; x < dsize.width; x++ ) { T t1 = castOp((row1[x] + row2[x])*4); @@ -561,9 +742,9 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde if( depth == CV_8U ) func = pyrDown_, PyrDownVec_32s8u>; else if( depth == CV_16S ) - func = pyrDown_, NoVec >; + func = pyrDown_, PyrDownVec_32s16s >; else if( depth == CV_16U ) - func = pyrDown_, NoVec >; + func = pyrDown_, PyrDownVec_32s16u >; else if( depth 
== CV_32F ) func = pyrDown_, PyrDownVec_32f>; else if( depth == CV_64F ) @@ -636,7 +817,7 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT else if( depth == CV_16U ) func = pyrUp_, NoVec >; else if( depth == CV_32F ) - func = pyrUp_, NoVec >; + func = pyrUp_, PyrUpVec_32f >; else if( depth == CV_64F ) func = pyrUp_, NoVec >; else diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 0872c44c6b..d9678f2d62 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -132,8 +132,8 @@ struct ColumnSum : SUM = &sum[0]; if( sumCount == 0 ) { - for( i = 0; i < width; i++ ) - SUM[i] = 0; + memset((void*)SUM, 0, width*sizeof(ST)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) { const ST* Sp = (const ST*)src[0]; @@ -247,13 +247,16 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-4; i+=4 ) + for( ; i <= width-4; i+=4 ) { __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -277,7 +280,7 @@ struct ColumnSum : if(haveSSE2) { const __m128 scale4 = _mm_set1_ps((float)_scale); - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -298,6 +301,22 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + 
uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)); + vst1_u8(D + i, vqmovn_u16(v_dst)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -312,7 +331,7 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -330,6 +349,18 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)); + vst1_u8(D + i, vqmovn_u16(v_dst)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -390,13 +421,16 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-4; i+=4 ) + for( ; i <= width-4; i+=4 ) { __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -420,7 +454,7 @@ struct ColumnSum : if(haveSSE2) { const __m128 scale4 = _mm_set1_ps((float)_scale); - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -439,6 +473,20 @@ struct ColumnSum : 
_mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -453,7 +501,7 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); @@ -470,6 +518,17 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -537,6 +596,9 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -578,6 +640,20 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + 
int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -608,6 +684,17 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -626,6 +713,166 @@ struct ColumnSum : std::vector sum; }; +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) + { + int i; + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + #if CV_SSE2 + bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void *)SUM, 0, sizeof(int) * width); + + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + for( ; i < width-4; i+=4 ) + { + __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); + 
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp)); + } + } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + #endif + + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int * Sp = (const int*)src[0]; + const int * Sm = (const int*)src[1-ksize]; + float* D = (float*)dst; + if( haveScale ) + { + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + const __m128 scale4 = _mm_set1_ps((float)_scale); + + for( ; i < width-4; i+=4) + { + __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + _mm_loadu_si128((const __m128i*)(Sp+i))); + + _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + } + } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } + #endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + for( ; i < width-4; i+=4) + { + __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + _mm_loadu_si128((const __m128i*)(Sp+i))); + + _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0)); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + } + } 
+ #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_f32(D + i, vcvtq_f32_s32(v_s0)); + vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } + #endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } + } + + double scale; + int sumCount; + std::vector sum; +}; + #ifdef HAVE_OPENCL #define DIVUP(total, grain) ((total + grain - 1) / (grain)) @@ -1360,6 +1607,21 @@ static inline void histogram_sub_simd( const HT x[16], HT y[16] ) _mm_store_si128(ry+1, r1); } +#elif CV_NEON +#define MEDIAN_HAVE_SIMD 1 + +static inline void histogram_add_simd( const HT x[16], HT y[16] ) +{ + vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y))); + vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8))); +} + +static inline void histogram_sub_simd( const HT x[16], HT y[16] ) +{ + vst1q_u16(y, vsubq_u16(vld1q_u16(y), vld1q_u16(x))); /* y[i] -= x[i], mirroring histogram_add_simd's y[i] += x[i]; vsubq_u16(a, b) computes a - b */ + vst1q_u16(y + 8, vsubq_u16(vld1q_u16(y + 8), vld1q_u16(x + 8))); +} + #else #define MEDIAN_HAVE_SIMD 0 #endif @@ -1413,7 +1675,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) HT* h_coarse = alignPtr(&_h_coarse[0], 16); HT* h_fine = alignPtr(&_h_fine[0], 16); #if MEDIAN_HAVE_SIMD - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) @@ -1861,6 +2123,71 @@ struct MinMaxVec32f } }; +#elif CV_NEON + +struct MinMaxVec8u +{ + typedef uchar value_type; + typedef uint8x16_t arg_type; + enum { SIZE = 16 }; + arg_type load(const uchar* ptr) { return vld1q_u8(ptr); } + void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); } + void 
operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_u8(a, b); + b = vmaxq_u8(b, t); + } +}; + + +struct MinMaxVec16u +{ + typedef ushort value_type; + typedef uint16x8_t arg_type; + enum { SIZE = 8 }; + arg_type load(const ushort* ptr) { return vld1q_u16(ptr); } + void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_u16(a, b); + b = vmaxq_u16(b, t); + } +}; + + +struct MinMaxVec16s +{ + typedef short value_type; + typedef int16x8_t arg_type; + enum { SIZE = 8 }; + arg_type load(const short* ptr) { return vld1q_s16(ptr); } + void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_s16(a, b); + b = vmaxq_s16(b, t); + } +}; + + +struct MinMaxVec32f +{ + typedef float value_type; + typedef float32x4_t arg_type; + enum { SIZE = 4 }; + arg_type load(const float* ptr) { return vld1q_f32(ptr); } + void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_f32(a, b); + b = vmaxq_f32(b, t); + } +}; + #else @@ -1887,7 +2214,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) int i, j, k, cn = _src.channels(); Op op; VecOp vop; - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); if( m == 3 ) { @@ -2203,7 +2530,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) #endif bool useSortNet = ksize == 3 || (ksize == 5 -#if !CV_SSE2 +#if !(CV_SSE2 || CV_NEON) && src0.depth() > CV_8U #endif ); @@ -2237,7 +2564,8 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) ); double img_size_mp = (double)(src0.total())/(1 << 20); - if( ksize <= 3 + (img_size_mp < 1 ? 
12 : img_size_mp < 4 ? 6 : 2)*(MEDIAN_HAVE_SIMD && checkHardwareSupport(CV_CPU_SSE2) ? 1 : 3)) + if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)* + (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 1 : 3)) medianBlur_8u_Om( src, dst, ksize ); else medianBlur_8u_O1( src, dst, ksize ); diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index 09a1f6eeab..721a596e70 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -264,6 +264,74 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) } } } +#elif CV_NEON + uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval); + + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vandq_u8(vcgtq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval)); + } + break; + + case THRESH_BINARY_INV: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vandq_u8(vcleq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval)); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vminq_u8(vld1q_u8(src + j_scalar), v_thresh)); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + { + uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = 
vcgtq_u8(v_src, v_thresh); + vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src)); + } + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + { + uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcleq_u8(v_src, v_thresh); + vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src)); + } + } + break; + default: + return CV_Error( CV_StsBadArg, "" ); + } #endif if( j_scalar < roi.width ) @@ -382,6 +450,14 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); + + for( ; j <= roi.width - 8; j += 8 ) + { + uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); + } #endif for( ; j < roi.width; j++ ) @@ -410,6 +486,14 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); + + for( ; j <= roi.width - 8; j += 8 ) + { + uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); + } #endif for( ; j < roi.width; j++ ) @@ -436,6 +520,11 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) @@ -462,6 +551,15 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + 
#elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + { + int16x8_t v_src = vld1q_s16(src + j); + uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); + } #endif for( ; j < roi.width; j++ ) @@ -491,6 +589,15 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + { + int16x8_t v_src = vld1q_s16(src + j); + uint16x8_t v_mask = vcleq_s16(v_src, v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); + } #endif for( ; j < roi.width; j++ ) { @@ -576,6 +683,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -604,6 +721,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -630,6 +757,11 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j 
<= roi.width - 4; j += 4 ) + vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) @@ -656,6 +788,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), + vreinterpretq_u32_f32(v_src)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -685,6 +827,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), + vreinterpretq_u32_f32(v_src)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) { diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 3146ff72a9..eb13d35402 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1545,4 +1545,63 @@ TEST(Imgproc_InitUndistortMap, accuracy) { CV_UndistortMapTest test; test.safe_r TEST(Imgproc_GetRectSubPix, accuracy) { CV_GetRectSubPixTest test; test.safe_run(); } TEST(Imgproc_GetQuadSubPix, accuracy) { CV_GetQuadSubPixTest test; test.safe_run(); } +////////////////////////////////////////////////////////////////////////// + +template +void resizeArea(const cv::Mat & src, cv::Mat & dst) +{ + int cn = src.channels(); + + for (int y = 0; y < dst.rows; ++y) + { + const T * sptr0 = src.ptr(y << 1); + const T * sptr1 = src.ptr((y << 1) + 1); + T * dptr = dst.ptr(y); + + for (int x = 0; x < dst.cols * cn; x += cn) + { + int x1 = x << 1; + + for (int c = 0; c < cn; ++c) + { + WT sum = 
WT(sptr0[x1 + c]) + WT(sptr0[x1 + c + cn]); + sum += WT(sptr1[x1 + c]) + WT(sptr1[x1 + c + cn]) + (WT)(2); + + dptr[x + c] = cv::saturate_cast(sum >> 2); + } + } + } +} + +TEST(Resize, Area_half) +{ + const int size = 10; + int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4 }; + + cv::RNG rng(17); + + for (int i = 0, _size = sizeof(types) / sizeof(types[0]); i < _size; ++i) + { + int type = types[i], depth = CV_MAT_DEPTH(type); + + SCOPED_TRACE(depth); + + cv::Mat src(size, size, type), dst_actual(size >> 1, size >> 1, type), + dst_reference(size >> 1, size >> 1, type); + + rng.fill(src, cv::RNG::UNIFORM, 0, 1000, true); + + if (depth == CV_8U) + resizeArea(src, dst_reference); + else if (depth == CV_16U) + resizeArea(src, dst_reference); + else + CV_Assert(0); + + cv::resize(src, dst_actual, dst_actual.size(), 0, 0, cv::INTER_AREA); + + ASSERT_EQ(0, cvtest::norm(dst_reference, dst_actual, cv::NORM_INF)); + } +} + /* End of file. */ diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp index ebbb63bb95..00d383a020 100644 --- a/modules/imgproc/test/test_imgwarp_strict.cpp +++ b/modules/imgproc/test/test_imgwarp_strict.cpp @@ -733,19 +733,25 @@ void CV_Remap_Test::generate_test_data() case CV_32FC2: { - MatIterator_ begin_x = mapx.begin(), end_x = mapx.end(); float fscols = static_cast(std::max(src.cols - 1 + n, 0)), fsrows = static_cast(std::max(src.rows - 1 + n, 0)); - for ( ; begin_x != end_x; ++begin_x) + int width = mapx.cols << 1; + + for (int y = 0; y < mapx.rows; ++y) { - begin_x[0] = rng.uniform(_n, fscols); - begin_x[1] = rng.uniform(_n, fsrows); + float * ptr = mapx.ptr(y); + + for (int x = 0; x < width; x += 2) + { + ptr[x] = rng.uniform(_n, fscols); + ptr[x + 1] = rng.uniform(_n, fsrows); + } } } break; default: - assert(0); + CV_Assert(0); break; } }