/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "arithm_simd.hpp"
#include "arithm_core.hpp"
#include "replacement.hpp"

namespace cv { namespace hal {

//=======================================

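// All kernels below share one calling convention: (src1, step1, src2, step2,
// dst, step, width, height, extra), with the steps given in *bytes* and the
// trailing pointer carrying the per-op parameter (compare code, scale,
// weights, ...). A minimal usage sketch for an external caller (illustrative
// only, not part of this file):
//
//   uchar a[4] = {1, 2, 3, 4}, b[4] = {250, 20, 30, 40}, c[4];
//   cv::hal::add8u(a, 4, b, 4, c, 4, /*width*/ 4, /*height*/ 1, NULL);
//   // c == {251, 22, 33, 44}; 8-bit results saturate at 255
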
#undef CALL_HAL
#define CALL_HAL(fun) \
    int res = fun(src1, step1, src2, step2, dst, step, width, height); \
    if (res == Error::Ok) \
        return; \
    else if (res != Error::NotImplemented) \
        throw Failure(res);

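// CALL_HAL gives an external HAL replacement (see replacement.hpp) first
// refusal on every operation: Error::Ok means the plugin handled the call and
// we return immediately, Error::NotImplemented falls through to the built-in
// implementation below, and any other status is treated as a hard failure.
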
#if (ARITHM_USE_IPP == 1)
static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
    if( height == 1 )
        step1 = step2 = step = width*elemSize;
}
#define CALL_IPP_BIN_12(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }
#else
#define CALL_IPP_BIN_12(fun)
#endif

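// When a continuous 2D array has been collapsed into a single row upstream,
// the incoming step values may no longer describe the buffer, so fixSteps
// normalizes all three to width*elemSize for the height == 1 case.
// CALL_IPP_BIN_12 hands the operands to IPP in (src1, src2) order; the _21
// variant further down swaps them for IPP functions with reversed semantics.
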
//=======================================
// Add
//=======================================

void add8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add8u)
    CALL_IPP_BIN_12(ippiAdd_8u_C1RSfs)
    (vBinOp<uchar, cv::OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void add8s( const schar* src1, size_t step1,
            const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add8s)
    vBinOp<schar, cv::OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void add16u( const ushort* src1, size_t step1,
             const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add16u)
    CALL_IPP_BIN_12(ippiAdd_16u_C1RSfs)
    (vBinOp<ushort, cv::OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}

void add16s( const short* src1, size_t step1,
             const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add16s)
    CALL_IPP_BIN_12(ippiAdd_16s_C1RSfs)
    (vBinOp<short, cv::OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
}

void add32s( const int* src1, size_t step1,
             const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add32s)
    vBinOp32<int, cv::OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
}

void add32f( const float* src1, size_t step1,
             const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add32f)
    CALL_IPP_BIN_12(ippiAdd_32f_C1R)
    (vBinOp32<float, cv::OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
}

void add64f( const double* src1, size_t step1,
             const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_add64f)
    vBinOp64<double, cv::OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
}

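// vBinOp, vBinOp32 and vBinOp64 come from the arithm_core.hpp/arithm_simd.hpp
// headers included above: the same row-by-row loop specialized for 8/16-bit,
// 32-bit and 64-bit elements respectively. Each is parameterized by a scalar
// functor (cv::OpAdd, cv::OpSub, ...) and, when vectorization is available,
// a SIMD functor selected through IF_SIMD.
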
//=======================================

#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_BIN_21(fun) \
    CV_IPP_CHECK() \
    { \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }
#else
#define CALL_IPP_BIN_21(fun)
#endif

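// Note the swapped operand order above: IPP's ippiSub_* subtracts the first
// source from the second, so passing (src2, src1) yields dst = src1 - src2,
// matching the convention of the vBinOp fallback path.
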
//=======================================
// Subtract
//=======================================

void sub8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub8u)
    CALL_IPP_BIN_21(ippiSub_8u_C1RSfs)
    (vBinOp<uchar, cv::OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void sub8s( const schar* src1, size_t step1,
            const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub8s)
    vBinOp<schar, cv::OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void sub16u( const ushort* src1, size_t step1,
             const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub16u)
    CALL_IPP_BIN_21(ippiSub_16u_C1RSfs)
    (vBinOp<ushort, cv::OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}

void sub16s( const short* src1, size_t step1,
             const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub16s)
    CALL_IPP_BIN_21(ippiSub_16s_C1RSfs)
    (vBinOp<short, cv::OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, width, height));
}

void sub32s( const int* src1, size_t step1,
             const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub32s)
    vBinOp32<int, cv::OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, width, height);
}

void sub32f( const float* src1, size_t step1,
             const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub32f)
    CALL_IPP_BIN_21(ippiSub_32f_C1R)
    (vBinOp32<float, cv::OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, width, height));
}

void sub64f( const double* src1, size_t step1,
             const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_sub64f)
    vBinOp64<double, cv::OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, width, height);
}

//=======================================

#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_MIN_MAX(fun, type) \
    CV_IPP_CHECK() \
    { \
        type* s1 = (type*)src1; \
        type* s2 = (type*)src2; \
        type* d  = dst; \
        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
        int i = 0; \
        for(; i < height; i++) \
        { \
            if (0 > fun(s1, s2, d, width)) \
                break; \
            s1 = (type*)((uchar*)s1 + step1); \
            s2 = (type*)((uchar*)s2 + step2); \
            d  = (type*)((uchar*)d + step); \
        } \
        if (i == height) \
        { \
            CV_IMPL_ADD(CV_IMPL_IPP); \
            return; \
        } \
        setIppErrorStatus(); \
    }
#else
#define CALL_IPP_MIN_MAX(fun, type)
#endif

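// ippsMinEvery/ippsMaxEvery are 1D signal primitives, so unlike the image
// (ippi*) calls above this macro walks the rows itself, advancing each pointer
// by its byte step; if any row fails it breaks out, and the loop-counter check
// routes execution to the generic fallback instead of returning early.
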
//=======================================
// Max
//=======================================

void max8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max8u)
    CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar)
    vBinOp<uchar, cv::OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max8s( const schar* src1, size_t step1,
            const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max8s)
    vBinOp<schar, cv::OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max16u( const ushort* src1, size_t step1,
             const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max16u)
    CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort)
    vBinOp<ushort, cv::OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max16s( const short* src1, size_t step1,
             const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max16s)
    vBinOp<short, cv::OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max32s( const int* src1, size_t step1,
             const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max32s)
    vBinOp32<int, cv::OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max32f( const float* src1, size_t step1,
             const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max32f)
    CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float)
    vBinOp32<float, cv::OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, width, height);
}

void max64f( const double* src1, size_t step1,
             const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_max64f)
    CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double)
    vBinOp64<double, cv::OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, width, height);
}

//=======================================
// Min
//=======================================

void min8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min8u)
    CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar)
    vBinOp<uchar, cv::OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min8s( const schar* src1, size_t step1,
            const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min8s)
    vBinOp<schar, cv::OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min16u( const ushort* src1, size_t step1,
             const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min16u)
    CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort)
    vBinOp<ushort, cv::OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min16s( const short* src1, size_t step1,
             const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min16s)
    vBinOp<short, cv::OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min32s( const int* src1, size_t step1,
             const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min32s)
    vBinOp32<int, cv::OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min32f( const float* src1, size_t step1,
             const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min32f)
    CALL_IPP_MIN_MAX(ippsMinEvery_32f, float)
    vBinOp32<float, cv::OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, width, height);
}

void min64f( const double* src1, size_t step1,
             const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_min64f)
    CALL_IPP_MIN_MAX(ippsMinEvery_64f, double)
    vBinOp64<double, cv::OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, width, height);
}

//=======================================
// AbsDiff
//=======================================

void absdiff8u( const uchar* src1, size_t step1,
                const uchar* src2, size_t step2,
                uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff8u)
    CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R)
    (vBinOp<uchar, cv::OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void absdiff8s( const schar* src1, size_t step1,
                const schar* src2, size_t step2,
                schar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff8s)
    vBinOp<schar, cv::OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}

void absdiff16u( const ushort* src1, size_t step1,
                 const ushort* src2, size_t step2,
                 ushort* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff16u)
    CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R)
    (vBinOp<ushort, cv::OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}

void absdiff16s( const short* src1, size_t step1,
                 const short* src2, size_t step2,
                 short* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff16s)
    vBinOp<short, cv::OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, width, height);
}

void absdiff32s( const int* src1, size_t step1,
                 const int* src2, size_t step2,
                 int* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff32s)
    vBinOp32<int, cv::OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, width, height);
}

void absdiff32f( const float* src1, size_t step1,
                 const float* src2, size_t step2,
                 float* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff32f)
    CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R)
    (vBinOp32<float, cv::OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, width, height));
}

void absdiff64f( const double* src1, size_t step1,
                 const double* src2, size_t step2,
                 double* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_absdiff64f)
    vBinOp64<double, cv::OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, width, height);
}

//=======================================
// Logical
//=======================================

void and8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_and8u)
    CALL_IPP_BIN_12(ippiAnd_8u_C1R)
    (vBinOp<uchar, cv::OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void or8u( const uchar* src1, size_t step1,
           const uchar* src2, size_t step2,
           uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_or8u)
    CALL_IPP_BIN_12(ippiOr_8u_C1R)
    (vBinOp<uchar, cv::OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void xor8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_xor8u)
    CALL_IPP_BIN_12(ippiXor_8u_C1R)
    (vBinOp<uchar, cv::OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

void not8u( const uchar* src1, size_t step1,
            const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* )
{
    CALL_HAL(hal_not8u)
    CALL_IPP_BIN_12(ippiNot_8u_C1R)
    (vBinOp<uchar, cv::OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}

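// not8u keeps the common binary signature but is effectively unary:
// cv::OpNot evidently ignores its second operand, so src2 exists only to
// satisfy the shared function-table type.
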
//=======================================

#undef CALL_HAL
#define CALL_HAL(fun) \
    int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \
    if (res == Error::Ok) \
        return; \
    else if (res != Error::NotImplemented) \
        throw Failure(res);

#if ARITHM_USE_IPP
inline static IppCmpOp convert_cmp(int _cmpop)
{
    return _cmpop == CMP_EQ ? ippCmpEq :
           _cmpop == CMP_GT ? ippCmpGreater :
           _cmpop == CMP_GE ? ippCmpGreaterEq :
           _cmpop == CMP_LT ? ippCmpLess :
           _cmpop == CMP_LE ? ippCmpLessEq :
           (IppCmpOp)-1;
}
#define CALL_IPP_CMP(fun) \
    CV_IPP_CHECK() \
    { \
        IppCmpOp op = convert_cmp(*(int *)_cmpop); \
        if( op >= 0 ) \
        { \
            fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
            if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return; \
            } \
            setIppErrorStatus(); \
        } \
    }
#else
#define CALL_IPP_CMP(fun)
#endif

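// IppCmpOp has no "not equal" member, so convert_cmp maps CMP_NE (and any
// unknown code) to -1 and the op >= 0 guard quietly skips the IPP path there.
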
//=======================================
// Compare
//=======================================

void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
           uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp8u)
    CALL_IPP_CMP(ippiCompare_8u_C1R)
    //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
    int code = *(int*)_cmpop;
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }
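
    // The six predicates reduce to GT/LE/EQ/NE: x >= y is y <= x and x < y is
    // y > x, hence the operand swap above. Below, a comparison yields 0 or 1,
    // -(a > b) widens that to an all-zeros/all-ones byte, and XOR with m
    // (0 for GT/EQ, 255 for LE/NE) flips the mask for the complementary
    // predicate. The SSE2 path biases both inputs by -128 because
    // _mm_cmpgt_epi8 is a signed comparison.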

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
#if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8(-1);
                __m128i c128 = _mm_set1_epi8(-128);
                for( ; x <= width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    // there is no unsigned 8-bit SIMD comparison, hence the bias trick
                    r00 = _mm_sub_epi8(r00, c128);
                    r10 = _mm_sub_epi8(r10, c128);
                    r00 = _mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
                    _mm_storeu_si128((__m128i*)(dst + x), r00);
                }
            }
#elif CV_NEON
            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);

            for( ; x <= width - 16; x += 16 )
            {
                vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
            }
#endif

            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
#if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8(-1);
                for( ; x <= width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128(_mm_cmpeq_epi8(r00, r10), m128);
                    _mm_storeu_si128((__m128i*)(dst + x), r00);
                }
            }
#elif CV_NEON
            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);

            for( ; x <= width - 16; x += 16 )
            {
                vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}

void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
           uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp8s)
    cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}

void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp16u)
    CALL_IPP_CMP(ippiCompare_16u_C1R)
    cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}

void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp16s)
    CALL_IPP_CMP(ippiCompare_16s_C1R)
    //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);

    int code = *(int*)_cmpop;
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }
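
    // Same predicate reduction as cmp8u; here each SSE2 iteration compares two
    // vectors of eight shorts and _mm_packs_epi16 narrows the two 0/-1 word
    // masks to the 16 destination bytes (signed saturation keeps 0/-1 intact).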

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
#if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16(-1);
                for( ; x <= width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128(_mm_cmpgt_epi16(r00, r10), m128);
                    __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
                    __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
                    r01 = _mm_xor_si128(_mm_cmpgt_epi16(r01, r11), m128);
                    r11 = _mm_packs_epi16(r00, r01);
                    _mm_storeu_si128((__m128i*)(dst + x), r11);
                }
                if( x <= width - 8 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128(_mm_cmpgt_epi16(r00, r10), m128);
                    r10 = _mm_packs_epi16(r00, r00);
                    _mm_storel_epi64((__m128i*)(dst + x), r10);

                    x += 8;
                }
            }
#elif CV_NEON
            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);

            for( ; x <= width - 16; x += 16 )
            {
                int16x8_t in1 = vld1q_s16(src1 + x);
                int16x8_t in2 = vld1q_s16(src2 + x);
                uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));

                in1 = vld1q_s16(src1 + x + 8);
                in2 = vld1q_s16(src2 + x + 8);
                uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));

                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
            }
#endif

            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
#if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16(-1);
                for( ; x <= width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128(_mm_cmpeq_epi16(r00, r10), m128);
                    __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
                    __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
                    r01 = _mm_xor_si128(_mm_cmpeq_epi16(r01, r11), m128);
                    r11 = _mm_packs_epi16(r00, r01);
                    _mm_storeu_si128((__m128i*)(dst + x), r11);
                }
                if( x <= width - 8 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128(_mm_cmpeq_epi16(r00, r10), m128);
                    r10 = _mm_packs_epi16(r00, r00);
                    _mm_storel_epi64((__m128i*)(dst + x), r10);

                    x += 8;
                }
            }
#elif CV_NEON
            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);

            for( ; x <= width - 16; x += 16 )
            {
                int16x8_t in1 = vld1q_s16(src1 + x);
                int16x8_t in2 = vld1q_s16(src2 + x);
                uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));

                in1 = vld1q_s16(src1 + x + 8);
                in2 = vld1q_s16(src2 + x + 8);
                uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));

                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}

void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp32s)
    cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}

void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp32f)
    CALL_IPP_CMP(ippiCompare_32f_C1R)
    cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}

void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* _cmpop)
{
    CALL_HAL(hal_cmp64f)
    cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}

//=======================================

#undef CALL_HAL
#define CALL_HAL(fun) \
    int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \
    if (res == Error::Ok) \
        return; \
    else if (res != Error::NotImplemented) \
        throw Failure(res);

#if defined HAVE_IPP
#define CALL_IPP_MUL(fun) \
    CV_IPP_CHECK() \
    { \
        if (std::fabs(fscale - 1) <= FLT_EPSILON) \
        { \
            if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \
            { \
                CV_IMPL_ADD(CV_IMPL_IPP); \
                return; \
            } \
            setIppErrorStatus(); \
        } \
    }
#else
#define CALL_IPP_MUL(fun)
#endif

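// CALL_IPP_MUL expects a local fscale to be in scope at the call site and only
// takes the IPP route for the unscaled case (|fscale - 1| <= FLT_EPSILON);
// a genuinely scaled multiply always falls through to the generic mul_ loop.
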
//=======================================
// Multiply
//=======================================

void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul8u)
    float fscale = (float)*(const double*)scale;
    CALL_IPP_MUL(ippiMul_8u_C1RSfs)
    mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}

void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul8s)
    mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale);
}

void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul16u)
    float fscale = (float)*(const double*)scale;
    CALL_IPP_MUL(ippiMul_16u_C1RSfs)
    mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}

void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul16s)
    float fscale = (float)*(const double*)scale;
    CALL_IPP_MUL(ippiMul_16s_C1RSfs)
    mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}

void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul32s)
    mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul32f)
    float fscale = (float)*(const double*)scale;
    CALL_IPP_MUL(ippiMul_32f_C1R)
    mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}

void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_mul64f)
    mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

//=======================================
// Divide
//=======================================

void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
            uchar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div8u)
    if( src1 )
        div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
    else
        recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

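// A null src1 serves as an in-band signal for "reciprocal" mode
// (dst = scale / src2), which is why div8u forwards that case to recip_i
// while the remaining divN variants assume a real numerator array.
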
void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
            schar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div8s)
    div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
             ushort* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div16u)
    div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
             short* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div16s)
    div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
             int* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div32s)
    div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
             float* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div32f)
    div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
             double* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_div64f)
    div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

//=======================================
// Reciprocal
//=======================================

void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
              uchar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip8u)
    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
              schar* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip8s)
    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
               ushort* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip16u)
    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
               short* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip16s)
    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
               int* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip32s)
    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
               float* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip32f)
    recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
               double* dst, size_t step, int width, int height, void* scale)
{
    CALL_HAL(hal_recip64f)
    recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}

//=======================================

#undef CALL_HAL
#define CALL_HAL(fun) \
    int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \
    if (res == Error::Ok) \
        return; \
    else if (res != Error::NotImplemented) \
        throw Failure(res);

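// For the addWeighted family, scalars points at a double[3] holding
// {alpha, beta, gamma}, i.e. dst = saturate(src1*alpha + src2*beta + gamma).
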
//=======================================
// Add weighted
//=======================================

void addWeighted8u( const uchar* src1, size_t step1,
                    const uchar* src2, size_t step2,
                    uchar* dst, size_t step, int width, int height,
                    void* scalars )
{
    CALL_HAL(hal_addWeighted8u)
    const double* scalars_ = (const double*)scalars;
    float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2];
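
    // For 8-bit data the weights are narrowed to float: single precision is
    // ample for values that started as uchar, and it keeps the SSE2/NEON paths
    // below in cheap 4-lane float arithmetic. The 32-bit and 64-bit variants
    // after this function accumulate in double instead.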

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        int x = 0;

#if CV_SSE2
        if( USE_SSE2 )
        {
            __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
            __m128i z = _mm_setzero_si128();

            for( ; x <= width - 8; x += 8 )
            {
                __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
                __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);

                __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
                __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
                __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
                __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));

                u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
                u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
                u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);

                u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
                u = _mm_packus_epi16(u, u);

                _mm_storel_epi64((__m128i*)(dst + x), u);
            }
        }
#elif CV_NEON
        float32x4_t g = vdupq_n_f32(gamma);

        for( ; x <= width - 8; x += 8 )
        {
            uint8x8_t in1 = vld1_u8(src1+x);
            uint16x8_t in1_16 = vmovl_u8(in1);
            float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
            float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));

            uint8x8_t in2 = vld1_u8(src2+x);
            uint16x8_t in2_16 = vmovl_u8(in2);
            float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
            float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));

            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
            out_f_l = vaddq_f32(out_f_l, g);
            out_f_h = vaddq_f32(out_f_h, g);

            uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
            uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));

            uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
            uint8x8_t out = vqmovn_u16(out_16);

            vst1_u8(dst+x, out);
        }
#endif
#if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            float t0, t1;
            t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
            t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;

            dst[x] = saturate_cast<uchar>(t0);
            dst[x+1] = saturate_cast<uchar>(t1);

            t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
            t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;

            dst[x+2] = saturate_cast<uchar>(t0);
            dst[x+3] = saturate_cast<uchar>(t1);
        }
#endif

        for( ; x < width; x++ )
        {
            float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
            dst[x] = saturate_cast<uchar>(t0);
        }
    }
}

void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                    schar* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted8s)
    addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                     ushort* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted16u)
    addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
                     short* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted16s)
    addWeighted_<short, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
                     int* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted32s)
    addWeighted_<int, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
                     float* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted32f)
    addWeighted_<float, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
                     double* dst, size_t step, int width, int height, void* scalars )
{
    CALL_HAL(hal_addWeighted64f)
    addWeighted_<double, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}

}} // cv::hal::