Merge pull request #15422 from mipsopen-fwu:msa-dev
* Added MSA implementations for MIPS platforms. Intrinsics for MSA and build scripts for MIPS platforms are added. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Removed some unused code in mips.toolchain.cmake. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Added comments for the MIPS toolchain configuration and disabled compiler warnings for libpng. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Fixed the build error of unsupported opcode 'pause' when the MIPS isa_rev is less than 2. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Removed the FP16-related item from the MSA option defines in OpenCVCompilerOptimizations.cmake. 2. Use CV_CPU_COMPILE_MSA instead of __mips_msa for the MSA feature check in cv_cpu_dispatch.h. 3. Removed hasSIMD128() in intrin_msa.hpp. 4. Define CPU_MSA as 150. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Removed unnecessary CV_SIMD128_64F guards in intrin_msa.hpp. 2. Removed the unnecessary CV_MSA-related code block in dotProd_8u(). Signed-off-by: Fei Wu <fwu@wavecomp.com>
* 1. Defined CPU_MSA_FLAGS_ON as "-mmsa". 2. Removed CV_SIMD128_64F guards in intrin_msa.hpp. Signed-off-by: Fei Wu <fwu@wavecomp.com>
* Removed unused msa_mlal_u16() and msa_mlal_s16() from msa_macros.h. Signed-off-by: Fei Wu <fwu@wavecomp.com>
This commit is contained in:
committed by
Alexander Alekhin
parent
33e9fe9312
commit
b1ea91d8bd
@@ -409,13 +409,13 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
|
||||
int x = 0;
|
||||
|
||||
#if CV_SIMD
|
||||
#if !CV_NEON
|
||||
#if !CV_NEON && !CV_MSA
|
||||
if (is_aligned(src1, src2, dst))
|
||||
{
|
||||
for (; x <= width - wide_step_l; x += wide_step_l)
|
||||
{
|
||||
ldr::la(src1 + x, src2 + x, dst + x);
|
||||
#if !CV_NEON && CV_SIMD_WIDTH == 16
|
||||
#if CV_SIMD_WIDTH == 16
|
||||
ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -2476,6 +2476,45 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
|
||||
i += blockSize;
|
||||
}
|
||||
}
|
||||
#elif CV_MSA
|
||||
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
|
||||
v4i32 v_zero = msa_dupq_n_s32(0);
|
||||
CV_DECL_ALIGNED(16) int buf[4];
|
||||
|
||||
while( i < len0 )
|
||||
{
|
||||
blockSize = std::min(len0 - i, blockSize0);
|
||||
v4i32 v_sum = v_zero;
|
||||
|
||||
int j = 0;
|
||||
for( ; j <= blockSize - 16; j += 16 )
|
||||
{
|
||||
v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
|
||||
|
||||
v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
|
||||
v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
|
||||
v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
|
||||
}
|
||||
|
||||
for( ; j <= blockSize - 8; j += 8 )
|
||||
{
|
||||
v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
|
||||
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
|
||||
}
|
||||
|
||||
msa_st1q_s32(buf, v_sum);
|
||||
r += buf[0] + buf[1] + buf[2] + buf[3];
|
||||
|
||||
src1 += blockSize;
|
||||
src2 += blockSize;
|
||||
i += blockSize;
|
||||
}
|
||||
#endif
|
||||
|
||||
return r + dotProd_(src1, src2, len - i);
|
||||
|
||||
@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
|
||||
// CV_PAUSE(v): busy-wait helper that runs v iterations, each issuing a "yield" instruction. The "memory" clobber also makes every iteration a compiler-level memory barrier. NOTE(review): the preprocessor condition selecting this variant lies above this excerpt - confirm the target architecture.
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
|
||||
# elif defined __GNUC__ && defined __arm__
|
||||
// CV_PAUSE(v) for GCC-compatible compilers targeting __arm__: the asm template is empty, so no instruction is emitted; each of the v iterations acts only as a compiler-level memory barrier through the "memory" clobber.
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
|
||||
# elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
|
||||
// CV_PAUSE(v) for GCC-compatible compilers targeting MIPS: issues the "pause" instruction v times, with a "memory" clobber on each. The enclosing condition requires __mips_isa_rev >= 2 because the assembler rejects the 'pause' opcode on lower ISA revisions.
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
|
||||
# elif defined __GNUC__ && defined __PPC64__
|
||||
// CV_PAUSE(v) for GCC-compatible compilers targeting __PPC64__: executes "or 27,27,27" v times, with a "memory" clobber on each. NOTE(review): this no-op OR form is presumably the Power ISA "yield" priority hint - confirm against the Power ISA manual.
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
|
||||
# else
|
||||
|
||||
@@ -368,6 +368,8 @@ struct HWFeatures
|
||||
g_hwFeatureNames[CPU_VSX] = "VSX";
|
||||
g_hwFeatureNames[CPU_VSX3] = "VSX3";
|
||||
|
||||
g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
|
||||
|
||||
g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
|
||||
g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
|
||||
g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
|
||||
@@ -557,6 +559,9 @@ struct HWFeatures
|
||||
#if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
|
||||
have[CV_CPU_NEON] = true;
|
||||
#endif
|
||||
#ifdef __mips_msa
|
||||
have[CV_CPU_MSA] = true;
|
||||
#endif
|
||||
// there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
|
||||
have[CV_CPU_VSX] = (CV_VSX);
|
||||
// TODO: Check VSX3 availability in runtime for other platforms
|
||||
|
||||
Reference in New Issue
Block a user