Merge pull request #15422 from mipsopen-fwu:msa-dev

* Added MSA implementations for mips platforms. Intrinsics for MSA and build scripts for MIPS platforms are added.

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* Removed some unused code in mips.toolchain.cmake.

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* Added comments for mips toolchain configuration and disabled compiling warnings for libpng.

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* Fixed the build error of unsupported opcode 'pause' when mips isa_rev is less than 2.

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* 1. Removed FP16 related item in MSA option defines in OpenCVCompilerOptimizations.cmake.
2. Use CV_CPU_COMPILE_MSA instead of __mips_msa for MSA feature check in cv_cpu_dispatch.h.
3. Removed hasSIMD128() in intrin_msa.hpp.
4. Define CPU_MSA as 150.
Signed-off-by: Fei Wu <fwu@wavecomp.com>

* 1. Removed unnecessary CV_SIMD128_64F guarding in intrin_msa.hpp.
2. Removed unnecessary CV_MSA related code block in dotProd_8u().

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* 1. Defined CPU_MSA_FLAGS_ON as "-mmsa".
2. Removed CV_SIMD128_64F guardings in intrin_msa.hpp.

Signed-off-by: Fei Wu <fwu@wavecomp.com>

* Removed unused msa_mlal_u16() and msa_mlal_s16 from msa_macros.h.

Signed-off-by: Fei Wu <fwu@wavecomp.com>
This commit is contained in:
mipsopen-fwu
2019-09-21 00:52:48 +08:00
committed by Alexander Alekhin
parent 33e9fe9312
commit b1ea91d8bd
20 changed files with 4400 additions and 5 deletions
+2 -2
View File
@@ -409,13 +409,13 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
int x = 0;
#if CV_SIMD
#if !CV_NEON
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
for (; x <= width - wide_step_l; x += wide_step_l)
{
ldr::la(src1 + x, src2 + x, dst + x);
#if !CV_NEON && CV_SIMD_WIDTH == 16
#if CV_SIMD_WIDTH == 16
ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
#endif
}
+39
View File
@@ -2476,6 +2476,45 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
i += blockSize;
}
}
#elif CV_MSA
int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
v4i32 v_zero = msa_dupq_n_s32(0);
CV_DECL_ALIGNED(16) int buf[4];
while( i < len0 )
{
blockSize = std::min(len0 - i, blockSize0);
v4i32 v_sum = v_zero;
int j = 0;
for( ; j <= blockSize - 16; j += 16 )
{
v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
}
for( ; j <= blockSize - 8; j += 8 )
{
v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
}
msa_st1q_s32(buf, v_sum);
r += buf[0] + buf[1] + buf[2] + buf[3];
src1 += blockSize;
src2 += blockSize;
i += blockSize;
}
#endif
return r + dotProd_(src1, src2, len - i);
+2
View File
@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
# elif defined __GNUC__ && defined __arm__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
# elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
# elif defined __GNUC__ && defined __PPC64__
# define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
# else
+5
View File
@@ -368,6 +368,8 @@ struct HWFeatures
g_hwFeatureNames[CPU_VSX] = "VSX";
g_hwFeatureNames[CPU_VSX3] = "VSX3";
g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
@@ -557,6 +559,9 @@ struct HWFeatures
#if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
have[CV_CPU_NEON] = true;
#endif
#ifdef __mips_msa
have[CV_CPU_MSA] = true;
#endif
// there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
have[CV_CPU_VSX] = (CV_VSX);
// TODO: Check VSX3 availability in runtime for other platforms