Merge pull request #15422 from mipsopen-fwu:msa-dev

* Added MSA implementations for mips platforms. Intrinsics for MSA and build scripts for MIPS platforms are added. Signed-off-by: Fei Wu <fwu@wavecomp.com> * Removed some unused code in mips.toolchain.cmake. Signed-off-by: Fei Wu <fwu@wavecomp.com> * Added comments for mips toolchain configuration and disabled compiling warnings for libpng. Signed-off-by: Fei Wu <fwu@wavecomp.com> * Fixed the build error of unsupported opcode 'pause' when mips isa_rev is less than 2. Signed-off-by: Fei Wu <fwu@wavecomp.com> * 1. Removed FP16 related item in MSA option defines in OpenCVCompilerOptimizations.cmake. 2. Use CV_CPU_COMPILE_MSA instead of __mips_msa for MSA feature check in cv_cpu_dispatch.h. 3. Removed hasSIMD128() in intrin_msa.hpp. 4. Define CPU_MSA as 150. Signed-off-by: Fei Wu <fwu@wavecomp.com> * 1. Removed unnecessary CV_SIMD128_64F guarding in intrin_msa.hpp. 2. Removed unnecessary CV_MSA related code block in dotProd_8u(). Signed-off-by: Fei Wu <fwu@wavecomp.com> * 1. Defined CPU_MSA_FLAGS_ON as "-mmsa". 2. Removed CV_SIMD128_64F guardings in intrin_msa.hpp. Signed-off-by: Fei Wu <fwu@wavecomp.com> * Removed unused msa_mlal_u16() and msa_mlal_s16 from msa_macros.h. Signed-off-by: Fei Wu <fwu@wavecomp.com>
2019-09-21 00:52:48 +08:00
parent 33e9fe9312
commit b1ea91d8bd
20 changed files with 4400 additions and 5 deletions
@@ -409,13 +409,13 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
        int x = 0;

    #if CV_SIMD
-        #if !CV_NEON
+        #if !CV_NEON && !CV_MSA
        if (is_aligned(src1, src2, dst))
        {
            for (; x <= width - wide_step_l; x += wide_step_l)
            {
                ldr::la(src1 + x, src2 + x, dst + x);
-                #if !CV_NEON && CV_SIMD_WIDTH == 16
+                #if CV_SIMD_WIDTH == 16
                ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step);
                #endif
            }
@@ -2476,6 +2476,45 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
            i += blockSize;
        }
    }
+#elif CV_MSA
+    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+    v4i32 v_zero = msa_dupq_n_s32(0);
+    CV_DECL_ALIGNED(16) int buf[4];
+
+    while( i < len0 )
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v4i32 v_sum = v_zero;
+
+        int j = 0;
+        for( ; j <= blockSize - 16; j += 16 )
+        {
+            v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
+
+            v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+
+            v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
+            v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
+        }
+
+        for( ; j <= blockSize - 8; j += 8 )
+        {
+            v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
+            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
+            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
+        }
+
+        msa_st1q_s32(buf, v_sum);
+        r += buf[0] + buf[1] + buf[2] + buf[3];
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
 #endif

    return r + dotProd_(src1, src2, len - i);
@@ -59,6 +59,8 @@ DECLARE_CV_PAUSE
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("yield" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __arm__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("" ::: "memory"); } } while (0)
+# elif defined __GNUC__ && defined __mips__ && __mips_isa_rev >= 2
+#   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("pause" ::: "memory"); } } while (0)
 # elif defined __GNUC__ && defined __PPC64__
 #   define CV_PAUSE(v) do { for (int __delay = (v); __delay > 0; --__delay) { asm volatile("or 27,27,27" ::: "memory"); } } while (0)
 # else
@@ -368,6 +368,8 @@ struct HWFeatures
        g_hwFeatureNames[CPU_VSX] = "VSX";
        g_hwFeatureNames[CPU_VSX3] = "VSX3";

+        g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
+
        g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
        g_hwFeatureNames[CPU_AVX512_KNL] = "AVX512-KNL";
        g_hwFeatureNames[CPU_AVX512_KNM] = "AVX512-KNM";
@@ -557,6 +559,9 @@ struct HWFeatures
    #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
        have[CV_CPU_NEON] = true;
    #endif
+    #ifdef __mips_msa
+        have[CV_CPU_MSA] = true;
+    #endif
    // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
    have[CV_CPU_VSX] = (CV_VSX);
    // TODO: Check VSX3 availability in runtime for other platforms