diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 25cd0d2f68..c8dfbc039d 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) ocv_add_dispatched_file(convert SSE2 AVX2 VSX3) ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) -ocv_add_dispatched_file(matmul SSE2 AVX2) +ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX) ocv_add_dispatched_file(mean SSE2 AVX2) ocv_add_dispatched_file(merge SSE2 AVX2) ocv_add_dispatched_file(split SSE2 AVX2) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index c821dc3991..8f6c982c72 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1431,6 +1431,28 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a) inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x4 v_cvt_f64(const v_int64x4& v) +{ + // constants encoded as floating-point + __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52 + __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 + __m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m256d magic_d_all = _mm256_castsi256_pd(magic_i_all); + + // Blend the 32 least significant bits of v with magic_i_lo + __m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55); + // Extract the 32 most significant bits of v + __m256i v_hi = _mm256_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); + // Compute in double precision + __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); + return v_float64x4(result); +} + ////////////// Lookup table access //////////////////// inline v_int8x32 v256_lut(const schar* tab, const int* idx) @@ -1638,12 +1660,165 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) ////////// Matrix operations ///////// +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } - inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) { return v_dotprod(a, b) + c; } +// 32 >> 64 +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) +{ + __m256i even = _mm256_mul_epi32(a.val, b.val); + __m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32)); + return v_int64x4(_mm256_add_epi64(even, odd)); +} +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i even_m = _mm256_set1_epi32(0xFF00FF00); + __m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m); + __m256i odd_a = _mm256_srli_epi16(a.val, 8); + + __m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m); + __m256i odd_b = _mm256_srli_epi16(b.val, 8); + + __m256i prod0 = _mm256_madd_epi16(even_a, even_b);
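+ // Each 16-bit lane of even_a/odd_a holds a zero-extended byte, so the pairwise + // products summed by _mm256_madd_epi16 are at most 2*255*255 and cannot overflow + // the signed 32-bit result lanes; the signed madd is therefore safe for unsigned inputs.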
+ __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b); + return v_uint32x8(_mm256_add_epi32(prod0, prod1)); +} +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) +{ + __m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8); + __m256i odd_a = _mm256_srai_epi16(a.val, 8); + + __m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8); + __m256i odd_b = _mm256_srai_epi16(b.val, 8); + + __m256i prod0 = _mm256_madd_epi16(even_a, even_b); + __m256i prod1 = _mm256_madd_epi16(odd_a, odd_b); + return v_int32x8(_mm256_add_epi32(prod0, prod1)); +} +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = _mm256_mullo_epi16(a.val, b.val); + __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val); + __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi); + __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi); + + __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA); + __m256i p13 = _mm256_srli_epi64(mul0, 32); + __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA); + __m256i p57 = _mm256_srli_epi64(mul1, 32); + + __m256i p15_ = _mm256_add_epi64(p02, p13); + __m256i p9d_ = _mm256_add_epi64(p46, p57); + + return v_uint64x4(_mm256_add_epi64( + _mm256_unpacklo_epi64(p15_, p9d_), + _mm256_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = _mm256_madd_epi16(a.val, b.val); + __m256i sign = _mm256_srai_epi32(prod, 31); + + __m256i lo = _mm256_unpacklo_epi32(prod, sign); + __m256i hi = _mm256_unpackhi_epi32(prod, sign); + + return v_int64x4(_mm256_add_epi64( + _mm256_unpacklo_epi64(lo, hi), + _mm256_unpackhi_epi64(lo, hi) + )); +} +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b) +{ return v_dotprod(a, b); } +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod(a, b); } +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const 
v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i mullo = _mm256_mullo_epi16(a.val, b.val); + __m256i mulhi = _mm256_mulhi_epu16(a.val, b.val); + __m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi); + __m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi); + + __m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA); + __m256i p13 = _mm256_srli_epi64(mul0, 32); + __m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA); + __m256i p57 = _mm256_srli_epi64(mul1, 32); + + __m256i p15_ = _mm256_add_epi64(p02, p13); + __m256i p9d_ = _mm256_add_epi64(p46, p57); + + return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); +} +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = _mm256_madd_epi16(a.val, b.val); + __m256i sign = _mm256_srai_epi32(prod, 31); + __m256i lo = _mm256_unpacklo_epi32(prod, sign); + __m256i hi = _mm256_unpackhi_epi32(prod, sign); + return v_int64x4(_mm256_add_epi64(lo, hi)); +} +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b, c); } + #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \ v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index d4edf0cdd1..844c546e38 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -1473,6 +1473,32 @@ inline v_float64x8 v_cvt_f64(const v_float32x16& a) inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) { return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x8 v_cvt_f64(const v_int64x8& v) +{ +#if CV_AVX_512DQ + return v_float64x8(_mm512_cvtepi64_pd(v.val)); +#else + // constants encoded as floating-point + __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52 + __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63 + __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all); + + // Blend the 32 least significant bits of v with magic_i_lo + __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val); + // Extract the 32 most significant bits of v + __m512i v_hi = _mm512_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm512_xor_si512(v_hi, magic_i_hi32); + // Compute in double precision + __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo)); + return v_float64x8(result); +#endif +} + ////////////// Lookup table access //////////////////// inline v_int8x64 v512_lut(const schar* tab, const int* idx)
@@ -1672,12 +1698,152 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) ////////// Matrix operations ///////// +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } - inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) { return v_dotprod(a, b) + c; } +// 32 >> 64 +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) +{ + __m512i even = _mm512_mul_epi32(a.val, b.val); + __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512()); + __m512i odd_a = _mm512_srli_epi16(a.val, 8); + + __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512()); + __m512i odd_b = _mm512_srli_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_uint32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) +{ + __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8); + __m512i odd_a = _mm512_srai_epi16(a.val, 8); + + __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8); + __m512i odd_b = _mm512_srai_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_int32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + + return v_uint64x8(_mm512_add_epi64( + _mm512_unpacklo_epi64(p15_, p9d_), + _mm512_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) +{ + __m512i prod = _mm512_madd_epi16(a.val, b.val); + __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32); + __m512i odd = _mm512_srai_epi64(prod, 32); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, 
const v_int32x16& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod(a, b); } +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod(a, b); } +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); +} +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + + #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 75c39acf39..9b3dc84681 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -171,7 +171,8 @@ Different type conversions and casts: ### Matrix operations -In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4 +In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast, +@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4 ### Usability @@ -195,7 +196,10 @@ Regular integers: |mul_expand | x | x | x | x | x | | |compare | x | x | 
x | x | x | x | |shift | | | x | x | x | x | -|dotprod | | | | x | | | +|dotprod | | | | x | | x | +|dotprod_fast | | | | x | | x | +|dotprod_expand | x | x | x | x | | x | +|dotprod_expand_fast| x | x | x | x | | x | |logical | x | x | x | x | x | x | |min, max | x | x | x | x | x | x | |absdiff | x | x | x | x | x | x | @@ -222,6 +226,7 @@ Big integers: |logical | x | x | |extract | x | x | |rotate (lanes) | x | x | +|cvt_flt64 | | x | Floating point: @@ -853,17 +858,18 @@ inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, /** @brief Dot product of elements Multiply values in two registers and sum adjacent result pairs. + Scheme: @code {A1 A2 ...} // 16-bit x {B1 B2 ...} // 16-bit ------------- {A1B1+A2B2 ...} // 32-bit + @endcode -Implemented only for 16-bit signed source type (v_int16x8). */ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) { typedef typename V_TypeTraits<_Tp>::w_type w_type; v_reg<w_type, n/2> c; @@ -881,12 +887,11 @@ Scheme: x {B1 B2 ...} // 16-bit ------------- {A1B1+A2B2+C1 ...} // 32-bit - @endcode -Implemented only for 16-bit signed source type (v_int16x8). */ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> - v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) +v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) { typedef typename V_TypeTraits<_Tp>::w_type w_type; v_reg<w_type, n/2> s; @@ -895,6 +900,95 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n return s; } +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod, but on some platforms the result pairs may be summed in an unspecified order. +Use this intrinsic when only the sum over all lanes matters; +on the affected platforms it should also yield better performance. + +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod(a, b); } + +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod_fast, but adds a third element to the sum of adjacent pairs. +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> +v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c) +{ return v_dotprod(a, b, c); } + +/** @brief Dot product of elements and expand + +Multiply values in two registers and expand the sum of adjacent result pairs. + +Scheme: +@code + {A1 A2 A3 A4 ...} // 8-bit +x {B1 B2 B3 B4 ...} // 8-bit +------------- + {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit + +@endcode +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg<q_type, n/4> s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3]; + return s; +} + +/** @brief Dot product of elements and expand + +Same as cv::v_dotprod_expand, but adds a third element to the sum of adjacent pairs.
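+ +A minimal usage sketch with the unsigned 8-bit overload (v_setall_u8 and v_setzero_u32 are the usual initializer intrinsics from this header): +@code +v_uint8x16 a = v_setall_u8(1); +v_uint8x16 b = v_setall_u8(2); +v_uint32x4 s = v_dotprod_expand(a, b, v_setzero_u32()); // each lane: 1*2 + 1*2 + 1*2 + 1*2 + 0 = 8 +@endcode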
+Scheme: +@code + {A1 A2 A3 A4 ...} // 8-bit +x {B1 B2 B3 B4 ...} // 8-bit +------------- + {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit +@endcode +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c) +{ + typedef typename V_TypeTraits<_Tp>::q_type q_type; + v_reg<q_type, n/4> s; + for( int i = 0; i < (n/4); i++ ) + s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] + + (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i]; + return s; +} + +/** @brief Fast Dot product of elements and expand + +Multiply values in two registers and expand the sum of adjacent result pairs. + +Same as cv::v_dotprod_expand, but on some platforms the result pairs may be summed in an unspecified order. +Use this intrinsic when only the sum over all lanes matters; +on the affected platforms it should also yield better performance. + +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) +{ return v_dotprod_expand(a, b); } + +/** @brief Fast Dot product of elements + +Same as cv::v_dotprod_expand_fast, but adds a third element to the sum of adjacent pairs. +*/ +template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4> +v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, + const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c) +{ return v_dotprod_expand(a, b, c); } + /** @brief Multiply and expand Multiply values in two registers and store results in two registers with wider pack type. @@ -1810,6 +1904,17 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n>& a) return c; } +/** @brief Convert to double + +Supported input type is cv::v_int64x2. */ +template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a) +{ + v_reg<double, n> c; + for( int i = 0; i < n; i++ ) + c.s[i] = (double)a.s[i]; + return c; +} + template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx) { v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c; diff --git a/modules/core/include/opencv2/core/hal/intrin_forward.hpp b/modules/core/include/opencv2/core/hal/intrin_forward.hpp index 6873633165..979f15a277 100644 --- a/modules/core/include/opencv2/core/hal/intrin_forward.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_forward.hpp @@ -160,6 +160,16 @@ void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __ void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&); #endif +// Conversions +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a); +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a); +__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a); +__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a); +__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a); +__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a); + /** Cleanup **/ #undef CV__SIMD_FORWARD #undef __CV_VX diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 6a964841f8..5ece9c131e 100755 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -461,15 +461,124 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16)); } +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return
v_int32x4(msa_dotp_s_w(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { - return v_int32x4(msa_dotp_s_w(a.val, b.val)); + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dotp_u_w(even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); } -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { - return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); + v8i16 prod = msa_dotp_s_h(a.val, b.val); + return v_int32x4(msa_hadd_s32(prod, prod)); } +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dotp_u_d(even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, + const v_uint64x2& c) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v4i32 prod = msa_dotp_s_w(a.val, b.val); + return v_int64x2(msa_hadd_s64(prod, prod)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 
v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ @@ -1311,6 +1420,11 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) return v_float64x2(msa_cvtfhq_f64_f32(a.val)); } +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + return v_float64x2(msa_cvtfintq_f64_s64(a.val)); +} + ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) { diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index e5f707ca57..3e8321aca3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -62,23 +62,63 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128_64F 0 #endif +// TODO +#define CV_NEON_DOT 0 + +//////////// Utils //////////// + +#if CV_SIMD128_64F +#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); } +#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \ + { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); } +#else +#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; } +#define 
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \ + inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \ + { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; } +#endif + #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \ -template <typename T> static inline \ -_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \ -template <typename T> static inline \ -float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; } -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32) -OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64) -OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64) -OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32) + template <typename T> static inline \ + _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \ + template <typename T> static inline \ + float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; } +#else +#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) #endif +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \ + OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix) + +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix) + +#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \ + OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) + +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64) +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) +#if CV_SIMD128_64F +OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1, f64) +#endif + +//////////// Types //////////// + struct v_uint8x16 { typedef uchar lane_type; @@ -528,20 +568,272 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) )); } +//////// Dot Product //////// + +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { - int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); - int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); - int32x4x2_t cd = vuzpq_s32(c, d); - return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); + int16x8_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int16x4_t a0 = vget_low_s16(uzp1); + int16x4_t b0 = vget_high_s16(uzp1); + int16x4_t a1 = vget_low_s16(uzp2); + int16x4_t b1 = vget_high_s16(uzp2); + int32x4_t p = vmull_s16(a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); } - inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { - v_int32x4 s = v_dotprod(a, b); - return v_int32x4(vaddq_s32(s.val , c.val)); + int16x8_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int16x4_t a0 = vget_low_s16(uzp1); + int16x4_t b0 = vget_high_s16(uzp1); + int16x4_t a1 = vget_low_s16(uzp2); + int16x4_t b1 = vget_high_s16(uzp2); + int32x4_t p =
vmlal_s16(c.val, a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); } +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + int32x4_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int32x2_t a0 = vget_low_s32(uzp1); + int32x2_t b0 = vget_high_s32(uzp1); + int32x2_t a1 = vget_low_s32(uzp2); + int32x2_t b1 = vget_high_s32(uzp2); + int64x2_t p = vmull_s32(a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int32x4_t uzp1, uzp2; + _v128_unzip(a.val, b.val, uzp1, uzp2); + int32x2_t a0 = vget_low_s32(uzp1); + int32x2_t b0 = vget_high_s32(uzp1); + int32x2_t a1 = vget_low_s32(uzp2); + int32x2_t b1 = vget_high_s32(uzp2); + int64x2_t p = vmlal_s32(c.val, a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); +#else + const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0)); + const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF)); + const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0)); + const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF)); + + uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)), + vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero))); + uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8), + vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8)); + + uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)), + vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32))); + uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16), + vshrq_n_u32(vreinterpretq_u32_u16(odd), 16)); + return v_uint32x4(vaddq_u32(s0, s1)); +#endif +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, + const v_uint32x4& c) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); +#else + return v_dotprod_expand(a, b) + c; +#endif +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); +#else + int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); + int16x8_t uzp1, uzp2; + _v128_unzip(p0, p1, uzp1, uzp2); + int16x8_t sum = vaddq_s16(uzp1, uzp2); + int16x4_t uzpl1, uzpl2; + _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2); + return v_int32x4(vaddl_s16(uzpl1, uzpl2)); +#endif +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(c.val, a.val, b.val)); +#else + return v_dotprod_expand(a, b) + c; +#endif +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0)); + const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF)); + + uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)), + vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero))); + uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16), + vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16)); + uint32x4_t uzp1, uzp2; + _v128_unzip(even, odd, uzp1, uzp2); + uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1)); + uint64x2_t s1 = 
vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2)); + return v_uint64x2(vaddq_u64(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); + + int32x4_t uzp1, uzp2; + _v128_unzip(p0, p1, uzp1, uzp2); + int32x4_t sum = vaddq_s32(uzp1, uzp2); + + int32x2_t uzpl1, uzpl2; + _v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2); + return v_int64x2(vaddl_s32(uzpl1, uzpl2)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, + const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, + const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } +#endif + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ + int16x4_t a0 = vget_low_s16(a.val); + int16x4_t a1 = vget_high_s16(a.val); + int16x4_t b0 = vget_low_s16(b.val); + int16x4_t b1 = vget_high_s16(b.val); + int32x4_t p = vmull_s16(a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); +} +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + int16x4_t a0 = vget_low_s16(a.val); + int16x4_t a1 = vget_high_s16(a.val); + int16x4_t b0 = vget_low_s16(b.val); + int16x4_t b1 = vget_high_s16(b.val); + int32x4_t p = vmlal_s16(c.val, a0, b0); + return v_int32x4(vmlal_s16(p, a1, b1)); +} + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ + int32x2_t a0 = vget_low_s32(a.val); + int32x2_t a1 = vget_high_s32(a.val); + int32x2_t b0 = vget_low_s32(b.val); + int32x2_t b1 = vget_high_s32(b.val); + int64x2_t p = vmull_s32(a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + int32x2_t a0 = vget_low_s32(a.val); + int32x2_t a1 = vget_high_s32(a.val); + int32x2_t b0 = vget_low_s32(b.val); + int32x2_t b1 = vget_high_s32(b.val); + int64x2_t p = vmlal_s32(c.val, a0, b0); + return v_int64x2(vmlal_s32(p, a1, b1)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); +#else + uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); + uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); + uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1)); + uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1)); + return v_uint32x4(vaddq_u32(s0, s1)); +#endif +} +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ +#if CV_NEON_DOT + return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); +#else + return v_dotprod_expand_fast(a, b) + c; +#endif +} + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); +#else + int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); + prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val)); + 
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod))); +#endif +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ +#if CV_NEON_DOT + return v_int32x4(vdotq_s32(c.val, a.val, b.val)); +#else + return v_dotprod_expand_fast(a, b) + c; +#endif +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val)); + uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); + uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0)); + uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1)); + return v_uint64x2(vaddq_u64(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val)); + prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val)); + return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod))); +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +#if CV_SIMD128_64F +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod_fast(a, b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } +#endif + + #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ @@ -1593,6 +1885,10 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val))); } + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ return v_float64x2(vcvtq_f64_s64(a.val)); } + #endif ////////////// Lookup table access //////////////////// diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index f661c58010..c4de1195b5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -225,9 +225,13 @@ struct v_uint64x2 } uint64 get0() const { + #if !defined(__x86_64__) && !defined(_M_X64) int a = _mm_cvtsi128_si32(val); int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (unsigned)a | ((uint64)(unsigned)b << 32); + #else + return (uint64)_mm_cvtsi128_si64(val); + #endif } __m128i val; @@ -247,9 +251,13 @@ struct v_int64x2 } int64 get0() const { + #if !defined(__x86_64__) && !defined(_M_X64) int a = _mm_cvtsi128_si32(val); int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #else + return _mm_cvtsi128_si64(val); + #endif } __m128i val; @@ -791,15 +799,195 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); } -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) -{ - return v_int32x4(_mm_madd_epi16(a.val, b.val)); -} +//////// Dot Product //////// +// 16 >> 32 +inline v_int32x4 
v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { - return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val)); +#if CV_SSE4_1 + __m128i even = _mm_mul_epi32(a.val, b.val); + __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + return v_int64x2(_mm_add_epi64(even, odd)); +#else + __m128i even_u = _mm_mul_epu32(a.val, b.val); + __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); + // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132) + __m128i a_sign = _mm_srai_epi32(a.val, 31); + __m128i b_sign = _mm_srai_epi32(b.val, 31); + // |x * sign of x + __m128i axb = _mm_and_si128(a.val, b_sign); + __m128i bxa = _mm_and_si128(b.val, a_sign); + // sum of sign corrections + __m128i ssum = _mm_add_epi32(bxa, axb); + __m128i even_ssum = _mm_slli_epi64(ssum, 32); + __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0)); + // convert to signed and prod + return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum))); +#endif } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even + __m128i a1 = _mm_srli_epi16(a.val, 8); // odd + __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8); + __m128i b1 = _mm_srli_epi16(b.val, 8); + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_uint32x4(_mm_add_epi32(p0, p1)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even + __m128i a1 = _mm_srai_epi16(a.val, 8); // odd + __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8); + __m128i b1 = _mm_srai_epi16(b.val, 8); + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_int32x4(_mm_add_epi32(p0, p1)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 c, d; + v_mul_expand(a, b, c, d); + + v_uint64x2 c0, c1, d0, d1; + v_expand(c, c0, c1); + v_expand(d, d0, d1); + + c0 += c1; d0 += d1; + return v_uint64x2(_mm_add_epi64( + _mm_unpacklo_epi64(c0.val, d0.val), + _mm_unpackhi_epi64(c0.val, d0.val) + )); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return v_int64x2(_mm_add_epi64( + _mm_unpacklo_epi64(c.val, d.val), + _mm_unpackhi_epi64(c.val, d.val) + )); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f 
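+// Note: _mm_mul_epi32 (the signed 32x32->64 multiply) requires SSE4.1, so the pre-SSE4.1 +// path below converts each 32-bit lane to double (an exact conversion) and forms the +// products and pairwise sums directly in double precision.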
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ +#if CV_SSE4_1 + return v_cvt_f64(v_dotprod(a, b)); +#else + v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); + v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + + return v_float64x2(_mm_add_pd( + _mm_unpacklo_pd(c.val, d.val), + _mm_unpackhi_pd(c.val, d.val) + )); +#endif +} +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod_fast(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = v_expand_low(a).val; + __m128i a1 = v_expand_high(a).val; + __m128i b0 = v_expand_low(b).val; + __m128i b1 = v_expand_high(b).val; + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_uint32x4(_mm_add_epi32(p0, p1)); +} +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ +#if CV_SSE4_1 + __m128i a0 = _mm_cvtepi8_epi16(a.val); + __m128i a1 = v_expand_high(a).val; + __m128i b0 = _mm_cvtepi8_epi16(b.val); + __m128i b1 = v_expand_high(b).val; + __m128i p0 = _mm_madd_epi16(a0, b0); + __m128i p1 = _mm_madd_epi16(a1, b1); + return v_int32x4(_mm_add_epi32(p0, p1)); +#else + return v_dotprod_expand(a, b); +#endif +} +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 c, d; + v_mul_expand(a, b, c, d); + + v_uint64x2 c0, c1, d0, d1; + v_expand(c, c0, c1); + v_expand(d, d0, d1); + + c0 += c1; d0 += d1; + return c0 + d0; +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return c + d; +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +// 32 >> 64f +v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ @@ -2739,6 +2927,32 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) return 
v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val))); } +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x2 v_cvt_f64(const v_int64x2& v) +{ + // constants encoded as floating-point + __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 + __m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m128d magic_d_all = _mm_castsi128_pd(magic_i_all); + // Blend the 32 least significant bits of v with magic_i_lo +#if CV_SSE4_1 + __m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 + __m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc); +#else + __m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52 + __m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo); +#endif + // Extract the 32 most significant bits of v + __m128i v_hi = _mm_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm_xor_si128(v_hi, magic_i_hi32); + // Compute in double precision + __m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition + __m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)); + return v_float64x2(result); +} + ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 4f41021875..0d65ca5e7a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -499,12 +499,6 @@ inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d) v_zip(p0, p1, c, d); } -inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d) -{ - c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val)); - d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val)); -} - inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { vec_int4 p0 = vec_mule(a.val, b.val); @@ -1043,14 +1037,8 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a) inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); } -// The altivec intrinsic is missing for this 2.06 insn inline v_float64x2 v_cvt_f64(const v_int64x2& a) -{ -vec_double2 out; - -__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val)); -return v_float64x2(out); -} +{ return v_float64x2(vec_ctd(a.val)); } ////////////// Lookup table access //////////////////// @@ -1322,12 +1310,134 @@ inline void v_cleanup() {} ////////// Matrix operations ///////// +//////// Dot Product //////// +// 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } - inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { return v_int32x4(vec_msum(a.val, b.val, c.val)); } +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + vec_dword2 even = vec_mule(a.val, b.val); + vec_dword2 odd = vec_mulo(a.val, b.val); + return v_int64x2(vec_add(even, odd)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); } +inline
v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + const vec_ushort8 eight = vec_ushort8_sp(8); + vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even + vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd + vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); + vec_short8 b1 = vec_sra((vec_short8)b.val, eight); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ + const vec_ushort8 eight = vec_ushort8_sp(8); + vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even + vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd + vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); + vec_short8 b1 = vec_sra((vec_short8)b.val, eight); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val))); +} + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + const vec_uint4 zero = vec_uint4_z; + vec_uint4 even = vec_mule(a.val, b.val); + vec_uint4 odd = vec_mulo(a.val, b.val); + vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero); + vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero); + vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero); + vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero); + vec_udword2 s0 = vec_add(e0, o0); + vec_udword2 s1 = vec_add(e1, o1); + return v_uint64x2(vec_add(s0, s1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 prod = v_dotprod(a, b); + v_int64x2 c, d; + v_expand(prod, c, d); + return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; } +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ + vec_short8 a0 = vec_unpackh(a.val); + vec_short8 a1 = vec_unpackl(a.val); + vec_short8 b0 = vec_unpackh(b.val); + vec_short8 b1 = vec_unpackl(b.val); + return v_int32x4(vec_msum(a0, b0, vec_msum(a1, 
 
 inline v_float32x4 v_matmul(const v_float32x4& v,
                             const v_float32x4& m0, const v_float32x4& m1,
                             const v_float32x4& m2, const v_float32x4& m3)
diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
index 302d834147..f2da617cfe 100644
--- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp
@@ -682,6 +682,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
     return s;
 }
 
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> s;
+    for( int i = 0; i < (n/4); i++ )
+        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
+                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
+    return s;
+}
+
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+    v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                     const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> s;
+    for( int i = 0; i < (n/4); i++ )
+        s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
+                 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
+    return s;
+}
+
 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
@@ -1282,6 +1305,14 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return c;
 }
 
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    v_float64x2 c;
+    for( int i = 0; i < 2; i++ )
+        c.s[i] = (double)a.s[i];
+    return c;
+}
+
 template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
 {
     v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
@@ -2398,6 +2429,8 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
     return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
 }
 
+//////// Dot Product ////////
+
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
     v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
@@ -2410,15 +2443,140 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 }
 
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b) + c; }
+
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
 {
-    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
-    v128_t a1 = wasm_i32x4_shr(a.val, 16);
-    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
-    v128_t b1 = wasm_i32x4_shr(b.val, 16);
-    v128_t d = wasm_i32x4_mul(a0, b0);
-    v128_t e = wasm_i32x4_mul(a1, b1);
-    return v_int32x4(wasm_i32x4_add(wasm_i32x4_add(d, e), c.val));
+#ifdef __wasm_unimplemented_simd128__
+    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
+    v128_t a1 = wasm_i64x2_shr(a.val, 32);
+    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
+    v128_t b1 = wasm_i64x2_shr(b.val, 32);
+    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
+    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
+    return v_int64x2(wasm_i64x2_add(c, d));
+#else
+    fallback::v_int32x4 a_(a);
+    fallback::v_int32x4 b_(b);
+    return fallback::v_dotprod(a_, b_);
+#endif
 }
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+#ifdef __wasm_unimplemented_simd128__
+    return v_dotprod(a, b) + c;
+#else
+    fallback::v_int32x4 a_(a);
+    fallback::v_int32x4 b_(b);
+    fallback::v_int64x2 c_(c);
+    return fallback::v_dotprod(a_, b_, c_);
+#endif
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    v128_t a1 = wasm_u16x8_shr(a.val, 8);
+    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    v128_t b1 = wasm_u16x8_shr(b.val, 8);
+    return v_uint32x4((
+        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
+        v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
+    );
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    v128_t a1 = wasm_i16x8_shr(a.val, 8);
+    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    v128_t b1 = wasm_i16x8_shr(b.val, 8);
+    return v_int32x4(
+        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
+        v_dotprod(v_int16x8(a1), v_int16x8(b1))
+    );
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    fallback::v_uint16x8 a_(a);
+    fallback::v_uint16x8 b_(b);
+    return fallback::v_dotprod_expand(a_, b_);
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    fallback::v_uint16x8 a_(a);
+    fallback::v_uint16x8 b_(b);
+    fallback::v_uint64x2 c_(c);
+    return fallback::v_dotprod_expand(a_, b_, c_);
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    fallback::v_int16x8 a_(a);
+    fallback::v_int16x8 b_(b);
+    return fallback::v_dotprod_expand(a_, b_);
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    fallback::v_int16x8 a_(a);
+    fallback::v_int16x8 b_(b);
+    fallback::v_int64x2 c_(c);
+    return fallback::v_dotprod_expand(a_, b_, c_);
+}
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
 
 #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
 OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
@@ -3815,6 +3973,16 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
     return fallback::v_cvt_f64_high(a_);
 }
 
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+#ifdef __wasm_unimplemented_simd128__
+    return v_float64x2(wasm_convert_f64x2_i64x2(a.val));
+#else
+    fallback::v_int64x2 a_(a);
+    return fallback::v_cvt_f64(a_);
+#endif
+}
+
 ////////////// Lookup table access ////////////////////
 
 inline v_int8x16 v_lut(const schar* tab, const int* idx)
diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp
index 0f1029de83..91669bff31 100644
--- a/modules/core/include/opencv2/core/vsx_utils.hpp
+++ b/modules/core/include/opencv2/core/vsx_utils.hpp
@@ -144,10 +144,10 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
 VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)
 
 // dword2 support arrived in ISA 2.07 and GCC 8+
-    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mule)
-    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mule)
-    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mulo)
-    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mulo)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mule)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mulo)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
 #endif
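A note on the vsx_utils.hpp change above: these VSX_IMPL_2VRG macros emit the raw vmule*/vmulo* instructions for compilers that lack the dword2 builtins, and on little-endian POWER the hardware's big-endian element numbering is reversed relative to what vec_mule/vec_mulo promise, so "even" must map to the odd instruction and vice versa. A rough scalar picture of what the builtins must return, with memory-order lane indices (mule_mulo_ref is a hypothetical name for illustration):

#include <cstdint>

// What vec_mule/vec_mulo are expected to produce for int32x4 inputs,
// indices counted in memory order.
static void mule_mulo_ref(const int32_t a[4], const int32_t b[4],
                          int64_t even[2], int64_t odd[2])
{
    for (int i = 0; i < 2; ++i)
    {
        even[i] = (int64_t)a[2*i]     * b[2*i];     // lanes 0, 2
        odd[i]  = (int64_t)a[2*i + 1] * b[2*i + 1]; // lanes 1, 3
    }
}

For v_dotprod itself the swap is invisible (even + odd commutes), but callers that depend on lane placement, such as v_mul_hi and the 16 >> 64 expansion above, need the mapping to match this model.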
diff --git a/modules/core/perf/perf_dot.cpp b/modules/core/perf/perf_dot.cpp
index 5321962319..1230220a4c 100644
--- a/modules/core/perf/perf_dot.cpp
+++ b/modules/core/perf/perf_dot.cpp
@@ -9,7 +9,7 @@ typedef TestBaseWithParam<MatType_Length_t> MatType_Length;
 
 PERF_TEST_P( MatType_Length, dot,
              testing::Combine(
-                 testing::Values( CV_8UC1, CV_32SC1, CV_32FC1 ),
+                 testing::Values( CV_8UC1, CV_8SC1, CV_16SC1, CV_16UC1, CV_32SC1, CV_32FC1 ),
                  testing::Values( 32, 64, 128, 256, 512, 1024 )
                  ))
 {
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index 5d188029b1..1360ed46d6 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2320,26 +2320,22 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
-        v_int32 v_sum = vx_setzero_s32();
+        v_uint32 v_sum = vx_setzero_u32();
         const int cWidth = v_uint16::nlanes;
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            v_uint16 v_src10, v_src20, v_src11, v_src21;
-            v_expand(vx_load(src1 + j), v_src10, v_src11);
-            v_expand(vx_load(src2 + j), v_src20, v_src21);
-
-            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
-            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
+            v_uint8 v_src1 = vx_load(src1 + j);
+            v_uint8 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
         }
 
         for (; j <= blockSize - cWidth; j += cWidth)
         {
             v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
             v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
-
-            v_sum += v_dotprod(v_src10, v_src20);
+            v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
         }
         r += (double)v_reduce_sum(v_sum);
@@ -2348,48 +2344,6 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
         i += blockSize;
     }
     vx_cleanup();
-#elif CV_NEON
-    if( cv::checkHardwareSupport(CV_CPU_NEON) )
-    {
-        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
-        uint32x4_t v_zero = vdupq_n_u32(0u);
-        CV_DECL_ALIGNED(16) uint buf[4];
-
-        while( i < len0 )
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            uint32x4_t v_sum = v_zero;
-
-            int j = 0;
-            for( ; j <= blockSize - 16; j += 16 )
-            {
-                uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);
-
-                uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
-
-                v_src10 = vmovl_u8(vget_high_u8(v_src1));
-                v_src20 = vmovl_u8(vget_high_u8(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
-            }
-
-            for( ; j <= blockSize - 8; j += 8 )
-            {
-                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
-                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
-                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
-            }
-
-            vst1q_u32(buf, v_sum);
-            r += buf[0] + buf[1] + buf[2] + buf[3];
-
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
-    }
 #endif
     return r + dotProd_(src1, src2, len - i);
 }
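Why blockSize0 = (1 << 15) is safe in the 8-bit path above: every product is at most 255 * 255 = 65025, and a single u32 lane can never accumulate more than the whole block, i.e. at most 2^15 * 65025 < 2^32. A compile-time restatement of that bound, illustrative only and not part of the patch:

#include <cstdint>

// Worst case for dotProd_8u blocking: one lane absorbing the entire block.
static_assert((uint64_t)(1 << 15) * 255 * 255 < ((uint64_t)1 << 32),
              "a 32-bit lane cannot overflow within one 2^15-element block");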
@@ -2412,20 +2366,16 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            v_int16 v_src10, v_src20, v_src11, v_src21;
-            v_expand(vx_load(src1 + j), v_src10, v_src11);
-            v_expand(vx_load(src2 + j), v_src20, v_src21);
-
-            v_sum += v_dotprod(v_src10, v_src20);
-            v_sum += v_dotprod(v_src11, v_src21);
+            v_int8 v_src1 = vx_load(src1 + j);
+            v_int8 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
         }
 
         for (; j <= blockSize - cWidth; j += cWidth)
         {
-            v_int16 v_src10 = vx_load_expand(src1 + j);
-            v_int16 v_src20 = vx_load_expand(src2 + j);
-
-            v_sum += v_dotprod(v_src10, v_src20);
+            v_int16 v_src1 = vx_load_expand(src1 + j);
+            v_int16 v_src2 = vx_load_expand(src2 + j);
+            v_sum = v_dotprod_fast(v_src1, v_src2, v_sum);
         }
         r += (double)v_reduce_sum(v_sum);
@@ -2434,87 +2384,6 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
         i += blockSize;
     }
     vx_cleanup();
-#elif CV_NEON
-    if( cv::checkHardwareSupport(CV_CPU_NEON) )
-    {
-        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
-        int32x4_t v_zero = vdupq_n_s32(0);
-        CV_DECL_ALIGNED(16) int buf[4];
-
-        while( i < len0 )
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            int32x4_t v_sum = v_zero;
-
-            int j = 0;
-            for( ; j <= blockSize - 16; j += 16 )
-            {
-                int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);
-
-                int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
-
-                v_src10 = vmovl_s8(vget_high_s8(v_src1));
-                v_src20 = vmovl_s8(vget_high_s8(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
-            }
-
-            for( ; j <= blockSize - 8; j += 8 )
-            {
-                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
-                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
-                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
-            }
-
-            vst1q_s32(buf, v_sum);
-            r += buf[0] + buf[1] + buf[2] + buf[3];
-
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
-    }
-#elif CV_MSA
-    int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
-    v4i32 v_zero = msa_dupq_n_s32(0);
-    CV_DECL_ALIGNED(16) int buf[4];
-
-    while( i < len0 )
-    {
-        blockSize = std::min(len0 - i, blockSize0);
-        v4i32 v_sum = v_zero;
-
-        int j = 0;
-        for( ; j <= blockSize - 16; j += 16 )
-        {
-            v16i8 v_src1 = msa_ld1q_s8(src1 + j), v_src2 = msa_ld1q_s8(src2 + j);
-
-            v8i16 v_src10 = msa_movl_s8(msa_get_low_s8(v_src1)), v_src20 = msa_movl_s8(msa_get_low_s8(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
-
-            v_src10 = msa_movl_s8(msa_get_high_s8(v_src1));
-            v_src20 = msa_movl_s8(msa_get_high_s8(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src10), msa_get_low_s16(v_src20));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src10), msa_get_high_s16(v_src20));
-        }
-
-        for( ; j <= blockSize - 8; j += 8 )
-        {
-            v8i16 v_src1 = msa_movl_s8(msa_ld1_s8(src1 + j)), v_src2 = msa_movl_s8(msa_ld1_s8(src2 + j));
-            v_sum = msa_mlal_s16(v_sum, msa_get_low_s16(v_src1), msa_get_low_s16(v_src2));
-            v_sum = msa_mlal_s16(v_sum, msa_get_high_s16(v_src1), msa_get_high_s16(v_src2));
-        }
-
-        msa_st1q_s32(buf, v_sum);
-        r += buf[0] + buf[1] + buf[2] + buf[3];
-
-        src1 += blockSize;
-        src2 += blockSize;
-        i += blockSize;
-    }
 #endif
 
     return r + dotProd_(src1, src2, len - i);
@@ -2522,42 +2391,97 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
 {
-    return dotProd_(src1, src2, len);
+    double r = 0.0;
+    int i = 0;
+
+#if CV_SIMD
+    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
+
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_uint64 v_sum = vx_setzero_u64();
+        const int cWidth = v_uint16::nlanes;
+
+        int j = 0;
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_uint16 v_src1 = vx_load(src1 + j);
+            v_uint16 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
+        }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
+    vx_cleanup();
+#endif
+    return r + dotProd_(src1, src2, len - i);
 }
 
 double dotProd_16s(const short* src1, const short* src2, int len)
 {
-    return dotProd_(src1, src2, len);
+    double r = 0.0;
+    int i = 0;
+
+#if CV_SIMD
+    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
+
+    while (i < len0)
+    {
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int64 v_sum = vx_setzero_s64();
+        const int cWidth = v_int16::nlanes;
+
+        int j = 0;
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src1 = vx_load(src1 + j);
+            v_int16 v_src2 = vx_load(src2 + j);
+            v_sum = v_dotprod_expand_fast(v_src1, v_src2, v_sum);
+        }
+        r += (double)v_reduce_sum(v_sum);
+
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
+    }
+    vx_cleanup();
+#endif
+    return r + dotProd_(src1, src2, len - i);
 }
 
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
-#if CV_SIMD128_64F
-    double r = 0.0;
+#if CV_SIMD_64F
+    double r = 0.0;
     int i = 0;
-    int lenAligned = len & -v_int32x4::nlanes;
-    v_float64x2 a(0.0, 0.0);
-    v_float64x2 b(0.0, 0.0);
-
-    for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
-    {
-        v_int32x4 s1 = v_load(src1);
-        v_int32x4 s2 = v_load(src2);
-
-#if CV_VSX
-        // Do 32x32->64 multiplies, convert/round to double, accumulate
-        // Potentially less precise than FMA, but 1.5x faster than fma below.
-        a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
-        b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
-#else
-        a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
-        b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
+    const int step = v_int32::nlanes;
+    v_float64 v_sum0 = vx_setzero_f64();
+#if CV_SIMD_WIDTH == 16
+    const int wstep = step * 2;
+    v_float64 v_sum1 = vx_setzero_f64();
+    for (; i < len - wstep; i += wstep, src1 += wstep, src2 += wstep)
+    {
+        v_int32 v_src10 = vx_load(src1);
+        v_int32 v_src20 = vx_load(src2);
+        v_int32 v_src11 = vx_load(src1 + step);
+        v_int32 v_src21 = vx_load(src2 + step);
+        v_sum0 = v_dotprod_expand_fast(v_src10, v_src20, v_sum0);
+        v_sum1 = v_dotprod_expand_fast(v_src11, v_src21, v_sum1);
+    }
+    v_sum0 += v_sum1;
 #endif
-        src1 += v_int32x4::nlanes;
-        src2 += v_int32x4::nlanes;
-    }
-    a += b;
-    r = v_reduce_sum(a);
+    for (; i < len - step; i += step, src1 += step, src2 += step)
+    {
+        v_int32 v_src1 = vx_load(src1);
+        v_int32 v_src2 = vx_load(src2);
+        v_sum0 = v_dotprod_expand_fast(v_src1, v_src2, v_sum0);
+    }
+    r = v_reduce_sum(v_sum0);
+    vx_cleanup();
     return r + dotProd_(src1, src2, len - i);
 #else
     return dotProd_(src1, src2, len);
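The CV_SIMD_WIDTH == 16 branch in dotProd_32s above unrolls by two with independent accumulators, so consecutive v_dotprod_expand_fast calls do not serialize on a single register. The same idea in plain scalar form, as an illustrative sketch only:

#include <cstddef>
#include <cstdint>

// Two independent accumulators halve the length of the add dependency chain.
static int64_t dot_i32_unrolled(const int32_t* a, const int32_t* b, size_t n)
{
    int64_t s0 = 0, s1 = 0;
    size_t i = 0;
    for (; i + 2 <= n; i += 2)
    {
        s0 += (int64_t)a[i]     * b[i];
        s1 += (int64_t)a[i + 1] * b[i + 1];
    }
    if (i < n)
        s0 += (int64_t)a[i] * b[i];
    return s0 + s1;
}

With wider registers the extra accumulator buys little, which is presumably why the unroll is gated on the 128-bit case.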
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 06aeac867d..fcb6b93a3c 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -603,12 +603,14 @@ template<typename R> struct TheTest
         return *this;
     }
 
-    TheTest & test_dot_prod()
+    TheTest & test_dotprod()
     {
         typedef typename V_RegTraits<R>::w_reg Rx2;
         typedef typename Rx2::lane_type w_type;
 
-        Data<R> dataA, dataB(2);
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
         R a = dataA, b = dataB;
 
         Data<Rx2> dataC;
@@ -621,12 +623,95 @@
             resE = v_dotprod(a, b, c);
 
         const int n = R::nlanes / 2;
+        w_type sumAB = 0, sumABC = 0, tmp_sum;
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
-            EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
+
+            tmp_sum = (w_type)dataA[i*2] * (w_type)dataB[i*2] +
+                      (w_type)dataA[i*2 + 1] * (w_type)dataB[i*2 + 1];
+            sumAB += tmp_sum;
+            EXPECT_EQ(tmp_sum, resD[i]);
+
+            tmp_sum = tmp_sum + dataC[i];
+            sumABC += tmp_sum;
+            EXPECT_EQ(tmp_sum, resE[i]);
         }
+
+        w_type resF = v_reduce_sum(v_dotprod_fast(a, b)),
+               resG = v_reduce_sum(v_dotprod_fast(a, b, c));
+        EXPECT_EQ(sumAB, resF);
+        EXPECT_EQ(sumABC, resG);
+        return *this;
+    }
+
+    TheTest & test_dotprod_expand()
+    {
+        typedef typename V_RegTraits<R>::q_reg Rx4;
+        typedef typename Rx4::lane_type l4_type;
+
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min() + R::nlanes;
+        R a = dataA, b = dataB;
+
+        Data<Rx4> dataC;
+        Rx4 c = dataC;
+
+        Data<Rx4> resD = v_dotprod_expand(a, b),
+                  resE = v_dotprod_expand(a, b, c);
+
+        l4_type sumAB = 0, sumABC = 0, tmp_sum;
+        for (int i = 0; i < Rx4::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            tmp_sum = (l4_type)dataA[i*4]     * (l4_type)dataB[i*4]     +
+                      (l4_type)dataA[i*4 + 1] * (l4_type)dataB[i*4 + 1] +
+                      (l4_type)dataA[i*4 + 2] * (l4_type)dataB[i*4 + 2] +
+                      (l4_type)dataA[i*4 + 3] * (l4_type)dataB[i*4 + 3];
+            sumAB += tmp_sum;
+            EXPECT_EQ(tmp_sum, resD[i]);
+
+            tmp_sum = tmp_sum + dataC[i];
+            sumABC += tmp_sum;
+            EXPECT_EQ(tmp_sum, resE[i]);
+        }
+
+        l4_type resF = v_reduce_sum(v_dotprod_expand_fast(a, b)),
+                resG = v_reduce_sum(v_dotprod_expand_fast(a, b, c));
+        EXPECT_EQ(sumAB, resF);
+        EXPECT_EQ(sumABC, resG);
+
+        return *this;
+    }
+
+    TheTest & test_dotprod_expand_f64()
+    {
+    #if CV_SIMD_64F
+        Data<R> dataA, dataB;
+        dataA += std::numeric_limits<LaneType>::max() - R::nlanes;
+        dataB += std::numeric_limits<LaneType>::min();
+        R a = dataA, b = dataB;
+
+        Data<v_float64> dataC;
+        v_float64 c = dataC;
+
+        Data<v_float64> resA = v_dotprod_expand(a, a),
+                        resB = v_dotprod_expand(b, b),
+                        resC = v_dotprod_expand(a, b, c);
+
+        const int n = R::nlanes / 2;
+        for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
+                      (double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
+            EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
+                      (double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
+            EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
+                      (double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
+        }
+    #endif
         return *this;
     }
 
@@ -1165,6 +1250,29 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_cvt64_double()
+    {
+#if CV_SIMD_64F
+        Data<R> dataA(std::numeric_limits<LaneType>::max()),
+                dataB(std::numeric_limits<LaneType>::min());
+        dataB += R::nlanes;
+
+        R a = dataA, b = dataB;
+        v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b);
+
+        Data<v_float64> resC = c;
+        Data<v_float64> resD = d;
+
+        for (int i = 0; i < R::nlanes; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((double)dataA[i], resC[i]);
+            EXPECT_EQ((double)dataB[i], resD[i]);
+        }
+#endif
+        return *this;
+    }
+
     TheTest & test_matmul()
     {
         Data<R> dataV, dataA, dataB, dataC, dataD;
@@ -1341,6 +1449,7 @@ void test_hal_intrin_uint8()
         .test_mul_expand()
         .test_cmp()
         .test_logic()
+        .test_dotprod_expand()
         .test_min_max()
         .test_absdiff()
         .test_reduce_sad()
@@ -1378,6 +1487,7 @@ void test_hal_intrin_int8()
         .test_mul_expand()
         .test_cmp()
         .test_logic()
+        .test_dotprod_expand()
         .test_min_max()
         .test_absdiff()
         .test_absdiffs()
@@ -1408,6 +1518,7 @@ void test_hal_intrin_uint16()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
+        .test_dotprod_expand()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1437,7 +1548,8 @@ void test_hal_intrin_int16()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
-        .test_dot_prod()
+        .test_dotprod()
+        .test_dotprod_expand()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1497,6 +1609,8 @@ void test_hal_intrin_int32()
         .test_cmp()
         .test_popcount()
         .test_shift<1>().test_shift<8>()
+        .test_dotprod()
+        .test_dotprod_expand_f64()
         .test_logic()
         .test_min_max()
         .test_absdiff()
@@ -1538,6 +1652,7 @@ void test_hal_intrin_int64()
         .test_logic()
         .test_extract<0>().test_extract<1>()
         .test_rotate<0>().test_rotate<1>()
+        .test_cvt64_double()
         ;
 }
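Taken together, the new intrinsics reduce a widening dot product to a three-call loop on every backend. A minimal usage sketch, built only on calls added or exercised by this patch and mirroring the structure of dotProd_16s without its blocking logic; the function name dot_i16 is illustrative:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>

// Sum of a[i]*b[i] over n int16 elements, accumulating in 64-bit lanes.
static int64_t dot_i16(const short* a, const short* b, int n)
{
    int64_t r = 0;
    int i = 0;
#if CV_SIMD
    cv::v_int64 v_sum = cv::vx_setzero_s64();
    for (; i <= n - cv::v_int16::nlanes; i += cv::v_int16::nlanes)
        v_sum = cv::v_dotprod_expand_fast(cv::vx_load(a + i), cv::vx_load(b + i), v_sum);
    r = cv::v_reduce_sum(v_sum);
    cv::vx_cleanup();
#endif
    for (; i < n; ++i) // scalar tail
        r += (int64_t)a[i] * b[i];
    return r;
}

A production version would also cap the block length the way dotProd_16s does, since int16 products can otherwise exhaust even 64-bit lanes for astronomically long inputs; for any realistic n the sketch above is overflow-safe.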