From 507f8add1cd940fd803f375e3c2bfa9dee4c116a Mon Sep 17 00:00:00 2001 From: Brad Kelly Date: Thu, 17 Jan 2019 10:56:41 -0800 Subject: [PATCH] Implementing AVX512 Support for 2 and 4 channel mats for CV_64F format --- modules/imgproc/perf/perf_integral.cpp | 14 +- modules/imgproc/src/sumpixels.avx512_skx.cpp | 304 ++++++++++++++----- modules/imgproc/src/sumpixels.cpp | 5 +- modules/imgproc/test/test_filter.cpp | 3 +- 4 files changed, 230 insertions(+), 96 deletions(-) diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp index d64c49e0a9..85a426efc9 100644 --- a/modules/imgproc/perf/perf_integral.cpp +++ b/modules/imgproc/perf/perf_integral.cpp @@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam Size_MatType_OutMatD PERF_TEST_P(Size_MatType_OutMatDepth, integral, testing::Combine( testing::Values(TYPICAL_MAT_SIZES), - testing::Values(CV_8UC1, CV_8UC3, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4), testing::Values(CV_32S, CV_32F, CV_64F) ) ) @@ -32,9 +32,9 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral, PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum, testing::Combine( - testing::Values(::perf::szVGA, ::perf::sz1080p), - testing::Values(CV_8UC1, CV_8UC4), - testing::Values(CV_32S, CV_32F) + testing::Values(TYPICAL_MAT_SIZES), + testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4), + testing::Values(CV_32S, CV_32F, CV_64F) ) ) { @@ -57,9 +57,9 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum, PERF_TEST_P( Size_MatType_OutMatDepth, integral_sqsum_tilted, testing::Combine( - testing::Values( ::perf::szVGA, ::perf::szODD , ::perf::sz1080p ), - testing::Values( CV_8UC1, CV_8UC4 ), - testing::Values( CV_32S, CV_32F ) + testing::Values(TYPICAL_MAT_SIZES), + testing::Values( CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4 ), + testing::Values( CV_32S, CV_32F, CV_64F ) ) ) { diff --git a/modules/imgproc/src/sumpixels.avx512_skx.cpp b/modules/imgproc/src/sumpixels.avx512_skx.cpp index 7e5cbdcf88..511123b4f8 100644 --- a/modules/imgproc/src/sumpixels.avx512_skx.cpp +++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp @@ -13,33 +13,34 @@ namespace { // Anonymous namespace to avoid exposing the implementation classes // NOTE: Look at the bottom of the file for the entry-point function for external callers // -// At the moment only 3 channel support untilted is supported -// More channel support coming soon. -// TODO: Add support for sqsum and 1,2, and 4 channels -class IntegralCalculator_3Channel { +// TODO: Add support for 1 channel input (WIP: currently hitting hardware glassjaw) +template class IntegralCalculator; + +template +class IntegralCalculator { public: - IntegralCalculator_3Channel() {}; + IntegralCalculator() {}; void calculate_integral_avx512(const uchar *src, size_t _srcstep, double *sum, size_t _sumstep, double *sqsum, size_t _sqsumstep, - int width, int height, int cn) + int width, int height) { const int srcstep = (int)(_srcstep/sizeof(uchar)); const int sumstep = (int)(_sumstep/sizeof(double)); const int sqsumstep = (int)(_sqsumstep/sizeof(double)); - const int ops_per_line = width * cn; + const int ops_per_line = width * num_channels; // Clear the first line of the sum as per spec (see integral documentation) // Also adjust the index of sum and sqsum to be at the real 0th element // and not point to the border pixel so it stays in sync with the src pointer - memset( sum, 0, (ops_per_line+cn)*sizeof(double)); - sum += cn; + memset( sum, 0, (ops_per_line+num_channels)*sizeof(double)); + sum += num_channels; if (sqsum) { - memset( sqsum, 0, (ops_per_line+cn)*sizeof(double)); - sqsum += cn; + memset( sqsum, 0, (ops_per_line+num_channels)*sizeof(double)); + sqsum += num_channels; } // Now calculate the integral over the whole image one line at a time @@ -50,22 +51,22 @@ public: double * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep] : NULL; double * sqsum_line = (sqsum) ? &sqsum_above[sqsumstep] : NULL; - integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line); + calculate_integral_for_line(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line); } } - static inline - void integral_line_3channel_avx512(const uchar *srcs, - double *sums, double *sums_above, - double *sqsums, double *sqsums_above, - int num_ops_in_line) + static CV_ALWAYS_INLINE + void calculate_integral_for_line(const uchar *srcs, + double *sums, double *sums_above, + double *sqsums, double *sqsums_above, + int num_ops_in_line) { __m512i sum_accumulator = _mm512_setzero_si512(); // holds rolling sums for the line __m512i sqsum_accumulator = _mm512_setzero_si512(); // holds rolling sqsums for the line // The first element on each line must be zeroes as per spec (see integral documentation) - set_border_pixel_value(sums, sqsums); + zero_out_border_pixel(sums, sqsums); // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk aligned_integral( srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line); @@ -74,20 +75,18 @@ public: } - static inline - void set_border_pixel_value(double *sums, double *sqsums) + static CV_ALWAYS_INLINE + void zero_out_border_pixel(double *sums, double *sqsums) { - // Sets the border pixel value to 0s. - // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now - __m512i zeroes = _mm512_setzero_si512(); - - _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes); + // Note the negative index is because the sums/sqsums pointers point to the first real pixel + // after the border pixel so we have to look backwards + _mm512_mask_storeu_epi64(&sums[-num_channels], (1<>8, sums+1, sum_accumulator); - // Calculate integral for the sum on the 8 entries - integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator); - sums++; sums_above++; - - if (sqsums){ // Calculate integral for the sum on the 8 entries - __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs); - - integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator); - sqsums++; sqsums_above++; - } - - // Prepare for next iteration of inner loop - // shift source to align next 8 bytes to lane 0 and shift the mask - src_16bytes = shift_right_8_bytes(src_16bytes); - data_mask = data_mask >> 8; + if (sqsums) { + __m512i squared_source_lo = square_m512(src_longs_lo); + __m512i squared_source_hi = square_m512(src_longs_hi); + integral_8_operations(squared_source_lo, sqsums_above, data_mask, sqsums, sqsum_accumulator); + integral_8_operations(squared_source_hi, sqsums_above+1, data_mask>>8, sqsums+1, sqsum_accumulator); + sqsums += 2; + sqsums_above+=2; } - // Prepare for next iteration of outer loop + // Prepare for next iteration of loop + // shift source to align next 16 bytes to lane 0, shift the mask, and advance the pointers + sums += 2; + sums_above += 2; + data_mask = data_mask >> 16; src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk); + } + } - static inline + static CV_ALWAYS_INLINE void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask, __m512i *results_ptr, __m512i &accumulator) { + // NOTE that the calculate_integral function referenced here must be implemented in the templated + // derivatives because the algorithm depends heavily on the number of channels in the image + // _mm512_mask_storeu_pd( results_ptr, // Store the result here data_mask, // Using the data mask to avoid overrunning the line @@ -188,59 +191,178 @@ public: } - static inline - __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) - { - __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5); - - // Align data to prepare for the adds: - // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes - // Vertical LANES: 76543210 - // src_longs : HGFEDCBA - // shited3lanes : + EDCBA - // shifted6lanes : + BA - // carry_over_idxs : + 65765765 (index position of result from previous iteration) - // = integral - __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs); - __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); - __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); - - // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator) - __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes); - __m512i sum_src_carry = _mm512_add_epi64(src_longs, carry_over); - accumulator = _mm512_add_epi64(sum_shift3and6, sum_src_carry); - - // Convert to packed double and add to the line above to get the true integral value - __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); - __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); - return integral_pd; - } + // The calculate_integral function referenced here must be implemented in the templated derivatives + // because the algorithm depends heavily on the number of channels in the image + // This is the incomplete definition (just the prototype) here. + // + static CV_ALWAYS_INLINE + __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator); - static inline + static CV_ALWAYS_INLINE __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask) { return _mm512_maskz_loadu_epi8(data_mask, srcs); } - static inline + static CV_ALWAYS_INLINE + __m128i extract_lower_16bytes(__m512i src_64byte_chunk) { + return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); + } + + + static CV_ALWAYS_INLINE __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes) { return _mm512_cvtepu8_epi64(src_16bytes); } - static inline + static CV_ALWAYS_INLINE + __m512i square_m512(__m512i src_longs) { + return _mm512_mullo_epi64(src_longs, src_longs); + } + + + static CV_ALWAYS_INLINE __m128i shift_right_8_bytes(__m128i src_16bytes) { return _mm_maskz_compress_epi64(2, src_16bytes); } - static inline + + static CV_ALWAYS_INLINE __m512i shift_right_16_bytes(__m512i src_64byte_chunk) { return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk); } + }; + + +//============================================================================================================ +// This the only section that needs to change with respect to algorithm based on the number of channels +// It is responsible for returning the calculation of 8 lanes worth of the integral and returning in the +// accumulated sums in the accumulator parameter (NOTE: accumulator is an input and output parameter) +// +// The function prototype that needs to be implemented is: +// +// __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... } +// +// Description of parameters: +// INPUTS: +// src_longs : 8 lanes worth of the source bytes converted to 64 bit integers +// above_values: 8 lanes worth of the result values from the line above (See the integral spec) +// accumulator : 8 lanes worth of sums from the previous iteration +// IMPORTANT NOTE: This parameter is both an INPUT AND OUTPUT parameter to this function +// +// OUTPUTS: +// return value: The actual integral value for all 8 lanes which is defined by the spec as +// the sum of all channel values to the left of a given pixel plus the result +// written to the line directly above the current line. +// accumulator: This is an input and and output. This parameter should be left with the accumulated +// sums for the current 8 lanes keeping all entries in the proper lane (do not shuffle it) +// +// Below here is the channel specific implementation +// + +//======================================== +// 2 Channel Integral Implementation +//======================================== +template<> +CV_ALWAYS_INLINE +__m512d IntegralCalculator < 2 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) +{ + __m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6); + + // Align data to prepare for the adds: + // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes + // Vertical LANES : 76543210 + // src_longs : HGFEDCBA + // shift2lanes : + FEDCBA + // shift4lanes : + DCBA + // shift6lanes : + BA + // carry_over_idxs : + 76767676 (index position of result from previous iteration) + // = integral + __m512i shift2lanes = _mm512_maskz_expand_epi64(0xFC, src_longs); + __m512i shift4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs); + __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); + __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); + + // Add all values in tree form for perf ((0+2) + (4+6)) + __m512i sum_shift_02 = _mm512_add_epi64(src_longs, shift2lanes); + __m512i sum_shift_46 = _mm512_add_epi64(shift4lanes, shift6lanes); + __m512i sum_all = _mm512_add_epi64(sum_shift_02, sum_shift_46); + accumulator = _mm512_add_epi64(sum_all, carry_over); + + // Convert to packed double and add to the line above to get the true integral value + __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); + __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); + return integral_pd; +} + +//======================================== +// 3 Channel Integral Implementation +//======================================== +template<> +CV_ALWAYS_INLINE +__m512d IntegralCalculator < 3 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) +{ + __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5); + + // Align data to prepare for the adds: + // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes + // Vertical LANES: 76543210 + // src_longs : HGFEDCBA + // shit3lanes : + EDCBA + // shift6lanes : + BA + // carry_over_idxs : + 65765765 (index position of result from previous iteration) + // = integral + __m512i shift3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs); + __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs); + __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); + + // Do the adds in tree form + __m512i sum_shift_03 = _mm512_add_epi64(src_longs, shift3lanes); + __m512i sum_rest = _mm512_add_epi64(shift6lanes, carry_over); + accumulator = _mm512_add_epi64(sum_shift_03, sum_rest); + + // Convert to packed double and add to the line above to get the true integral value + __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); + __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); + return integral_pd; +} + + +//======================================== +// 4 Channel Integral Implementation +//======================================== +template<> +CV_ALWAYS_INLINE +__m512d IntegralCalculator < 4 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator) +{ + __m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4); + + // Align data to prepare for the adds: + // shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes + // Vertical LANES: 76543210 + // src_longs : HGFEDCBA + // shit4lanes : + DCBA + // carry_over_idxs : + 76547654 (index position of result from previous iteration) + // = integral + __m512i shifted4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs); + __m512i carry_over = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator); + + // Add data pixels and carry over from last iteration + __m512i sum_shift_04 = _mm512_add_epi64(src_longs, shifted4lanes); + accumulator = _mm512_add_epi64(sum_shift_04, carry_over); + + // Convert to packed double and add to the line above to get the true integral value + __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator); + __m512d integral_pd = _mm512_add_pd(accumulator_pd, above_values); + return integral_pd; +} + + } // end of anonymous namespace namespace opt_AVX512_SKX { @@ -253,8 +375,22 @@ void calculate_integral_avx512(const uchar *src, size_t _srcstep, double *sqsum, size_t _sqsumstep, int width, int height, int cn) { - IntegralCalculator_3Channel calculator; - calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn); + switch(cn){ + case 2: { + IntegralCalculator<2> calculator; + calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); + break; + } + case 3: { + IntegralCalculator<3> calculator; + calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); + break; + } + case 4: { + IntegralCalculator<4> calculator; + calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height); + } + } } diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp index ae7647b8bd..ca8b99afde 100755 --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -46,7 +46,6 @@ #include "opencv2/core/hal/intrin.hpp" #include "sumpixels.hpp" - namespace cv { @@ -77,8 +76,8 @@ struct Integral_SIMD { { #if CV_TRY_AVX512_SKX CV_UNUSED(_tiltedstep); - // TODO: Add support for 1,2, and 4 channels - if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){ + // TODO: Add support for 1 channel input (WIP) + if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && ((cn >= 2) && (cn <= 4))){ opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn); return true; diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp index a77186d304..5b73e3bf8e 100644 --- a/modules/imgproc/test/test_filter.cpp +++ b/modules/imgproc/test/test_filter.cpp @@ -1667,12 +1667,11 @@ void CV_IntegralTest::get_test_array_types_and_sizes( int test_case_idx, { RNG& rng = ts->get_rng(); int depth = cvtest::randInt(rng) % 2, sum_depth; - int cn = cvtest::randInt(rng) % 3 + 1; + int cn = cvtest::randInt(rng) % 4 + 1; cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types ); Size sum_size; depth = depth == 0 ? CV_8U : CV_32F; - cn += cn == 2; int b = (cvtest::randInt(rng) & 1) != 0; sum_depth = depth == CV_8U && b ? CV_32S : b ? CV_32F : CV_64F;