Merge pull request #20361 from r0hit2005:master
Tutorial for parallel_for_ and Universal Intrinsic (GSoC '21) * New parallel_for tutorial * Universal Intrinsics Draft Tutorial * Added draft of universal intrinsic tutorial * * Added final markdown for parallel_for_new * Added first half of universal intrinsic tutorial * Fixed warnings in documentation and sample code for parallel_for_new tutorial * Restored original parallel_for_ tutorial and table_of_content_core * Minor changes * Added demonstration of 1-D vectorized convolution * * Added 2-D convolution implementation and tutorial * Minor changes in vectorized implementation of 1-D and 2-D convolution * Minor changes to univ_intrin tutorial. Added new tutorials to the table of contents * Minor changes * Removed variable sized array initializations * Fixed conversion warnings * Added doxygen references, minor fixes * Added jpg image for parallel_for_ doc
This commit is contained in:
+332
@@ -0,0 +1,332 @@
|
||||
#include <iostream>
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/imgcodecs.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
|
||||
namespace
|
||||
{
|
||||
//! [convolution-sequential]
|
||||
// Single-threaded reference convolution.
//   src    : 8-bit single-channel input (the local header is re-pointed at a padded copy).
//   dst    : output, re-allocated here with the size and type of the input.
//   kernel : CV_64F kernel; half of kernel.rows is used as the radius in both directions.
void conv_seq(Mat src, Mat &dst, Mat kernel)
{
    //![convolution-make-borders]
    const int rows = src.rows;
    const int cols = src.cols;
    dst = Mat(rows, cols, src.type());

    // Replicate the edge pixels outwards by the kernel radius so that the
    // sliding window never leaves the (padded) image.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);
    //![convolution-make-borders]

    //! [convolution-kernel-loop]
    for (int y = 0; y < rows; ++y)
    {
        uchar *out_row = dst.ptr(y);
        for (int x = 0; x < cols; ++x)
        {
            double acc = 0;

            // (ky, kx) index the kernel from its top-left corner; because of the
            // padding, the same offsets address the window inside the source.
            for (int ky = 0; ky <= 2 * sz; ++ky)
            {
                // Fetching row pointers is slightly faster than per-element lookups.
                const uchar *in_row = src.ptr(y + ky);
                const double *k_row = kernel.ptr<double>(ky);
                for (int kx = 0; kx <= 2 * sz; ++kx)
                {
                    acc += k_row[kx] * in_row[x + kx];
                }
            }
            out_row[x] = saturate_cast<uchar>(acc);
        }
    }
    //! [convolution-kernel-loop]
}
|
||||
//! [convolution-sequential]
|
||||
|
||||
#ifdef CV_CXX11
|
||||
// Parallel convolution that hands out individual output pixels: the range
// [0, rows * cols) is split by parallel_for_ and each index is decoded back
// into a (row, column) pair.
void conv_parallel(Mat src, Mat &dst, Mat kernel)
{
    const int rows = src.rows;
    const int cols = src.cols;

    dst = Mat(rows, cols, CV_8UC1, Scalar(0));

    // Pad by the kernel radius (edge pixels replicated) so border pixels
    // have a full neighbourhood.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);

    //! [convolution-parallel-cxx11]
    parallel_for_(Range(0, rows * cols), [&](const Range &range)
    {
        for (int idx = range.start; idx < range.end; ++idx)
        {
            // Linear index -> output coordinates.
            const int y = idx / cols;
            const int x = idx % cols;

            double acc = 0;
            for (int ky = 0; ky <= 2 * sz; ++ky)
            {
                const uchar *in_row = src.ptr(y + ky);
                const double *k_row = kernel.ptr<double>(ky);
                for (int kx = 0; kx <= 2 * sz; ++kx)
                {
                    acc += k_row[kx] * in_row[x + kx];
                }
            }
            dst.ptr(y)[x] = saturate_cast<uchar>(acc);
        }
    });
    //! [convolution-parallel-cxx11]
}
|
||||
|
||||
// Parallel convolution that hands out whole output rows: parallel_for_ splits
// the range [0, rows) and every task processes its rows left to right.
void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
{
    const int rows = src.rows;
    const int cols = src.cols;

    dst = Mat(rows, cols, CV_8UC1, Scalar(0));

    // Pad by the kernel radius (edge pixels replicated) so border pixels
    // have a full neighbourhood.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);

    //! [convolution-parallel-cxx11-row-split]
    parallel_for_(Range(0, rows), [&](const Range &range)
    {
        for (int y = range.start; y < range.end; ++y)
        {
            uchar *out_row = dst.ptr(y);
            for (int x = 0; x < cols; ++x)
            {
                double acc = 0;
                for (int ky = 0; ky <= 2 * sz; ++ky)
                {
                    const uchar *in_row = src.ptr(y + ky);
                    const double *k_row = kernel.ptr<double>(ky);
                    for (int kx = 0; kx <= 2 * sz; ++kx)
                    {
                        acc += k_row[kx] * in_row[x + kx];
                    }
                }
                out_row[x] = saturate_cast<uchar>(acc);
            }
        }
    });
    //! [convolution-parallel-cxx11-row-split]
}
|
||||
#else
|
||||
|
||||
//! [convolution-parallel]
|
||||
class parallelConvolution : public ParallelLoopBody
|
||||
{
|
||||
private:
|
||||
Mat m_src, &m_dst;
|
||||
Mat m_kernel;
|
||||
int sz;
|
||||
|
||||
public:
|
||||
parallelConvolution(Mat src, Mat &dst, Mat kernel)
|
||||
: m_src(src), m_dst(dst), m_kernel(kernel)
|
||||
{
|
||||
sz = kernel.rows / 2;
|
||||
}
|
||||
|
||||
//! [overload-full]
|
||||
virtual void operator()(const Range &range) const CV_OVERRIDE
|
||||
{
|
||||
for (int r = range.start; r < range.end; r++)
|
||||
{
|
||||
int i = r / m_src.cols, j = r % m_src.cols;
|
||||
|
||||
double value = 0;
|
||||
for (int k = -sz; k <= sz; k++)
|
||||
{
|
||||
uchar *sptr = m_src.ptr(i + sz + k);
|
||||
for (int l = -sz; l <= sz; l++)
|
||||
{
|
||||
value += m_kernel.ptr<double>(k + sz)[l + sz] * sptr[j + sz + l];
|
||||
}
|
||||
}
|
||||
m_dst.ptr(i)[j] = saturate_cast<uchar>(value);
|
||||
}
|
||||
}
|
||||
//! [overload-full]
|
||||
};
|
||||
//! [convolution-parallel]
|
||||
|
||||
// Pixel-wise parallel convolution for builds without C++11 lambdas: the work
// is delegated to a parallelConvolution functor.
void conv_parallel(Mat src, Mat &dst, Mat kernel)
{
    const int rows = src.rows;
    const int cols = src.cols;

    dst = Mat(rows, cols, CV_8UC1, Scalar(0));

    // Pad by the kernel radius (edge pixels replicated) before handing the
    // image to the loop body.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);

    //! [convolution-parallel-function]
    parallelConvolution obj(src, dst, kernel);
    parallel_for_(Range(0, rows * cols), obj);
    //! [convolution-parallel-function]
}
|
||||
|
||||
//! [conv-parallel-row-split]
|
||||
class parallelConvolutionRowSplit : public ParallelLoopBody
|
||||
{
|
||||
private:
|
||||
Mat m_src, &m_dst;
|
||||
Mat m_kernel;
|
||||
int sz;
|
||||
|
||||
public:
|
||||
parallelConvolutionRowSplit(Mat src, Mat &dst, Mat kernel)
|
||||
: m_src(src), m_dst(dst), m_kernel(kernel)
|
||||
{
|
||||
sz = kernel.rows / 2;
|
||||
}
|
||||
|
||||
//! [overload-row-split]
|
||||
virtual void operator()(const Range &range) const CV_OVERRIDE
|
||||
{
|
||||
for (int i = range.start; i < range.end; i++)
|
||||
{
|
||||
|
||||
uchar *dptr = dst.ptr(i);
|
||||
for (int j = 0; j < cols; j++)
|
||||
{
|
||||
double value = 0;
|
||||
for (int k = -sz; k <= sz; k++)
|
||||
{
|
||||
uchar *sptr = src.ptr(i + sz + k);
|
||||
for (int l = -sz; l <= sz; l++)
|
||||
{
|
||||
value += kernel.ptr<double>(k + sz)[l + sz] * sptr[j + sz + l];
|
||||
}
|
||||
}
|
||||
dptr[j] = saturate_cast<uchar>(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
//! [overload-row-split]
|
||||
};
|
||||
//! [conv-parallel-row-split]
|
||||
|
||||
// Row-wise parallel convolution for builds without C++11 lambdas: the work is
// delegated to a parallelConvolutionRowSplit functor.
void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
{
    const int rows = src.rows;
    const int cols = src.cols;

    dst = Mat(rows, cols, CV_8UC1, Scalar(0));

    // Pad by the kernel radius (edge pixels replicated) before handing the
    // image to the loop body.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);

    //! [convolution-parallel-function-row]
    parallelConvolutionRowSplit obj(src, dst, kernel);
    parallel_for_(Range(0, rows), obj);
    //! [convolution-parallel-function-row]
}
|
||||
|
||||
#endif
|
||||
|
||||
static void help(char *progName)
|
||||
{
|
||||
cout << endl
|
||||
<< " This program shows how to use the OpenCV parallel_for_ function and \n"
|
||||
<< " compares the performance of the sequential and parallel implementations for a \n"
|
||||
<< " convolution operation\n"
|
||||
<< " Usage:\n "
|
||||
<< progName << " [image_path -- default lena.jpg] " << endl
|
||||
<< endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
|
||||
help(argv[0]);
|
||||
const char *filepath = argc >= 2 ? argv[1] : "../../../../data/lena.jpg";
|
||||
|
||||
Mat src, dst, kernel;
|
||||
src = imread(filepath, IMREAD_GRAYSCALE);
|
||||
|
||||
if (src.empty())
|
||||
{
|
||||
cerr << "Can't open [" << filepath << "]" << endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
namedWindow("Input", 1);
|
||||
namedWindow("Output1", 1);
|
||||
namedWindow("Output2", 1);
|
||||
namedWindow("Output3", 1);
|
||||
imshow("Input", src);
|
||||
|
||||
kernel = (Mat_<double>(3, 3) << 1, 0, -1,
|
||||
1, 0, -1,
|
||||
1, 0, -1);
|
||||
|
||||
/*
|
||||
Uncomment the kernels you want to use or write your own kernels to test out
|
||||
performance.
|
||||
*/
|
||||
|
||||
/*
|
||||
kernel = (Mat_<double>(5, 5) << 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1);
|
||||
kernel /= 100;
|
||||
*/
|
||||
|
||||
/*
|
||||
kernel = (Mat_<double>(3, 3) << 1, 1, 1,
|
||||
0, 0, 0,
|
||||
-1, -1, -1);
|
||||
|
||||
*/
|
||||
|
||||
double t = (double)getTickCount();
|
||||
|
||||
conv_seq(src, dst, kernel);
|
||||
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Sequential implementation: " << t << "s" << endl;
|
||||
|
||||
imshow("Output1", dst);
|
||||
waitKey(0);
|
||||
|
||||
t = (double)getTickCount();
|
||||
|
||||
conv_parallel(src, dst, kernel);
|
||||
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Parallel Implementation: " << t << "s" << endl;
|
||||
|
||||
imshow("Output2", dst);
|
||||
waitKey(0);
|
||||
|
||||
t = (double)getTickCount();
|
||||
|
||||
conv_parallel_row_split(src, dst, kernel);
|
||||
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Parallel Implementation(Row Split): " << t << "s" << endl
|
||||
<< endl;
|
||||
|
||||
imshow("Output3", dst);
|
||||
waitKey(0);
|
||||
|
||||
// imwrite("src.png", src);
|
||||
// imwrite("dst.png", dst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
#include <iostream>
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/imgcodecs.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/highgui.hpp>
|
||||
#include <opencv2/core/simd_intrinsics.hpp>
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
|
||||
// N: element count of the random 1-D test signal and of the float scratch arrays;
// K: number of taps of the random 1-D test kernel.
const int N = 100005, K = 2000;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Scalar reference 2-D convolution.
//   src    : 8-bit single-channel input (the local header is re-pointed at a padded copy).
//   dst    : output, re-allocated here as CV_8UC1 with the size of the input.
//   kernel : CV_32F kernel; half of kernel.rows is used as the radius in both directions.
void conv_seq(Mat src, Mat &dst, Mat kernel)
{
    const int rows = src.rows;
    const int cols = src.cols;
    dst = Mat(rows, cols, CV_8UC1);

    // Replicate the edge pixels outwards by the kernel radius.
    const int sz = kernel.rows / 2;
    copyMakeBorder(src, src, sz, sz, sz, sz, BORDER_REPLICATE);

    for (int y = 0; y < rows; ++y)
    {
        uchar *out_row = dst.ptr<uchar>(y);
        for (int x = 0; x < cols; ++x)
        {
            float acc = 0;

            // (ky, kx) index the kernel from its top-left corner; because of the
            // padding, the same offsets address the window inside the source.
            for (int ky = 0; ky <= 2 * sz; ++ky)
            {
                // Fetching row pointers is slightly faster than per-element lookups.
                const uchar *in_row = src.ptr<uchar>(y + ky);
                const float *k_row = kernel.ptr<float>(ky);
                for (int kx = 0; kx <= 2 * sz; ++kx)
                {
                    acc += k_row[kx] * in_row[x + kx];
                }
            }
            out_row[x] = saturate_cast<uchar>(acc);
        }
    }
}
|
||||
|
||||
//! [convolution-1D-scalar]
|
||||
// Scalar 1-D convolution of a single-row 8-bit signal.
//   src    : 1 x len, CV_8UC1 (the local header is re-pointed at a padded copy).
//   dst    : output, re-allocated here as 1 x len, CV_8UC1.
//   kernel : single-row CV_32F kernel with kernel.cols taps.
void conv1d(Mat src, Mat &dst, Mat kernel)
{

    //! [convolution-1D-border]
    int len = src.cols;
    dst = Mat(1, len, CV_8UC1);

    int ksize = kernel.cols, sz = ksize / 2;
    copyMakeBorder(src, src, 0, 0, sz, sz, BORDER_REPLICATE);
    //! [convolution-1D-border]

    //! [convolution-1D-scalar-main]
    const uchar *sptr = src.ptr<uchar>(0);
    const float *kptr = kernel.ptr<float>(0);
    uchar *dptr = dst.ptr<uchar>(0);

    for (int i = 0; i < len; i++)
    {
        double value = 0;
        // Walk exactly ksize taps. Iterating -sz..sz would visit 2 * sz + 1
        // taps, which is one past the end of the kernel when ksize is even.
        for (int k = 0; k < ksize; k++)
            value += sptr[i + k] * kptr[k];

        dptr[i] = saturate_cast<uchar>(value);
    }
    //! [convolution-1D-scalar-main]
}
|
||||
//! [convolution-1D-scalar]
|
||||
|
||||
//! [convolution-1D-vector]
|
||||
// Vectorized 1-D convolution of one image row with one kernel row.
//   src    : input image; converted to CV_32F and padded left/right internally.
//   kernel : CV_32F kernel; row `rowk` supplies the kernel.cols taps.
//   ans    : caller-owned buffer of at least `len` floats. Results are ADDED
//            to its current contents, so the caller decides the start value.
//   row    : row of src to process.
//   rowk   : row of kernel to use.
//   len    : number of outputs to produce; -1 means src.cols.
void conv1dsimd(Mat src, Mat kernel, float *ans, int row = 0, int rowk = 0, int len = -1)
{
    if (len == -1)
        len = src.cols;

    //! [convolution-1D-convert]
    Mat src_32;

    const int alpha = 1;
    src.convertTo(src_32, CV_32FC1, alpha);

    int ksize = kernel.cols, sz = kernel.cols / 2;
    copyMakeBorder(src_32, src_32, 0, 0, sz, sz, BORDER_REPLICATE);
    //! [convolution-1D-convert]


    //! [convolution-1D-main]
    //! [convolution-1D-main-h1]
    int step = v_float32().nlanes;
    float *sptr = src_32.ptr<float>(row), *kptr = kernel.ptr<float>(rowk);
    for (int k = 0; k < ksize; k++)
    {
    //! [convolution-1D-main-h1]
    //! [convolution-1D-main-h2]
        // Broadcast the current tap and apply it to `step` outputs at a time.
        v_float32 kernel_wide = vx_setall_f32(kptr[k]);
        int i;
        // `<=` so that a final, completely filled vector is still handled here
        // instead of being left to the scalar tail.
        for (i = 0; i + step <= len; i += step)
        {
            v_float32 window = vx_load(sptr + i + k);
            v_float32 sum = vx_load(ans + i) + kernel_wide * window;
            v_store(ans + i, sum);
        }
    //! [convolution-1D-main-h2]

    //! [convolution-1D-main-h3]
        // Scalar tail for the outputs that do not fill a whole vector.
        for (; i < len; i++)
        {
            *(ans + i) += sptr[i + k]*kptr[k];
        }
    //! [convolution-1D-main-h3]
    }
    //! [convolution-1D-main]
}
|
||||
//! [convolution-1D-vector]
|
||||
|
||||
//! [convolution-2D]
|
||||
// Vectorized 2-D convolution built from repeated 1-D passes.
//   src    : 8-bit single-channel input (the local header is re-pointed at a
//            copy padded at the top and bottom; conv1dsimd pads left/right).
//   dst    : output, re-allocated here; accumulated as CV_32F and converted
//            to CV_8UC1 before returning.
//   kernel : CV_32F kernel; each of its kernel.rows rows is applied as a 1-D kernel.
void convolute_simd(Mat src, Mat &dst, Mat kernel)
{
    //! [convolution-2D-init]
    int rows = src.rows, cols = src.cols;
    int ksize = kernel.rows, sz = ksize / 2;
    // The result is built up with +=, so the accumulator has to start at zero.
    dst = Mat(rows, cols, CV_32FC1, Scalar(0));

    copyMakeBorder(src, src, sz, sz, 0, 0, BORDER_REPLICATE);

    int step = v_float32().nlanes;

    // Scratch row for a single 1-D pass. It is sized from the image width, so
    // images of any width work and no large array is placed on the stack.
    Mat scratch(1, cols, CV_32FC1);
    float *ans = scratch.ptr<float>(0);
    //! [convolution-2D-init]

    //! [convolution-2D-main]
    for (int i = 0; i < rows; i++)
    {
        float *dptr = dst.ptr<float>(i);
        for (int k = 0; k < ksize; k++)
        {
            // conv1dsimd adds into its output buffer, so clear it for every pass.
            scratch.setTo(Scalar(0));
            conv1dsimd(src, kernel, ans, i + k, k, cols);
            int j;
            // `<=` so that a final, completely filled vector is handled here.
            for (j = 0; j + step <= cols; j += step)
            {
                v_float32 sum = vx_load(dptr + j) + vx_load(ans + j);
                v_store(dptr + j, sum);
            }

            // Scalar tail for the columns that do not fill a whole vector.
            for (; j < cols; j++)
                dptr[j] += ans[j];
        }
    }
    //! [convolution-2D-main]

    //! [convolution-2D-conv]
    const int alpha = 1;
    dst.convertTo(dst, CV_8UC1, alpha);
    //! [convolution-2D-conv]
}
|
||||
//! [convolution-2D]
|
||||
|
||||
static void help(char *progName)
|
||||
{
|
||||
cout << endl
|
||||
<< " This program shows how to use the OpenCV parallel_for_ function and \n"
|
||||
<< " compares the performance of the sequential and parallel implementations for a \n"
|
||||
<< " convolution operation\n"
|
||||
<< " Usage:\n "
|
||||
<< progName << " [image_path -- default lena.jpg] " << endl
|
||||
<< endl;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
|
||||
// 1-D Convolution //
|
||||
Mat vsrc(1, N, CV_8UC1), k(1, K, CV_32FC1), vdst;
|
||||
RNG rng(time(0));
|
||||
rng.RNG::fill(vsrc, RNG::UNIFORM, Scalar(0), Scalar(255));
|
||||
rng.RNG::fill(k, RNG::UNIFORM, Scalar(-50), Scalar(50));
|
||||
|
||||
double t = (double)getTickCount();
|
||||
conv1d(vsrc, vdst, k);
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Sequential 1-D convolution implementation: " << t << "s" << endl;
|
||||
|
||||
t = (double)getTickCount();
|
||||
float ans[N] = {0};
|
||||
conv1dsimd(vsrc, k, ans);
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Vectorized 1-D convolution implementation: " << t << "s" << endl;
|
||||
|
||||
// 2-D Convolution //
|
||||
help(argv[0]);
|
||||
|
||||
const char *filepath = argc >= 2 ? argv[1] : "../../../../data/lena.jpg";
|
||||
|
||||
Mat src, dst1, dst2, kernel;
|
||||
src = imread(filepath, IMREAD_GRAYSCALE);
|
||||
|
||||
if (src.empty())
|
||||
{
|
||||
cerr << "Can't open [" << filepath << "]" << endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
namedWindow("Input", 1);
|
||||
namedWindow("Output", 1);
|
||||
imshow("Input", src);
|
||||
|
||||
kernel = (Mat_<float>(3, 3) << 1, 0, -1,
|
||||
2, 0, -2,
|
||||
1, 0, -1);
|
||||
|
||||
t = (double)getTickCount();
|
||||
|
||||
conv_seq(src, dst1, kernel);
|
||||
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Sequential 2-D convolution implementation: " << t << "s" << endl;
|
||||
|
||||
imshow("Output", dst1);
|
||||
waitKey(0);
|
||||
|
||||
t = (double)getTickCount();
|
||||
|
||||
convolute_simd(src, dst2, kernel);
|
||||
|
||||
t = ((double)getTickCount() - t) / getTickFrequency();
|
||||
cout << " Vectorized 2-D convolution implementation: " << t << "s" << endl
|
||||
<< endl;
|
||||
|
||||
imshow("Output", dst2);
|
||||
waitKey(0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user