From 7a4c5e942121c7ca7ce7c51dd545acaf9dd78ab1 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 24 Jan 2018 20:26:16 +0800
Subject: [PATCH 1/4] slice layer ocl support

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/slice_layer.cpp | 49 +++++++++++++++
 modules/dnn/src/opencl/slice.cl        | 87 ++++++++++++++++++++++++++
 modules/dnn/test/test_layers.cpp       | 12 +++-
 3 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 modules/dnn/src/opencl/slice.cl

diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 18758b98bf..aba3af8b0d 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -43,6 +43,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -171,11 +172,59 @@ public:
         }
     }
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        if (inputs[0].dims < 4)
+            return false;
+
+        const UMat& inpMat = inputs[0];
+        for (size_t i = 0; i < outputs.size(); i++)
+        {
+            int groups = outputs[i].size[0];
+            int channels = outputs[i].size[1];
+            int rows = outputs[i].size[2];
+            int cols = outputs[i].size[3];
+
+            int number = (cols % 8 == 0) ? 8 : ((cols % 4 == 0) ? 4 : 1);
+            String buildopt = format("-DNUM=%d ", number);
+            String kname = format("slice%d", number);
+            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, buildopt);
+            size_t global[] = { (size_t)groups * channels, (size_t)rows * cols / number };
+            int idx = 0;
+            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
+            kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
+            kernel.set(idx++, (int)inpMat.size[3]);
+            kernel.set(idx++, (int)global[0]);
+            kernel.set(idx++, (int)(rows * cols));
+            kernel.set(idx++, (int)cols);
+            kernel.set(idx++, (int)sliceRanges[i][2].start);
+            kernel.set(idx++, (int)sliceRanges[i][3].start);
+            kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
+            bool ret = kernel.run(2, global, NULL, false);
+            if (!ret)
+                return false;
+        }
+
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl
new file mode 100644
index 0000000000..81a71489be
--- /dev/null
+++ b/modules/dnn/src/opencl/slice.cl
@@ -0,0 +1,87 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+#define Dtype8 float8
+
+#if NUM == 8
+    #define load(src, index) vload8(0, src + index)
+    #define store(vec, dst, index) vstore8(vec, 0, dst + index)
+    #define vec_type Dtype8
+    #define SLICE slice8
+#elif NUM == 4
+    #define load(src, index) vload4(0, src + index)
+    #define store(vec, dst, index) vstore4(vec, 0, dst + index)
+    #define vec_type Dtype4
+    #define SLICE slice4
+#elif NUM == 1
+    #define load(src, index) src[index]
+    #define store(vec, dst, index) dst[index] = vec
+    #define vec_type Dtype
+    #define SLICE slice1
+#endif
+
+__kernel void SLICE(__global const Dtype* src,
+                    const int src_plane_size,
+                    const int src_cols,
+                    const int channels,
+                    const int dst_plane_size,
+                    const int dst_cols,
+                    const int row_offset,
+                    const int col_offset,
+                    __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+
+    if ((x >= channels) || (y >= dst_plane_size))
+        return;
+
+    int row = y / dst_cols + row_offset;
+    int col = y % dst_cols + col_offset;
+
+    int src_index = x * src_plane_size + row * src_cols + col;
+    int dst_index = x * dst_plane_size + y;
+    vec_type val = load(src, src_index);
+    store(val, dst, dst_index);
+}
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 73f7c3594f..a178add7f8 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -367,11 +367,14 @@ OCL_TEST(Layer_Test_PReLU, Accuracy)
 // );
 //}
 
-static void test_Reshape_Split_Slice_layers()
+static void test_Reshape_Split_Slice_layers(int targetId)
 {
     Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt"));
     ASSERT_FALSE(net.empty());
 
+    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+    net.setPreferableTarget(targetId);
+
     Mat input(6, 12, CV_32F);
     RNG rng(0);
     rng.fill(input, RNG::UNIFORM, -1, 1);
@@ -384,7 +387,12 @@ static void test_Reshape_Split_Slice_layers()
 
 TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
 {
-    test_Reshape_Split_Slice_layers();
+    test_Reshape_Split_Slice_layers(DNN_TARGET_CPU);
+}
+
+OCL_TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
+{
+    test_Reshape_Split_Slice_layers(DNN_TARGET_OPENCL);
 }
 
 TEST(Layer_Conv_Elu, Accuracy)

From 54c81cbde43d879fbfc9c272a311e3947a327db3 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Thu, 25 Jan 2018 22:15:41 +0800
Subject: [PATCH 2/4] eltwise layer SUM op update

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/eltwise_layer.cpp | 54 +++++++++----
 modules/dnn/src/opencl/eltwise.cl        | 98 ++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 14 deletions(-)
 create mode 100644 modules/dnn/src/opencl/eltwise.cl

diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 40375734d8..27bd7eedd2 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -43,6 +43,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -271,22 +272,47 @@ public:
         switch (op)
         {
             case SUM:
-                if (coeffs.empty()) {
-                    add(inputs[0], inputs[1], outputs[0]);
-                    for (int i = 2; i < inputs.size(); ++i)
-                        add(outputs[0], inputs[i], outputs[0]);
-                }
-                else
-                {
-                    UMat mul0, mul1;
-                    multiply(coeffs[0], inputs[0], mul0);
-                    multiply(coeffs[1], inputs[1], mul1);
-                    add(mul0, mul1, outputs[0]);
-                    for (int i = 2; i < inputs.size(); ++i)
+                int channels = total(shape(outputs[0]), 0, 2);
+                int plane_size = total(shape(outputs[0]), 2);
+                if (channels % 4 == 0 && plane_size % 4 == 0)
                 {
-                        multiply(coeffs[i], inputs[i], mul0);
-                        add(mul0, outputs[0], outputs[0]);
+                    size_t localsize[] = { 128 };
+                    size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
+
+                    for (int i = 0; i < (inputs.size() - 1); ++i)
+                    {
+                        String buildopt = format("-DLOOP=%d", i);
+                        ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
+                        int idx = 0;
+                        UMat inpMat = (i == 0) ? inputs[0] : UMat();
+                        float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
+                        float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
+                        kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
+                        kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
+                        kernel.set(idx++, (int)plane_size);
+                        kernel.set(idx++, (float)coeff1);
+                        kernel.set(idx++, (float)coeff2);
+                        kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
+                        bool ret = kernel.run(1, globalsize, localsize, false);
+                        if (!ret)
+                            return false;
+                    }
+                }
+                else
+                {
+                    float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
+                    float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
+                    UMat mul0, mul1;
+                    multiply(coeff1, inputs[0], mul0);
+                    multiply(coeff2, inputs[1], mul1);
+                    add(mul0, mul1, outputs[0]);
+                    for (int i = 2; i < inputs.size(); ++i)
+                    {
+                        float coeff = coeffs.empty() ? 1.f : coeffs[i];
+                        multiply(coeff, inputs[i], mul0);
+                        add(mul0, outputs[0], outputs[0]);
+                    }
                 }
             }
             break;
diff --git a/modules/dnn/src/opencl/eltwise.cl b/modules/dnn/src/opencl/eltwise.cl
new file mode 100644
index 0000000000..6f3a374f54
--- /dev/null
+++ b/modules/dnn/src/opencl/eltwise.cl
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+#define Dtype8 float8
+
+__kernel void op_sum4(__global const Dtype * A,
+                      __global const Dtype * B,
+                      unsigned int A_col_size,
+                      const float coeff1,
+                      const float coeff2,
+                      __global Dtype * C)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
+    const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = C + row_gid * 4 * A_col_size;
+
+    Dtype4 a0, a1, a2, a3;
+    Dtype4 dot0, dot1, dot2, dot3;
+    unsigned int i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 b0 = vload4(i, src1_read);
+        const Dtype4 b1 = vload4(i, src1_read + A_col_size);
+        const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);
+        const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);
+
+#if LOOP == 0
+        a0 = vload4(i, src0_read);
+        a1 = vload4(i, src0_read + A_col_size);
+        a2 = vload4(i, src0_read + 2 * A_col_size);
+        a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = a0 * coeff1 + b0 * coeff2;
+        dot1 = a1 * coeff1 + b1 * coeff2;
+        dot2 = a2 * coeff1 + b2 * coeff2;
+        dot3 = a3 * coeff1 + b3 * coeff2;
+#else
+        a0 = vload4(i, dst0_read);
+        a1 = vload4(i, dst0_read + A_col_size);
+        a2 = vload4(i, dst0_read + 2 * A_col_size);
+        a3 = vload4(i, dst0_read + 3 * A_col_size);
+
+        dot0 = a0 + b0 * coeff2;
+        dot1 = a1 + b1 * coeff2;
+        dot2 = a2 + b2 * coeff2;
+        dot3 = a3 + b3 * coeff2;
+#endif
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}

From 83b16ab7b71bec486be7a6f013ddfb77653dd0f8 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Thu, 1 Feb 2018 02:09:13 +0800
Subject: [PATCH 3/4] fix extra spaces in build option

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/batch_norm_layer.cpp | 2 +-
 modules/dnn/src/layers/mvn_layer.cpp        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 8acf8b2477..52ce95f57f 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -144,7 +144,7 @@ public:
             UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
             UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
             int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
-            String buildopt = format("-DNUM=%d ", number);
+            String buildopt = format("-DNUM=%d", number);
            String kname = format("batch_norm%d", number);
            ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
            if (kernel.empty())
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 1d5e12b2b6..a74bc0e14e 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -132,7 +132,7 @@ public:
 
         int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
         size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
-        String buildopt = format("-DNUM=%d ", number);
+        String buildopt = format("-DNUM=%d", number);
         if (normVariance)
         {
             String kname = format("calc_mean%d", number);
@@ -156,9 +156,9 @@ public:
         }
 
         String kname = format("mvn%d", number);
-        buildopt += format("%s %s %s ", (normVariance) ? "-DNORM_VARIANCE" : "",
-                           (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
-                           (fuse_relu) ? "-DFUSE_RELU" : "");
+        buildopt += format("%s%s%s", (normVariance) ? " -DNORM_VARIANCE" : "",
+                           (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
+                           (fuse_relu) ? " -DFUSE_RELU" : "");
         ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
         if (kernel1.empty())
             return false;

From 6aec71d7ee02955d781ef57126693ec674976076 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 31 Jan 2018 19:28:58 +0800
Subject: [PATCH 4/4] mvn layer ocl update

It fuses the ocl kernels to reduce the number of kernel enqueues.

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/mvn_layer.cpp |  76 ++++++++++-
 modules/dnn/src/opencl/mvn.cl        | 180 +++++++++++++++++++++++++++
 2 files changed, 251 insertions(+), 5 deletions(-)

diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index a74bc0e14e..c911b741b4 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -93,6 +93,67 @@ public:
     }
 
 #ifdef HAVE_OPENCL
+    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
+    {
+        if( fuse_batch_norm && scale.empty())
+        {
+            bnorm->getScaleShift(scale, shift);
+            bnorm_weight = scale.getUMat(ACCESS_READ);
+            bnorm_bias = shift.getUMat(ACCESS_READ);
+        }
+
+        int splitDim = (acrossChannels) ? 1 : 2;
+        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
+        {
+            UMat &inpMat = inputs[inpIdx];
+            UMat &outMat = outputs[inpIdx];
+            int newRows = total(shape(inpMat), 0, splitDim);
+
+            MatShape s = shape(newRows, inpMat.total() / newRows);
+            UMat oneMat = UMat::ones(s[1], 1, CV_32F);
+            UMat meanMat = UMat(s[0], 1, CV_32F);
+            UMat tmpMat = UMat(s[0], s[1], CV_32F);
+            float alpha = 1.0f / s[1];
+
+            String buildopt = "-DNUM=4";
+            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
+            size_t localsize[] = { 128 };
+            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
+
+            int argId = 0;
+            k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
+            k.set(argId++, (int)s[1]);
+            k.set(argId++, alpha);
+            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
+            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
+            k.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
+            bool ret = k.run(1, globalsize, localsize, false);
+            if (!ret)
+                return false;
+
+            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
+                               (fuse_relu) ? "-DFUSE_RELU" : "");
+
+            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
+            argId = 0;
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
+            k1.set(argId++, (int)s[1]);
+            k1.set(argId++, (float)alpha);
+            k1.set(argId++, (float)eps);
+            k1.set(argId++, (float)relu_slope);
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
+            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
+            k1.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
+            ret = k1.run(1, globalsize, localsize, false);
+            if (!ret)
+                return false;
+        }
+        return true;
+    }
+
     bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
     {
         std::vector<UMat> inputs;
@@ -101,6 +162,15 @@
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
 
+        int splitDim = (acrossChannels) ? 1 : 2;
+        int row_size = total(shape(inputs[0]), 0, splitDim);
+        int plane_size = total(shape(inputs[0]), splitDim);
+        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
+        {
+            bool ret = fast_forward_ocl(inputs, outputs);
+            return ret;
+        }
+
         if( fuse_batch_norm && scale.empty())
         {
             bnorm->getScaleShift(scale, shift);
@@ -112,11 +182,7 @@
         {
             UMat &inpMat = inputs[inpIdx];
             UMat &outMat = outputs[inpIdx];
-
-            int splitDim = (acrossChannels) ? 1 : 2;
-            int i, newRows = 1;
-            for( i = 0; i < splitDim; i++ )
-                newRows *= inpMat.size[i];
+            int newRows = total(shape(inpMat), 0, splitDim);
 
             MatShape s = shape(newRows, inpMat.total() / newRows);
             UMat oneMat = UMat::ones(s[1], 1, CV_32F);
diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl
index cc059eeb1a..9f8ab574ca 100644
--- a/modules/dnn/src/opencl/mvn.cl
+++ b/modules/dnn/src/opencl/mvn.cl
@@ -50,18 +50,24 @@
     #define vec_type Dtype8
     #define CALC_MEAN calc_mean8
     #define MVN mvn8
+    #define MEAN_FUSE mean_fuse8
+    #define MVN_FUSE mvn_fuse8
 #elif NUM == 4
     #define load(src, index) vload4(0, src + index)
     #define store(vec, dst, index) vstore4(vec, 0, dst + index)
     #define vec_type Dtype4
     #define CALC_MEAN calc_mean4
     #define MVN mvn4
+    #define MEAN_FUSE mean_fuse4
+    #define MVN_FUSE mvn_fuse4
 #elif NUM == 1
     #define load(src, index) src[index]
     #define store(vec, dst, index) dst[index] = vec
     #define vec_type Dtype
     #define CALC_MEAN calc_mean1
     #define MVN mvn1
+    #define MEAN_FUSE mean_fuse1
+    #define MVN_FUSE mvn_fuse1
 #endif
 
 __kernel void CALC_MEAN(__global const Dtype* src,
@@ -128,3 +134,177 @@ __kernel void MVN(__global const Dtype* src,
 
     store(dst_vec, dst, index);
 }
+
+__kernel void MEAN_FUSE(__global const Dtype * A,
+                        unsigned int A_col_size,
+                        float alpha,
+                        __global Dtype4 * result,
+                        __global Dtype * B,
+                        __local Dtype4 * work)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += a0;
+        dot1 += a1;
+        dot2 += a2;
+        dot3 += a3;
+
+        i += get_local_size(0);
+    }
+
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(lid == 0)
+    {
+        result[row_gid] = alpha * work[0];
+    }
+
+    Dtype4 sum = work[0] * alpha;
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = native_powr(a0 - (Dtype4)sum.x, 2);
+        dot1 = native_powr(a1 - (Dtype4)sum.y, 2);
+        dot2 = native_powr(a2 - (Dtype4)sum.z, 2);
+        dot3 = native_powr(a3 - (Dtype4)sum.w, 2);
+
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}
+
+__kernel void MVN_FUSE(__global const Dtype * tmp,
+                       __global const Dtype * A,
+                       __global const Dtype4 * mean,
+                       unsigned int A_col_size,
+                       const float alpha_val,
+                       const float eps,
+                       const float relu_slope,
+                       __global const Dtype4 * bnorm_weight,
+                       __global const Dtype4 * bnorm_bias,
+                       __global Dtype * B,
+                       __local Dtype4 * work)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
+    const __global Dtype *src1_read = A + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += a0;
+        dot1 += a1;
+        dot2 += a2;
+        dot3 += a3;
+
+        i += get_local_size(0);
+    }
+
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    Dtype4 mean_val = mean[row_gid];
+    Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
+    Dtype4 alpha = (Dtype4)1.f / dev_val;
+
+    Dtype4 w = (Dtype4)1.f;
+    Dtype4 b = (Dtype4)0.f;
+#ifdef FUSE_BATCH_NORM
+    w = bnorm_weight[row_gid];
+    b = bnorm_bias[row_gid];
+#endif
+
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src1_read);
+        const Dtype4 a1 = vload4(i, src1_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size);
+
+        dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x;
+        dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y;
+        dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z;
+        dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w;
+
+        dot0 = dot0 * w.x + (Dtype4)b.x;
+        dot1 = dot1 * w.y + (Dtype4)b.y;
+        dot2 = dot2 * w.z + (Dtype4)b.z;
+        dot3 = dot3 * w.w + (Dtype4)b.w;
+
+#ifdef FUSE_RELU
+        Dtype4 new0 = dot0 * relu_slope;
+        dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);
+
+        Dtype4 new1 = dot1 * relu_slope;
+        dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);
+
+        Dtype4 new2 = dot2 * relu_slope;
+        dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);
+
+        Dtype4 new3 = dot3 * relu_slope;
+        dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
+#endif
+
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}
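
A note on the dispatch pattern shared by the host-side changes above (slice, batch_norm and mvn): the widest vector width out of 8, 4 and 1 that divides the innermost dimension is chosen at enqueue time, compiled into the program through -DNUM, and the matching specialized kernel (slice8/slice4/slice1, batch_norm%d, mvn%d) is selected; PATCH 3/4 only tightens how those -D options are concatenated. The standalone C++ sketch below shows that selection logic in isolation; it does not use OpenCV, and the KernelChoice/chooseSliceKernel names are made up for illustration.

// Standalone sketch (no OpenCV dependency) of the width-selection and
// build-option pattern used by the slice/batch_norm/mvn host code above.
// The helper names are hypothetical; only the selection logic mirrors the patches.
#include <cstdio>
#include <string>

struct KernelChoice {
    int num;                 // vector width compiled in via -DNUM=
    std::string buildOpt;    // e.g. "-DNUM=4"
    std::string kernelName;  // e.g. "slice4"
};

// Pick the widest of 8/4/1 that divides the innermost dimension, so the
// kernel can use vload8/vload4 or plain scalar loads without remainder handling.
static KernelChoice chooseSliceKernel(int cols)
{
    int num = (cols % 8 == 0) ? 8 : ((cols % 4 == 0) ? 4 : 1);
    KernelChoice c;
    c.num = num;
    c.buildOpt = "-DNUM=" + std::to_string(num);   // no trailing space (see PATCH 3/4)
    c.kernelName = "slice" + std::to_string(num);
    return c;
}

int main()
{
    for (int cols : {64, 28, 13})
    {
        KernelChoice c = chooseSliceKernel(cols);
        std::printf("cols=%d -> %s, build option \"%s\"\n",
                    cols, c.kernelName.c_str(), c.buildOpt.c_str());
    }
    return 0;
}

With cols = 64 the kernel can use vload8/vstore8, with 13 it falls back to the scalar variant; that is also why the global work size in the slice forward_ocl divides rows * cols by the chosen width.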
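
For the eltwise change in PATCH 2/4, each op_sum4 launch updates the running sum in place: the LOOP == 0 variant writes coeff1*A + coeff2*B into the output, and every later variant adds coeff2*B onto what the output already holds. The scalar reference below mirrors the arithmetic of one launch; opSumReference is a hypothetical name and plain float pointers stand in for the UMat buffers, so this is a model of the kernel's math rather than OpenCV code.

// Scalar reference for one op_sum4 launch from PATCH 2/4 (eltwise.cl):
// when LOOP == 0 the kernel writes coeff1*A + coeff2*B, otherwise it
// accumulates coeff2*B onto what is already stored in C.
#include <cstddef>

void opSumReference(const float* A, const float* B, float* C, size_t n,
                    float coeff1, float coeff2, bool firstPass /* LOOP == 0 */)
{
    for (size_t j = 0; j < n; ++j)
    {
        if (firstPass)
            C[j] = coeff1 * A[j] + coeff2 * B[j];
        else
            C[j] = C[j] + coeff2 * B[j];   // A is ignored; C holds the running sum
    }
}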
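
The two kernels added in PATCH 4/4 split MVN into a mean/squared-deviation pass (mean_fuse4) and a normalization pass (mvn_fuse4) that also folds in the batch-norm scale/shift and the leaky-ReLU slope when those are fused. The scalar reference below reproduces that arithmetic per row, assuming contiguous row-major float data and per-row weight/bias; it ignores the 4-rows-per-work-group layout and the local-memory reduction, and mvnFuseReference is a hypothetical name.

// Scalar reference (assumed row-major float buffers, one "row" per normalization
// group) for the fused MVN path of PATCH 4/4: compute the row mean, the variance
// from squared deviations, then normalise with 1/(sqrt(var)+eps), apply the
// optional folded batch-norm scale/shift and a leaky ReLU.
#include <cmath>
#include <vector>

void mvnFuseReference(const std::vector<float>& src, int rows, int cols,
                      float eps, float reluSlope,
                      const std::vector<float>& weight,  // per-row scale (empty => 1)
                      const std::vector<float>& bias,    // per-row shift (empty => 0)
                      std::vector<float>& dst)
{
    dst.resize(src.size());
    const float alpha = 1.f / cols;
    for (int r = 0; r < rows; ++r)
    {
        const float* x = &src[(size_t)r * cols];
        float* y = &dst[(size_t)r * cols];

        float mean = 0.f;                       // pass 1: mean_fuse
        for (int c = 0; c < cols; ++c) mean += x[c];
        mean *= alpha;

        float var = 0.f;                        // sum of squared deviations
        for (int c = 0; c < cols; ++c) var += (x[c] - mean) * (x[c] - mean);
        const float scale = 1.f / (std::sqrt(var * alpha) + eps);

        const float w = weight.empty() ? 1.f : weight[r];
        const float b = bias.empty()   ? 0.f : bias[r];
        for (int c = 0; c < cols; ++c)          // pass 2: mvn_fuse
        {
            float v = (x[c] - mean) * scale * w + b;
            y[c] = (v > 0.f) ? v : v * reluSlope;   // FUSE_RELU path
        }
    }
}

Such a reference is handy when checking the OCL path against the CPU fallback, since the fused path only triggers when both the row count and the plane size are multiples of 4.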